diff options
Diffstat (limited to 'fs/fuse')
-rw-r--r-- | fs/fuse/Kconfig | 11 | ||||
-rw-r--r-- | fs/fuse/Makefile | 5 | ||||
-rw-r--r-- | fs/fuse/acl.c | 10 | ||||
-rw-r--r-- | fs/fuse/control.c | 12 | ||||
-rw-r--r-- | fs/fuse/cuse.c | 4 | ||||
-rw-r--r-- | fs/fuse/dax.c | 1 | ||||
-rw-r--r-- | fs/fuse/dev.c | 384 | ||||
-rw-r--r-- | fs/fuse/dir.c | 230 | ||||
-rw-r--r-- | fs/fuse/file.c | 737 | ||||
-rw-r--r-- | fs/fuse/fuse_i.h | 203 | ||||
-rw-r--r-- | fs/fuse/fuse_trace.h | 132 | ||||
-rw-r--r-- | fs/fuse/inode.c | 113 | ||||
-rw-r--r-- | fs/fuse/ioctl.c | 60 | ||||
-rw-r--r-- | fs/fuse/iomode.c | 276 | ||||
-rw-r--r-- | fs/fuse/passthrough.c | 351 | ||||
-rw-r--r-- | fs/fuse/readdir.c | 10 | ||||
-rw-r--r-- | fs/fuse/virtio_fs.c | 470 | ||||
-rw-r--r-- | fs/fuse/xattr.c | 2 |
18 files changed, 2324 insertions, 687 deletions
diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig index 038ed0b9aaa5..8674dbfbe59d 100644 --- a/fs/fuse/Kconfig +++ b/fs/fuse/Kconfig @@ -52,3 +52,14 @@ config FUSE_DAX If you want to allow mounting a Virtio Filesystem with the "dax" option, answer Y. + +config FUSE_PASSTHROUGH + bool "FUSE passthrough operations support" + default y + depends on FUSE_FS + select FS_STACK + help + This allows bypassing FUSE server by mapping specific FUSE operations + to be performed directly on a backing file. + + If you want to allow passthrough operations, answer Y. diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile index 0c48b35c058d..ce0ff7a9007b 100644 --- a/fs/fuse/Makefile +++ b/fs/fuse/Makefile @@ -3,11 +3,16 @@ # Makefile for the FUSE filesystem. # +# Needed for trace events +ccflags-y = -I$(src) + obj-$(CONFIG_FUSE_FS) += fuse.o obj-$(CONFIG_CUSE) += cuse.o obj-$(CONFIG_VIRTIO_FS) += virtiofs.o fuse-y := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o ioctl.o +fuse-y += iomode.o fuse-$(CONFIG_FUSE_DAX) += dax.o +fuse-$(CONFIG_FUSE_PASSTHROUGH) += passthrough.o virtiofs-y := virtio_fs.o diff --git a/fs/fuse/acl.c b/fs/fuse/acl.c index 3d192b80a561..8f484b105f13 100644 --- a/fs/fuse/acl.c +++ b/fs/fuse/acl.c @@ -12,7 +12,6 @@ #include <linux/posix_acl_xattr.h> static struct posix_acl *__fuse_get_acl(struct fuse_conn *fc, - struct mnt_idmap *idmap, struct inode *inode, int type, bool rcu) { int size; @@ -74,7 +73,7 @@ struct posix_acl *fuse_get_acl(struct mnt_idmap *idmap, if (fuse_no_acl(fc, inode)) return ERR_PTR(-EOPNOTSUPP); - return __fuse_get_acl(fc, idmap, inode, type, false); + return __fuse_get_acl(fc, inode, type, false); } struct posix_acl *fuse_get_inode_acl(struct inode *inode, int type, bool rcu) @@ -90,8 +89,7 @@ struct posix_acl *fuse_get_inode_acl(struct inode *inode, int type, bool rcu) */ if (!fc->posix_acl) return NULL; - - return __fuse_get_acl(fc, &nop_mnt_idmap, inode, type, rcu); + return __fuse_get_acl(fc, inode, type, rcu); } int fuse_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, @@ -146,8 +144,8 @@ int fuse_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, * be stripped. */ if (fc->posix_acl && - !vfsgid_in_group_p(i_gid_into_vfsgid(&nop_mnt_idmap, inode)) && - !capable_wrt_inode_uidgid(&nop_mnt_idmap, inode, CAP_FSETID)) + !in_group_or_capable(idmap, inode, + i_gid_into_vfsgid(idmap, inode))) extra_flags |= FUSE_SETXATTR_ACL_KILL_SGID; ret = fuse_setxattr(inode, name, value, size, 0, extra_flags); diff --git a/fs/fuse/control.c b/fs/fuse/control.c index ab62e4624256..2a730d88cc3b 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -174,11 +174,7 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file, if (!fc) goto out; - down_read(&fc->killsb); - spin_lock(&fc->bg_lock); - fc->congestion_threshold = val; - spin_unlock(&fc->bg_lock); - up_read(&fc->killsb); + WRITE_ONCE(fc->congestion_threshold, val); fuse_conn_put(fc); out: return ret; @@ -187,27 +183,23 @@ out: static const struct file_operations fuse_ctl_abort_ops = { .open = nonseekable_open, .write = fuse_conn_abort_write, - .llseek = no_llseek, }; static const struct file_operations fuse_ctl_waiting_ops = { .open = nonseekable_open, .read = fuse_conn_waiting_read, - .llseek = no_llseek, }; static const struct file_operations fuse_conn_max_background_ops = { .open = nonseekable_open, .read = fuse_conn_max_background_read, .write = fuse_conn_max_background_write, - .llseek = no_llseek, }; static const struct file_operations fuse_conn_congestion_threshold_ops = { .open = nonseekable_open, .read = fuse_conn_congestion_threshold_read, .write = fuse_conn_congestion_threshold_write, - .llseek = no_llseek, }; static struct dentry *fuse_ctl_add_dentry(struct dentry *parent, @@ -235,7 +227,7 @@ static struct dentry *fuse_ctl_add_dentry(struct dentry *parent, inode->i_mode = mode; inode->i_uid = fc->user_id; inode->i_gid = fc->group_id; - inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); + simple_inode_init_ts(inode); /* setting ->i_op to NULL is not allowed */ if (iop) inode->i_op = iop; diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index b6cad106c37e..0b2da7b7e2ad 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -310,6 +310,10 @@ struct cuse_init_args { /** * cuse_process_init_reply - finish initializing CUSE channel * + * @fm: The fuse mount information containing the CUSE connection. + * @args: The arguments passed to the init reply. + * @error: The error code signifying if any error occurred during the process. + * * This function creates the character device and sets up all the * required data structures for it. Please read the comment at the * top of this file for high level overview. diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c index 12ef91d170bb..7faf1af59d5d 100644 --- a/fs/fuse/dax.c +++ b/fs/fuse/dax.c @@ -681,7 +681,6 @@ static int __fuse_dax_break_layouts(struct inode *inode, bool *retry, 0, 0, fuse_wait_dax_page(inode)); } -/* dmap_end == 0 leads to unmapping of whole file */ int fuse_dax_break_layouts(struct inode *inode, u64 dmap_start, u64 dmap_end) { diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 8573d79ef29c..1f64ae6d7a69 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -22,6 +22,9 @@ #include <linux/splice.h> #include <linux/sched.h> +#define CREATE_TRACE_POINTS +#include "fuse_trace.h" + MODULE_ALIAS_MISCDEV(FUSE_MINOR); MODULE_ALIAS("devname:fuse"); @@ -31,6 +34,8 @@ MODULE_ALIAS("devname:fuse"); static struct kmem_cache *fuse_req_cachep; +static void end_requests(struct list_head *head); + static struct fuse_dev *fuse_get_dev(struct file *file) { /* @@ -103,11 +108,17 @@ static void fuse_drop_waiting(struct fuse_conn *fc) static void fuse_put_request(struct fuse_req *req); -static struct fuse_req *fuse_get_req(struct fuse_mount *fm, bool for_background) +static struct fuse_req *fuse_get_req(struct mnt_idmap *idmap, + struct fuse_mount *fm, + bool for_background) { struct fuse_conn *fc = fm->fc; struct fuse_req *req; + bool no_idmap = !fm->sb || (fm->sb->s_iflags & SB_I_NOIDMAP); + kuid_t fsuid; + kgid_t fsgid; int err; + atomic_inc(&fc->num_waiting); if (fuse_block_alloc(fc, for_background)) { @@ -135,19 +146,32 @@ static struct fuse_req *fuse_get_req(struct fuse_mount *fm, bool for_background) goto out; } - req->in.h.uid = from_kuid(fc->user_ns, current_fsuid()); - req->in.h.gid = from_kgid(fc->user_ns, current_fsgid()); req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns); __set_bit(FR_WAITING, &req->flags); if (for_background) __set_bit(FR_BACKGROUND, &req->flags); - if (unlikely(req->in.h.uid == ((uid_t)-1) || - req->in.h.gid == ((gid_t)-1))) { + /* + * Keep the old behavior when idmappings support was not + * declared by a FUSE server. + * + * For those FUSE servers who support idmapped mounts, + * we send UID/GID only along with "inode creation" + * fuse requests, otherwise idmap == &invalid_mnt_idmap and + * req->in.h.{u,g}id will be equal to FUSE_INVALID_UIDGID. + */ + fsuid = no_idmap ? current_fsuid() : mapped_fsuid(idmap, fc->user_ns); + fsgid = no_idmap ? current_fsgid() : mapped_fsgid(idmap, fc->user_ns); + req->in.h.uid = from_kuid(fc->user_ns, fsuid); + req->in.h.gid = from_kgid(fc->user_ns, fsgid); + + if (no_idmap && unlikely(req->in.h.uid == ((uid_t)-1) || + req->in.h.gid == ((gid_t)-1))) { fuse_put_request(req); return ERR_PTR(-EOVERFLOW); } + return req; out: @@ -192,11 +216,22 @@ unsigned int fuse_len_args(unsigned int numargs, struct fuse_arg *args) } EXPORT_SYMBOL_GPL(fuse_len_args); -u64 fuse_get_unique(struct fuse_iqueue *fiq) +static u64 fuse_get_unique_locked(struct fuse_iqueue *fiq) { fiq->reqctr += FUSE_REQ_ID_STEP; return fiq->reqctr; } + +u64 fuse_get_unique(struct fuse_iqueue *fiq) +{ + u64 ret; + + spin_lock(&fiq->lock); + ret = fuse_get_unique_locked(fiq); + spin_unlock(&fiq->lock); + + return ret; +} EXPORT_SYMBOL_GPL(fuse_get_unique); static unsigned int fuse_req_hash(u64 unique) @@ -215,22 +250,70 @@ __releases(fiq->lock) spin_unlock(&fiq->lock); } +static void fuse_dev_queue_forget(struct fuse_iqueue *fiq, struct fuse_forget_link *forget) +{ + spin_lock(&fiq->lock); + if (fiq->connected) { + fiq->forget_list_tail->next = forget; + fiq->forget_list_tail = forget; + fuse_dev_wake_and_unlock(fiq); + } else { + kfree(forget); + spin_unlock(&fiq->lock); + } +} + +static void fuse_dev_queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req) +{ + spin_lock(&fiq->lock); + if (list_empty(&req->intr_entry)) { + list_add_tail(&req->intr_entry, &fiq->interrupts); + /* + * Pairs with smp_mb() implied by test_and_set_bit() + * from fuse_request_end(). + */ + smp_mb(); + if (test_bit(FR_FINISHED, &req->flags)) { + list_del_init(&req->intr_entry); + spin_unlock(&fiq->lock); + } else { + fuse_dev_wake_and_unlock(fiq); + } + } else { + spin_unlock(&fiq->lock); + } +} + +static void fuse_dev_queue_req(struct fuse_iqueue *fiq, struct fuse_req *req) +{ + spin_lock(&fiq->lock); + if (fiq->connected) { + if (req->in.h.opcode != FUSE_NOTIFY_REPLY) + req->in.h.unique = fuse_get_unique_locked(fiq); + list_add_tail(&req->list, &fiq->pending); + fuse_dev_wake_and_unlock(fiq); + } else { + spin_unlock(&fiq->lock); + req->out.h.error = -ENOTCONN; + clear_bit(FR_PENDING, &req->flags); + fuse_request_end(req); + } +} + const struct fuse_iqueue_ops fuse_dev_fiq_ops = { - .wake_forget_and_unlock = fuse_dev_wake_and_unlock, - .wake_interrupt_and_unlock = fuse_dev_wake_and_unlock, - .wake_pending_and_unlock = fuse_dev_wake_and_unlock, + .send_forget = fuse_dev_queue_forget, + .send_interrupt = fuse_dev_queue_interrupt, + .send_req = fuse_dev_queue_req, }; EXPORT_SYMBOL_GPL(fuse_dev_fiq_ops); -static void queue_request_and_unlock(struct fuse_iqueue *fiq, - struct fuse_req *req) -__releases(fiq->lock) +static void fuse_send_one(struct fuse_iqueue *fiq, struct fuse_req *req) { req->in.h.len = sizeof(struct fuse_in_header) + fuse_len_args(req->args->in_numargs, (struct fuse_arg *) req->args->in_args); - list_add_tail(&req->list, &fiq->pending); - fiq->ops->wake_pending_and_unlock(fiq); + trace_fuse_request_send(req); + fiq->ops->send_req(fiq, req); } void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, @@ -241,15 +324,7 @@ void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, forget->forget_one.nodeid = nodeid; forget->forget_one.nlookup = nlookup; - spin_lock(&fiq->lock); - if (fiq->connected) { - fiq->forget_list_tail->next = forget; - fiq->forget_list_tail = forget; - fiq->ops->wake_forget_and_unlock(fiq); - } else { - kfree(forget); - spin_unlock(&fiq->lock); - } + fiq->ops->send_forget(fiq, forget); } static void flush_bg_queue(struct fuse_conn *fc) @@ -263,9 +338,7 @@ static void flush_bg_queue(struct fuse_conn *fc) req = list_first_entry(&fc->bg_queue, struct fuse_req, list); list_del(&req->list); fc->active_background++; - spin_lock(&fiq->lock); - req->in.h.unique = fuse_get_unique(fiq); - queue_request_and_unlock(fiq, req); + fuse_send_one(fiq, req); } } @@ -286,6 +359,7 @@ void fuse_request_end(struct fuse_req *req) if (test_and_set_bit(FR_FINISHED, &req->flags)) goto put_request; + trace_fuse_request_end(req); /* * test_and_set_bit() implies smp_mb() between bit * changing and below FR_INTERRUPTED check. Pairs with @@ -335,29 +409,12 @@ static int queue_interrupt(struct fuse_req *req) { struct fuse_iqueue *fiq = &req->fm->fc->iq; - spin_lock(&fiq->lock); /* Check for we've sent request to interrupt this req */ - if (unlikely(!test_bit(FR_INTERRUPTED, &req->flags))) { - spin_unlock(&fiq->lock); + if (unlikely(!test_bit(FR_INTERRUPTED, &req->flags))) return -EINVAL; - } - if (list_empty(&req->intr_entry)) { - list_add_tail(&req->intr_entry, &fiq->interrupts); - /* - * Pairs with smp_mb() implied by test_and_set_bit() - * from fuse_request_end(). - */ - smp_mb(); - if (test_bit(FR_FINISHED, &req->flags)) { - list_del_init(&req->intr_entry); - spin_unlock(&fiq->lock); - return 0; - } - fiq->ops->wake_interrupt_and_unlock(fiq); - } else { - spin_unlock(&fiq->lock); - } + fiq->ops->send_interrupt(fiq, req); + return 0; } @@ -412,21 +469,15 @@ static void __fuse_request_send(struct fuse_req *req) struct fuse_iqueue *fiq = &req->fm->fc->iq; BUG_ON(test_bit(FR_BACKGROUND, &req->flags)); - spin_lock(&fiq->lock); - if (!fiq->connected) { - spin_unlock(&fiq->lock); - req->out.h.error = -ENOTCONN; - } else { - req->in.h.unique = fuse_get_unique(fiq); - /* acquire extra reference, since request is still needed - after fuse_request_end() */ - __fuse_get_request(req); - queue_request_and_unlock(fiq, req); - request_wait_answer(req); - /* Pairs with smp_wmb() in fuse_request_end() */ - smp_rmb(); - } + /* acquire extra reference, since request is still needed after + fuse_request_end() */ + __fuse_get_request(req); + fuse_send_one(fiq, req); + + request_wait_answer(req); + /* Pairs with smp_wmb() in fuse_request_end() */ + smp_rmb(); } static void fuse_adjust_compat(struct fuse_conn *fc, struct fuse_args *args) @@ -466,8 +517,14 @@ static void fuse_force_creds(struct fuse_req *req) { struct fuse_conn *fc = req->fm->fc; - req->in.h.uid = from_kuid_munged(fc->user_ns, current_fsuid()); - req->in.h.gid = from_kgid_munged(fc->user_ns, current_fsgid()); + if (!req->fm->sb || req->fm->sb->s_iflags & SB_I_NOIDMAP) { + req->in.h.uid = from_kuid_munged(fc->user_ns, current_fsuid()); + req->in.h.gid = from_kgid_munged(fc->user_ns, current_fsgid()); + } else { + req->in.h.uid = FUSE_INVALID_UIDGID; + req->in.h.gid = FUSE_INVALID_UIDGID; + } + req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns); } @@ -482,7 +539,9 @@ static void fuse_args_to_req(struct fuse_req *req, struct fuse_args *args) __set_bit(FR_ASYNC, &req->flags); } -ssize_t fuse_simple_request(struct fuse_mount *fm, struct fuse_args *args) +ssize_t __fuse_simple_request(struct mnt_idmap *idmap, + struct fuse_mount *fm, + struct fuse_args *args) { struct fuse_conn *fc = fm->fc; struct fuse_req *req; @@ -499,7 +558,7 @@ ssize_t fuse_simple_request(struct fuse_mount *fm, struct fuse_args *args) __set_bit(FR_FORCE, &req->flags); } else { WARN_ON(args->nocreds); - req = fuse_get_req(fm, false); + req = fuse_get_req(idmap, fm, false); if (IS_ERR(req)) return PTR_ERR(req); } @@ -560,7 +619,7 @@ int fuse_simple_background(struct fuse_mount *fm, struct fuse_args *args, __set_bit(FR_BACKGROUND, &req->flags); } else { WARN_ON(args->nocreds); - req = fuse_get_req(fm, true); + req = fuse_get_req(&invalid_mnt_idmap, fm, true); if (IS_ERR(req)) return PTR_ERR(req); } @@ -581,9 +640,8 @@ static int fuse_simple_notify_reply(struct fuse_mount *fm, { struct fuse_req *req; struct fuse_iqueue *fiq = &fm->fc->iq; - int err = 0; - req = fuse_get_req(fm, false); + req = fuse_get_req(&invalid_mnt_idmap, fm, false); if (IS_ERR(req)) return PTR_ERR(req); @@ -592,16 +650,9 @@ static int fuse_simple_notify_reply(struct fuse_mount *fm, fuse_args_to_req(req, args); - spin_lock(&fiq->lock); - if (fiq->connected) { - queue_request_and_unlock(fiq, req); - } else { - err = -ENODEV; - spin_unlock(&fiq->lock); - fuse_put_request(req); - } + fuse_send_one(fiq, req); - return err; + return 0; } /* @@ -773,7 +824,6 @@ static int fuse_check_folio(struct folio *folio) (folio->flags & PAGE_FLAGS_CHECK_AT_PREP & ~(1 << PG_locked | 1 << PG_referenced | - 1 << PG_uptodate | 1 << PG_lru | 1 << PG_active | 1 << PG_workingset | @@ -818,9 +868,7 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) newfolio = page_folio(buf->page); - if (!folio_test_uptodate(newfolio)) - folio_mark_uptodate(newfolio); - + folio_clear_uptodate(newfolio); folio_clear_mappedtodisk(newfolio); if (fuse_check_folio(newfolio) != 0) @@ -1076,9 +1124,9 @@ __releases(fiq->lock) return err ? err : reqsize; } -struct fuse_forget_link *fuse_dequeue_forget(struct fuse_iqueue *fiq, - unsigned int max, - unsigned int *countp) +static struct fuse_forget_link *fuse_dequeue_forget(struct fuse_iqueue *fiq, + unsigned int max, + unsigned int *countp) { struct fuse_forget_link *head = fiq->forget_list_head.next; struct fuse_forget_link **newhead = &head; @@ -1097,7 +1145,6 @@ struct fuse_forget_link *fuse_dequeue_forget(struct fuse_iqueue *fiq, return head; } -EXPORT_SYMBOL(fuse_dequeue_forget); static int fuse_read_single_forget(struct fuse_iqueue *fiq, struct fuse_copy_state *cs, @@ -1112,7 +1159,7 @@ __releases(fiq->lock) struct fuse_in_header ih = { .opcode = FUSE_FORGET, .nodeid = forget->forget_one.nodeid, - .unique = fuse_get_unique(fiq), + .unique = fuse_get_unique_locked(fiq), .len = sizeof(ih) + sizeof(arg), }; @@ -1143,7 +1190,7 @@ __releases(fiq->lock) struct fuse_batch_forget_in arg = { .count = 0 }; struct fuse_in_header ih = { .opcode = FUSE_BATCH_FORGET, - .unique = fuse_get_unique(fiq), + .unique = fuse_get_unique_locked(fiq), .len = sizeof(ih) + sizeof(arg), }; @@ -1777,6 +1824,69 @@ copy_finish: return err; } +/* + * Resending all processing queue requests. + * + * During a FUSE daemon panics and failover, it is possible for some inflight + * requests to be lost and never returned. As a result, applications awaiting + * replies would become stuck forever. To address this, we can use notification + * to trigger resending of these pending requests to the FUSE daemon, ensuring + * they are properly processed again. + * + * Please note that this strategy is applicable only to idempotent requests or + * if the FUSE daemon takes careful measures to avoid processing duplicated + * non-idempotent requests. + */ +static void fuse_resend(struct fuse_conn *fc) +{ + struct fuse_dev *fud; + struct fuse_req *req, *next; + struct fuse_iqueue *fiq = &fc->iq; + LIST_HEAD(to_queue); + unsigned int i; + + spin_lock(&fc->lock); + if (!fc->connected) { + spin_unlock(&fc->lock); + return; + } + + list_for_each_entry(fud, &fc->devices, entry) { + struct fuse_pqueue *fpq = &fud->pq; + + spin_lock(&fpq->lock); + for (i = 0; i < FUSE_PQ_HASH_SIZE; i++) + list_splice_tail_init(&fpq->processing[i], &to_queue); + spin_unlock(&fpq->lock); + } + spin_unlock(&fc->lock); + + list_for_each_entry_safe(req, next, &to_queue, list) { + set_bit(FR_PENDING, &req->flags); + clear_bit(FR_SENT, &req->flags); + /* mark the request as resend request */ + req->in.h.unique |= FUSE_UNIQUE_RESEND; + } + + spin_lock(&fiq->lock); + if (!fiq->connected) { + spin_unlock(&fiq->lock); + list_for_each_entry(req, &to_queue, list) + clear_bit(FR_PENDING, &req->flags); + end_requests(&to_queue); + return; + } + /* iq and pq requests are both oldest to newest */ + list_splice(&to_queue, &fiq->pending); + fuse_dev_wake_and_unlock(fiq); +} + +static int fuse_notify_resend(struct fuse_conn *fc) +{ + fuse_resend(fc); + return 0; +} + static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, unsigned int size, struct fuse_copy_state *cs) { @@ -1802,6 +1912,9 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, case FUSE_NOTIFY_DELETE: return fuse_notify_delete(fc, size, cs); + case FUSE_NOTIFY_RESEND: + return fuse_notify_resend(fc); + default: fuse_copy_finish(cs); return -EINVAL; @@ -2253,49 +2366,96 @@ static int fuse_device_clone(struct fuse_conn *fc, struct file *new) return 0; } -static long fuse_dev_ioctl(struct file *file, unsigned int cmd, - unsigned long arg) +static long fuse_dev_ioctl_clone(struct file *file, __u32 __user *argp) { int res; int oldfd; struct fuse_dev *fud = NULL; struct fd f; + if (get_user(oldfd, argp)) + return -EFAULT; + + f = fdget(oldfd); + if (!fd_file(f)) + return -EINVAL; + + /* + * Check against file->f_op because CUSE + * uses the same ioctl handler. + */ + if (fd_file(f)->f_op == file->f_op) + fud = fuse_get_dev(fd_file(f)); + + res = -EINVAL; + if (fud) { + mutex_lock(&fuse_mutex); + res = fuse_device_clone(fud->fc, file); + mutex_unlock(&fuse_mutex); + } + + fdput(f); + return res; +} + +static long fuse_dev_ioctl_backing_open(struct file *file, + struct fuse_backing_map __user *argp) +{ + struct fuse_dev *fud = fuse_get_dev(file); + struct fuse_backing_map map; + + if (!fud) + return -EPERM; + + if (!IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) + return -EOPNOTSUPP; + + if (copy_from_user(&map, argp, sizeof(map))) + return -EFAULT; + + return fuse_backing_open(fud->fc, &map); +} + +static long fuse_dev_ioctl_backing_close(struct file *file, __u32 __user *argp) +{ + struct fuse_dev *fud = fuse_get_dev(file); + int backing_id; + + if (!fud) + return -EPERM; + + if (!IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) + return -EOPNOTSUPP; + + if (get_user(backing_id, argp)) + return -EFAULT; + + return fuse_backing_close(fud->fc, backing_id); +} + +static long fuse_dev_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + void __user *argp = (void __user *)arg; + switch (cmd) { case FUSE_DEV_IOC_CLONE: - if (get_user(oldfd, (__u32 __user *)arg)) - return -EFAULT; + return fuse_dev_ioctl_clone(file, argp); - f = fdget(oldfd); - if (!f.file) - return -EINVAL; + case FUSE_DEV_IOC_BACKING_OPEN: + return fuse_dev_ioctl_backing_open(file, argp); + + case FUSE_DEV_IOC_BACKING_CLOSE: + return fuse_dev_ioctl_backing_close(file, argp); - /* - * Check against file->f_op because CUSE - * uses the same ioctl handler. - */ - if (f.file->f_op == file->f_op) - fud = fuse_get_dev(f.file); - - res = -EINVAL; - if (fud) { - mutex_lock(&fuse_mutex); - res = fuse_device_clone(fud->fc, file); - mutex_unlock(&fuse_mutex); - } - fdput(f); - break; default: - res = -ENOTTY; - break; + return -ENOTTY; } - return res; } const struct file_operations fuse_dev_operations = { .owner = THIS_MODULE, .open = fuse_dev_open, - .llseek = no_llseek, .read_iter = fuse_dev_read, .splice_read = fuse_dev_splice_read, .write_iter = fuse_dev_write, diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 95f9913a3537..ce7324d0d9ed 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -545,17 +545,21 @@ static u32 fuse_ext_size(size_t size) /* * This adds just a single supplementary group that matches the parent's group. */ -static int get_create_supp_group(struct inode *dir, struct fuse_in_arg *ext) +static int get_create_supp_group(struct mnt_idmap *idmap, + struct inode *dir, + struct fuse_in_arg *ext) { struct fuse_conn *fc = get_fuse_conn(dir); struct fuse_ext_header *xh; struct fuse_supp_groups *sg; kgid_t kgid = dir->i_gid; + vfsgid_t vfsgid = make_vfsgid(idmap, fc->user_ns, kgid); gid_t parent_gid = from_kgid(fc->user_ns, kgid); + u32 sg_len = fuse_ext_size(sizeof(*sg) + sizeof(sg->groups[0])); - if (parent_gid == (gid_t) -1 || gid_eq(kgid, current_fsgid()) || - !in_group_p(kgid)) + if (parent_gid == (gid_t) -1 || vfsgid_eq_kgid(vfsgid, current_fsgid()) || + !vfsgid_in_group_p(vfsgid)) return 0; xh = extend_arg(ext, sg_len); @@ -572,7 +576,8 @@ static int get_create_supp_group(struct inode *dir, struct fuse_in_arg *ext) return 0; } -static int get_create_ext(struct fuse_args *args, +static int get_create_ext(struct mnt_idmap *idmap, + struct fuse_args *args, struct inode *dir, struct dentry *dentry, umode_t mode) { @@ -583,7 +588,7 @@ static int get_create_ext(struct fuse_args *args, if (fc->init_security) err = get_security_context(dentry, mode, &ext); if (!err && fc->create_supp_group) - err = get_create_supp_group(dir, &ext); + err = get_create_supp_group(idmap, dir, &ext); if (!err && ext.size) { WARN_ON(args->in_numargs >= ARRAY_SIZE(args->in_args)); @@ -609,9 +614,9 @@ static void free_ext_value(struct fuse_args *args) * If the filesystem doesn't support this, then fall back to separate * 'mknod' + 'open' requests. */ -static int fuse_create_open(struct inode *dir, struct dentry *entry, - struct file *file, unsigned int flags, - umode_t mode, u32 opcode) +static int fuse_create_open(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *entry, struct file *file, + unsigned int flags, umode_t mode, u32 opcode) { int err; struct inode *inode; @@ -619,7 +624,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, FUSE_ARGS(args); struct fuse_forget_link *forget; struct fuse_create_in inarg; - struct fuse_open_out outopen; + struct fuse_open_out *outopenp; struct fuse_entry_out outentry; struct fuse_inode *fi; struct fuse_file *ff; @@ -634,7 +639,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, goto out_err; err = -ENOMEM; - ff = fuse_file_alloc(fm); + ff = fuse_file_alloc(fm, true); if (!ff) goto out_put_forget_req; @@ -663,14 +668,16 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, args.out_numargs = 2; args.out_args[0].size = sizeof(outentry); args.out_args[0].value = &outentry; - args.out_args[1].size = sizeof(outopen); - args.out_args[1].value = &outopen; + /* Store outarg for fuse_finish_open() */ + outopenp = &ff->args->open_outarg; + args.out_args[1].size = sizeof(*outopenp); + args.out_args[1].value = outopenp; - err = get_create_ext(&args, dir, entry, mode); + err = get_create_ext(idmap, &args, dir, entry, mode); if (err) goto out_free_ff; - err = fuse_simple_request(fm, &args); + err = fuse_simple_idmap_request(idmap, fm, &args); free_ext_value(&args); if (err) goto out_free_ff; @@ -680,9 +687,9 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, fuse_invalid_attr(&outentry.attr)) goto out_free_ff; - ff->fh = outopen.fh; + ff->fh = outopenp->fh; ff->nodeid = outentry.nodeid; - ff->open_flags = outopen.open_flags; + ff->open_flags = outopenp->open_flags; inode = fuse_iget(dir->i_sb, outentry.nodeid, outentry.generation, &outentry.attr, ATTR_TIMEOUT(&outentry), 0); if (!inode) { @@ -696,13 +703,15 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, d_instantiate(entry, inode); fuse_change_entry_timeout(entry, &outentry); fuse_dir_changed(dir); - err = finish_open(file, entry, generic_file_open); + err = generic_file_open(inode, file); + if (!err) { + file->private_data = ff; + err = finish_open(file, entry, fuse_finish_open); + } if (err) { fi = get_fuse_inode(inode); fuse_sync_release(fi, ff, flags); } else { - file->private_data = ff; - fuse_finish_open(inode, file); if (fm->fc->atomic_o_trunc && trunc) truncate_pagecache(inode, 0); else if (!(ff->open_flags & FOPEN_KEEP_CACHE)) @@ -725,6 +734,7 @@ static int fuse_atomic_open(struct inode *dir, struct dentry *entry, umode_t mode) { int err; + struct mnt_idmap *idmap = file_mnt_idmap(file); struct fuse_conn *fc = get_fuse_conn(dir); struct dentry *res = NULL; @@ -749,7 +759,7 @@ static int fuse_atomic_open(struct inode *dir, struct dentry *entry, if (fc->no_create) goto mknod; - err = fuse_create_open(dir, entry, file, flags, mode, FUSE_CREATE); + err = fuse_create_open(idmap, dir, entry, file, flags, mode, FUSE_CREATE); if (err == -ENOSYS) { fc->no_create = 1; goto mknod; @@ -760,7 +770,7 @@ out_dput: return err; mknod: - err = fuse_mknod(&nop_mnt_idmap, dir, entry, mode, 0); + err = fuse_mknod(idmap, dir, entry, mode, 0); if (err) goto out_dput; no_open: @@ -770,9 +780,9 @@ no_open: /* * Code shared between mknod, mkdir, symlink and link */ -static int create_new_entry(struct fuse_mount *fm, struct fuse_args *args, - struct inode *dir, struct dentry *entry, - umode_t mode) +static int create_new_entry(struct mnt_idmap *idmap, struct fuse_mount *fm, + struct fuse_args *args, struct inode *dir, + struct dentry *entry, umode_t mode) { struct fuse_entry_out outarg; struct inode *inode; @@ -794,12 +804,12 @@ static int create_new_entry(struct fuse_mount *fm, struct fuse_args *args, args->out_args[0].value = &outarg; if (args->opcode != FUSE_LINK) { - err = get_create_ext(args, dir, entry, mode); + err = get_create_ext(idmap, args, dir, entry, mode); if (err) goto out_put_forget_req; } - err = fuse_simple_request(fm, args); + err = fuse_simple_idmap_request(idmap, fm, args); free_ext_value(args); if (err) goto out_put_forget_req; @@ -860,13 +870,13 @@ static int fuse_mknod(struct mnt_idmap *idmap, struct inode *dir, args.in_args[0].value = &inarg; args.in_args[1].size = entry->d_name.len + 1; args.in_args[1].value = entry->d_name.name; - return create_new_entry(fm, &args, dir, entry, mode); + return create_new_entry(idmap, fm, &args, dir, entry, mode); } static int fuse_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *entry, umode_t mode, bool excl) { - return fuse_mknod(&nop_mnt_idmap, dir, entry, mode, 0); + return fuse_mknod(idmap, dir, entry, mode, 0); } static int fuse_tmpfile(struct mnt_idmap *idmap, struct inode *dir, @@ -878,7 +888,8 @@ static int fuse_tmpfile(struct mnt_idmap *idmap, struct inode *dir, if (fc->no_tmpfile) return -EOPNOTSUPP; - err = fuse_create_open(dir, file->f_path.dentry, file, file->f_flags, mode, FUSE_TMPFILE); + err = fuse_create_open(idmap, dir, file->f_path.dentry, file, + file->f_flags, mode, FUSE_TMPFILE); if (err == -ENOSYS) { fc->no_tmpfile = 1; err = -EOPNOTSUPP; @@ -905,7 +916,7 @@ static int fuse_mkdir(struct mnt_idmap *idmap, struct inode *dir, args.in_args[0].value = &inarg; args.in_args[1].size = entry->d_name.len + 1; args.in_args[1].value = entry->d_name.name; - return create_new_entry(fm, &args, dir, entry, S_IFDIR); + return create_new_entry(idmap, fm, &args, dir, entry, S_IFDIR); } static int fuse_symlink(struct mnt_idmap *idmap, struct inode *dir, @@ -921,7 +932,7 @@ static int fuse_symlink(struct mnt_idmap *idmap, struct inode *dir, args.in_args[0].value = entry->d_name.name; args.in_args[1].size = len; args.in_args[1].value = link; - return create_new_entry(fm, &args, dir, entry, S_IFLNK); + return create_new_entry(idmap, fm, &args, dir, entry, S_IFLNK); } void fuse_flush_time_update(struct inode *inode) @@ -1015,7 +1026,7 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry) return err; } -static int fuse_rename_common(struct inode *olddir, struct dentry *oldent, +static int fuse_rename_common(struct mnt_idmap *idmap, struct inode *olddir, struct dentry *oldent, struct inode *newdir, struct dentry *newent, unsigned int flags, int opcode, size_t argsize) { @@ -1036,7 +1047,7 @@ static int fuse_rename_common(struct inode *olddir, struct dentry *oldent, args.in_args[1].value = oldent->d_name.name; args.in_args[2].size = newent->d_name.len + 1; args.in_args[2].value = newent->d_name.name; - err = fuse_simple_request(fm, &args); + err = fuse_simple_idmap_request(idmap, fm, &args); if (!err) { /* ctime changes */ fuse_update_ctime(d_inode(oldent)); @@ -1082,7 +1093,8 @@ static int fuse_rename2(struct mnt_idmap *idmap, struct inode *olddir, if (fc->no_rename2 || fc->minor < 23) return -EINVAL; - err = fuse_rename_common(olddir, oldent, newdir, newent, flags, + err = fuse_rename_common((flags & RENAME_WHITEOUT) ? idmap : &invalid_mnt_idmap, + olddir, oldent, newdir, newent, flags, FUSE_RENAME2, sizeof(struct fuse_rename2_in)); if (err == -ENOSYS) { @@ -1090,7 +1102,7 @@ static int fuse_rename2(struct mnt_idmap *idmap, struct inode *olddir, err = -EINVAL; } } else { - err = fuse_rename_common(olddir, oldent, newdir, newent, 0, + err = fuse_rename_common(&invalid_mnt_idmap, olddir, oldent, newdir, newent, 0, FUSE_RENAME, sizeof(struct fuse_rename_in)); } @@ -1115,27 +1127,33 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, args.in_args[0].value = &inarg; args.in_args[1].size = newent->d_name.len + 1; args.in_args[1].value = newent->d_name.name; - err = create_new_entry(fm, &args, newdir, newent, inode->i_mode); + err = create_new_entry(&invalid_mnt_idmap, fm, &args, newdir, newent, inode->i_mode); if (!err) fuse_update_ctime_in_cache(inode); else if (err == -EINTR) fuse_invalidate_attr(inode); + if (err == -ENOSYS) + err = -EPERM; return err; } -static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr, - struct kstat *stat) +static void fuse_fillattr(struct mnt_idmap *idmap, struct inode *inode, + struct fuse_attr *attr, struct kstat *stat) { unsigned int blkbits; struct fuse_conn *fc = get_fuse_conn(inode); + vfsuid_t vfsuid = make_vfsuid(idmap, fc->user_ns, + make_kuid(fc->user_ns, attr->uid)); + vfsgid_t vfsgid = make_vfsgid(idmap, fc->user_ns, + make_kgid(fc->user_ns, attr->gid)); stat->dev = inode->i_sb->s_dev; stat->ino = attr->ino; stat->mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777); stat->nlink = attr->nlink; - stat->uid = make_kuid(fc->user_ns, attr->uid); - stat->gid = make_kgid(fc->user_ns, attr->gid); + stat->uid = vfsuid_into_kuid(vfsuid); + stat->gid = vfsgid_into_kgid(vfsgid); stat->rdev = inode->i_rdev; stat->atime.tv_sec = attr->atime; stat->atime.tv_nsec = attr->atimensec; @@ -1174,8 +1192,8 @@ static void fuse_statx_to_attr(struct fuse_statx *sx, struct fuse_attr *attr) attr->blksize = sx->blksize; } -static int fuse_do_statx(struct inode *inode, struct file *file, - struct kstat *stat) +static int fuse_do_statx(struct mnt_idmap *idmap, struct inode *inode, + struct file *file, struct kstat *stat) { int err; struct fuse_attr attr; @@ -1228,15 +1246,15 @@ static int fuse_do_statx(struct inode *inode, struct file *file, stat->result_mask = sx->mask & (STATX_BASIC_STATS | STATX_BTIME); stat->btime.tv_sec = sx->btime.tv_sec; stat->btime.tv_nsec = min_t(u32, sx->btime.tv_nsec, NSEC_PER_SEC - 1); - fuse_fillattr(inode, &attr, stat); + fuse_fillattr(idmap, inode, &attr, stat); stat->result_mask |= STATX_TYPE; } return 0; } -static int fuse_do_getattr(struct inode *inode, struct kstat *stat, - struct file *file) +static int fuse_do_getattr(struct mnt_idmap *idmap, struct inode *inode, + struct kstat *stat, struct file *file) { int err; struct fuse_getattr_in inarg; @@ -1275,15 +1293,15 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat, ATTR_TIMEOUT(&outarg), attr_version); if (stat) - fuse_fillattr(inode, &outarg.attr, stat); + fuse_fillattr(idmap, inode, &outarg.attr, stat); } } return err; } -static int fuse_update_get_attr(struct inode *inode, struct file *file, - struct kstat *stat, u32 request_mask, - unsigned int flags) +static int fuse_update_get_attr(struct mnt_idmap *idmap, struct inode *inode, + struct file *file, struct kstat *stat, + u32 request_mask, unsigned int flags) { struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_conn *fc = get_fuse_conn(inode); @@ -1314,17 +1332,17 @@ retry: forget_all_cached_acls(inode); /* Try statx if BTIME is requested */ if (!fc->no_statx && (request_mask & ~STATX_BASIC_STATS)) { - err = fuse_do_statx(inode, file, stat); + err = fuse_do_statx(idmap, inode, file, stat); if (err == -ENOSYS) { fc->no_statx = 1; err = 0; goto retry; } } else { - err = fuse_do_getattr(inode, stat, file); + err = fuse_do_getattr(idmap, inode, stat, file); } } else if (stat) { - generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); + generic_fillattr(idmap, request_mask, inode, stat); stat->mode = fi->orig_i_mode; stat->ino = fi->orig_ino; if (test_bit(FUSE_I_BTIME, &fi->state)) { @@ -1338,7 +1356,7 @@ retry: int fuse_update_attributes(struct inode *inode, struct file *file, u32 mask) { - return fuse_update_get_attr(inode, file, NULL, mask, 0); + return fuse_update_get_attr(&nop_mnt_idmap, inode, file, NULL, mask, 0); } int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid, @@ -1458,6 +1476,14 @@ static int fuse_access(struct inode *inode, int mask) BUG_ON(mask & MAY_NOT_BLOCK); + /* + * We should not send FUSE_ACCESS to the userspace + * when idmapped mounts are enabled as for this case + * we have fc->default_permissions = 1 and access + * permission checks are done on the kernel side. + */ + WARN_ON_ONCE(!(fm->sb->s_iflags & SB_I_NOIDMAP)); + if (fm->fc->no_access) return 0; @@ -1482,7 +1508,7 @@ static int fuse_perm_getattr(struct inode *inode, int mask) return -ECHILD; forget_all_cached_acls(inode); - return fuse_do_getattr(inode, NULL, NULL); + return fuse_do_getattr(&nop_mnt_idmap, inode, NULL, NULL); } /* @@ -1490,7 +1516,7 @@ static int fuse_perm_getattr(struct inode *inode, int mask) * * 1) Local access checking ('default_permissions' mount option) based * on file mode. This is the plain old disk filesystem permission - * modell. + * model. * * 2) "Remote" access checking, where server is responsible for * checking permission in each inode operation. An exception to this @@ -1530,7 +1556,7 @@ static int fuse_permission(struct mnt_idmap *idmap, } if (fc->default_permissions) { - err = generic_permission(&nop_mnt_idmap, inode, mask); + err = generic_permission(idmap, inode, mask); /* If permission is denied, try to refresh file attributes. This is also needed, because the root @@ -1538,7 +1564,7 @@ static int fuse_permission(struct mnt_idmap *idmap, if (err == -EACCES && !refreshed) { err = fuse_perm_getattr(inode, mask); if (!err) - err = generic_permission(&nop_mnt_idmap, + err = generic_permission(idmap, inode, mask); } @@ -1608,7 +1634,7 @@ static const char *fuse_get_link(struct dentry *dentry, struct inode *inode, goto out_err; if (fc->cache_symlinks) - return page_get_link(dentry, inode, callback); + return page_get_link_raw(dentry, inode, callback); err = -ECHILD; if (!dentry) @@ -1635,7 +1661,32 @@ out_err: static int fuse_dir_open(struct inode *inode, struct file *file) { - return fuse_open_common(inode, file, true); + struct fuse_mount *fm = get_fuse_mount(inode); + int err; + + if (fuse_is_bad(inode)) + return -EIO; + + err = generic_file_open(inode, file); + if (err) + return err; + + err = fuse_do_open(fm, get_node_id(inode), file, true); + if (!err) { + struct fuse_file *ff = file->private_data; + + /* + * Keep handling FOPEN_STREAM and FOPEN_NONSEEKABLE for + * directories for backward compatibility, though it's unlikely + * to be useful. + */ + if (ff->open_flags & (FOPEN_STREAM | FOPEN_NONSEEKABLE)) + nonseekable_open(inode, file); + if (!(ff->open_flags & FOPEN_KEEP_CACHE)) + invalidate_inode_pages2(inode->i_mapping); + } + + return err; } static int fuse_dir_release(struct inode *inode, struct file *file) @@ -1711,17 +1762,29 @@ static bool update_mtime(unsigned ivalid, bool trust_local_mtime) return true; } -static void iattr_to_fattr(struct fuse_conn *fc, struct iattr *iattr, - struct fuse_setattr_in *arg, bool trust_local_cmtime) +static void iattr_to_fattr(struct mnt_idmap *idmap, struct fuse_conn *fc, + struct iattr *iattr, struct fuse_setattr_in *arg, + bool trust_local_cmtime) { unsigned ivalid = iattr->ia_valid; if (ivalid & ATTR_MODE) arg->valid |= FATTR_MODE, arg->mode = iattr->ia_mode; - if (ivalid & ATTR_UID) - arg->valid |= FATTR_UID, arg->uid = from_kuid(fc->user_ns, iattr->ia_uid); - if (ivalid & ATTR_GID) - arg->valid |= FATTR_GID, arg->gid = from_kgid(fc->user_ns, iattr->ia_gid); + + if (ivalid & ATTR_UID) { + kuid_t fsuid = from_vfsuid(idmap, fc->user_ns, iattr->ia_vfsuid); + + arg->valid |= FATTR_UID; + arg->uid = from_kuid(fc->user_ns, fsuid); + } + + if (ivalid & ATTR_GID) { + kgid_t fsgid = from_vfsgid(idmap, fc->user_ns, iattr->ia_vfsgid); + + arg->valid |= FATTR_GID; + arg->gid = from_kgid(fc->user_ns, fsgid); + } + if (ivalid & ATTR_SIZE) arg->valid |= FATTR_SIZE, arg->size = iattr->ia_size; if (ivalid & ATTR_ATIME) { @@ -1817,12 +1880,12 @@ int fuse_flush_times(struct inode *inode, struct fuse_file *ff) memset(&outarg, 0, sizeof(outarg)); inarg.valid = FATTR_MTIME; - inarg.mtime = inode->i_mtime.tv_sec; - inarg.mtimensec = inode->i_mtime.tv_nsec; + inarg.mtime = inode_get_mtime_sec(inode); + inarg.mtimensec = inode_get_mtime_nsec(inode); if (fm->fc->minor >= 23) { inarg.valid |= FATTR_CTIME; - inarg.ctime = inode_get_ctime(inode).tv_sec; - inarg.ctimensec = inode_get_ctime(inode).tv_nsec; + inarg.ctime = inode_get_ctime_sec(inode); + inarg.ctimensec = inode_get_ctime_nsec(inode); } if (ff) { inarg.valid |= FATTR_FH; @@ -1841,8 +1904,8 @@ int fuse_flush_times(struct inode *inode, struct fuse_file *ff) * vmtruncate() doesn't allow for this case, so do the rlimit checking * and the actual truncation by hand. */ -int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, - struct file *file) +int fuse_do_setattr(struct mnt_idmap *idmap, struct dentry *dentry, + struct iattr *attr, struct file *file) { struct inode *inode = d_inode(dentry); struct fuse_mount *fm = get_fuse_mount(inode); @@ -1858,11 +1921,12 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, int err; bool trust_local_cmtime = is_wb; bool fault_blocked = false; + u64 attr_version; if (!fc->default_permissions) attr->ia_valid |= ATTR_FORCE; - err = setattr_prepare(&nop_mnt_idmap, dentry, attr); + err = setattr_prepare(idmap, dentry, attr); if (err) return err; @@ -1875,7 +1939,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, if (FUSE_IS_DAX(inode) && is_truncate) { filemap_invalidate_lock(mapping); fault_blocked = true; - err = fuse_dax_break_layouts(inode, 0, 0); + err = fuse_dax_break_layouts(inode, 0, -1); if (err) { filemap_invalidate_unlock(mapping); return err; @@ -1921,7 +1985,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, memset(&inarg, 0, sizeof(inarg)); memset(&outarg, 0, sizeof(outarg)); - iattr_to_fattr(fc, attr, &inarg, trust_local_cmtime); + iattr_to_fattr(idmap, fc, attr, &inarg, trust_local_cmtime); if (file) { struct fuse_file *ff = file->private_data; inarg.valid |= FATTR_FH; @@ -1942,6 +2006,8 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, if (fc->handle_killpriv_v2 && !capable(CAP_FSETID)) inarg.valid |= FATTR_KILL_SUIDGID; } + + attr_version = fuse_get_attr_version(fm->fc); fuse_setattr_fill(fc, &args, inode, &inarg, &outarg); err = fuse_simple_request(fm, &args); if (err) { @@ -1961,12 +2027,20 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, /* the kernel maintains i_mtime locally */ if (trust_local_cmtime) { if (attr->ia_valid & ATTR_MTIME) - inode->i_mtime = attr->ia_mtime; + inode_set_mtime_to_ts(inode, attr->ia_mtime); if (attr->ia_valid & ATTR_CTIME) inode_set_ctime_to_ts(inode, attr->ia_ctime); /* FIXME: clear I_DIRTY_SYNC? */ } + if (fi->attr_version > attr_version) { + /* + * Apply attributes, for example for fsnotify_change(), but set + * attribute timeout to zero. + */ + outarg.attr_valid = outarg.attr_valid_nsec = 0; + } + fuse_change_attributes_common(inode, &outarg.attr, NULL, ATTR_TIMEOUT(&outarg), fuse_get_cache_mask(inode)); @@ -2038,7 +2112,7 @@ static int fuse_setattr(struct mnt_idmap *idmap, struct dentry *entry, * ia_mode calculation may have used stale i_mode. * Refresh and recalculate. */ - ret = fuse_do_getattr(inode, NULL, file); + ret = fuse_do_getattr(idmap, inode, NULL, file); if (ret) return ret; @@ -2056,7 +2130,7 @@ static int fuse_setattr(struct mnt_idmap *idmap, struct dentry *entry, if (!attr->ia_valid) return 0; - ret = fuse_do_setattr(entry, attr, file); + ret = fuse_do_setattr(idmap, entry, attr, file); if (!ret) { /* * If filesystem supports acls it may have updated acl xattrs in @@ -2095,7 +2169,7 @@ static int fuse_getattr(struct mnt_idmap *idmap, return -EACCES; } - return fuse_update_get_attr(inode, NULL, stat, request_mask, flags); + return fuse_update_get_attr(idmap, inode, NULL, stat, request_mask, flags); } static const struct inode_operations fuse_dir_inode_operations = { diff --git a/fs/fuse/file.c b/fs/fuse/file.c index ceb9f7d23038..f597f7e68e50 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -19,6 +19,8 @@ #include <linux/uio.h> #include <linux/fs.h> #include <linux/filelock.h> +#include <linux/splice.h> +#include <linux/task_io_accounting_ops.h> static int fuse_send_open(struct fuse_mount *fm, u64 nodeid, unsigned int open_flags, int opcode, @@ -49,13 +51,7 @@ static int fuse_send_open(struct fuse_mount *fm, u64 nodeid, return fuse_simple_request(fm, &args); } -struct fuse_release_args { - struct fuse_args args; - struct fuse_release_in inarg; - struct inode *inode; -}; - -struct fuse_file *fuse_file_alloc(struct fuse_mount *fm) +struct fuse_file *fuse_file_alloc(struct fuse_mount *fm, bool release) { struct fuse_file *ff; @@ -64,15 +60,15 @@ struct fuse_file *fuse_file_alloc(struct fuse_mount *fm) return NULL; ff->fm = fm; - ff->release_args = kzalloc(sizeof(*ff->release_args), - GFP_KERNEL_ACCOUNT); - if (!ff->release_args) { - kfree(ff); - return NULL; + if (release) { + ff->args = kzalloc(sizeof(*ff->args), GFP_KERNEL_ACCOUNT); + if (!ff->args) { + kfree(ff); + return NULL; + } } INIT_LIST_HEAD(&ff->write_entry); - mutex_init(&ff->readdir.lock); refcount_set(&ff->count, 1); RB_CLEAR_NODE(&ff->polled_node); init_waitqueue_head(&ff->poll_wait); @@ -84,8 +80,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_mount *fm) void fuse_file_free(struct fuse_file *ff) { - kfree(ff->release_args); - mutex_destroy(&ff->readdir.lock); + kfree(ff->args); kfree(ff); } @@ -104,14 +99,17 @@ static void fuse_release_end(struct fuse_mount *fm, struct fuse_args *args, kfree(ra); } -static void fuse_file_put(struct fuse_file *ff, bool sync, bool isdir) +static void fuse_file_put(struct fuse_file *ff, bool sync) { if (refcount_dec_and_test(&ff->count)) { - struct fuse_args *args = &ff->release_args->args; + struct fuse_release_args *ra = &ff->args->release_args; + struct fuse_args *args = (ra ? &ra->args : NULL); - if (isdir ? ff->fm->fc->no_opendir : ff->fm->fc->no_open) { - /* Do nothing when client does not implement 'open' */ - fuse_release_end(ff->fm, args, 0); + if (ra && ra->inode) + fuse_file_io_release(ff, ra->inode); + + if (!args) { + /* Do nothing when server does not implement 'open' */ } else if (sync) { fuse_simple_request(ff->fm, args); fuse_release_end(ff->fm, args, 0); @@ -131,27 +129,31 @@ struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid, struct fuse_conn *fc = fm->fc; struct fuse_file *ff; int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN; + bool open = isdir ? !fc->no_opendir : !fc->no_open; - ff = fuse_file_alloc(fm); + ff = fuse_file_alloc(fm, open); if (!ff) return ERR_PTR(-ENOMEM); ff->fh = 0; /* Default for no-open */ ff->open_flags = FOPEN_KEEP_CACHE | (isdir ? FOPEN_CACHE_DIR : 0); - if (isdir ? !fc->no_opendir : !fc->no_open) { - struct fuse_open_out outarg; + if (open) { + /* Store outarg for fuse_finish_open() */ + struct fuse_open_out *outargp = &ff->args->open_outarg; int err; - err = fuse_send_open(fm, nodeid, open_flags, opcode, &outarg); + err = fuse_send_open(fm, nodeid, open_flags, opcode, outargp); if (!err) { - ff->fh = outarg.fh; - ff->open_flags = outarg.open_flags; - + ff->fh = outargp->fh; + ff->open_flags = outargp->open_flags; } else if (err != -ENOSYS) { fuse_file_free(ff); return ERR_PTR(err); } else { + /* No release needed */ + kfree(ff->args); + ff->args = NULL; if (isdir) fc->no_opendir = 1; else @@ -194,40 +196,50 @@ static void fuse_link_write_file(struct file *file) spin_unlock(&fi->lock); } -void fuse_finish_open(struct inode *inode, struct file *file) +int fuse_finish_open(struct inode *inode, struct file *file) { struct fuse_file *ff = file->private_data; struct fuse_conn *fc = get_fuse_conn(inode); + int err; + + err = fuse_file_io_open(file, inode); + if (err) + return err; if (ff->open_flags & FOPEN_STREAM) stream_open(inode, file); else if (ff->open_flags & FOPEN_NONSEEKABLE) nonseekable_open(inode, file); - if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) { - struct fuse_inode *fi = get_fuse_inode(inode); - - spin_lock(&fi->lock); - fi->attr_version = atomic64_inc_return(&fc->attr_version); - i_size_write(inode, 0); - spin_unlock(&fi->lock); - file_update_time(file); - fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE); - } if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache) fuse_link_write_file(file); + + return 0; } -int fuse_open_common(struct inode *inode, struct file *file, bool isdir) +static void fuse_truncate_update_attr(struct inode *inode, struct file *file) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + + spin_lock(&fi->lock); + fi->attr_version = atomic64_inc_return(&fc->attr_version); + i_size_write(inode, 0); + spin_unlock(&fi->lock); + file_update_time(file); + fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE); +} + +static int fuse_open(struct inode *inode, struct file *file) { struct fuse_mount *fm = get_fuse_mount(inode); + struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_conn *fc = fm->fc; + struct fuse_file *ff; int err; - bool is_wb_truncate = (file->f_flags & O_TRUNC) && - fc->atomic_o_trunc && - fc->writeback_cache; - bool dax_truncate = (file->f_flags & O_TRUNC) && - fc->atomic_o_trunc && FUSE_IS_DAX(inode); + bool is_truncate = (file->f_flags & O_TRUNC) && fc->atomic_o_trunc; + bool is_wb_truncate = is_truncate && fc->writeback_cache; + bool dax_truncate = is_truncate && FUSE_IS_DAX(inode); if (fuse_is_bad(inode)) return -EIO; @@ -241,7 +253,7 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir) if (dax_truncate) { filemap_invalidate_lock(inode->i_mapping); - err = fuse_dax_break_layouts(inode, 0, 0); + err = fuse_dax_break_layouts(inode, 0, -1); if (err) goto out_inode_unlock; } @@ -249,16 +261,20 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir) if (is_wb_truncate || dax_truncate) fuse_set_nowrite(inode); - err = fuse_do_open(fm, get_node_id(inode), file, isdir); - if (!err) - fuse_finish_open(inode, file); + err = fuse_do_open(fm, get_node_id(inode), file, false); + if (!err) { + ff = file->private_data; + err = fuse_finish_open(inode, file); + if (err) + fuse_sync_release(fi, ff, file->f_flags); + else if (is_truncate) + fuse_truncate_update_attr(inode, file); + } if (is_wb_truncate || dax_truncate) fuse_release_nowrite(inode); if (!err) { - struct fuse_file *ff = file->private_data; - - if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) + if (is_truncate) truncate_pagecache(inode, 0); else if (!(ff->open_flags & FOPEN_KEEP_CACHE)) invalidate_inode_pages2(inode->i_mapping); @@ -273,10 +289,13 @@ out_inode_unlock: } static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff, - unsigned int flags, int opcode) + unsigned int flags, int opcode, bool sync) { struct fuse_conn *fc = ff->fm->fc; - struct fuse_release_args *ra = ff->release_args; + struct fuse_release_args *ra = &ff->args->release_args; + + if (fuse_file_passthrough(ff)) + fuse_passthrough_release(ff, fuse_inode_backing(fi)); /* Inode is NULL on error path of fuse_create_open() */ if (likely(fi)) { @@ -291,6 +310,11 @@ static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff, wake_up_interruptible_all(&ff->poll_wait); + if (!ra) + return; + + /* ff->args was used for open outarg */ + memset(ff->args, 0, sizeof(*ff->args)); ra->inarg.fh = ff->fh; ra->inarg.flags = flags; ra->args.in_numargs = 1; @@ -300,23 +324,28 @@ static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff, ra->args.nodeid = ff->nodeid; ra->args.force = true; ra->args.nocreds = true; + + /* + * Hold inode until release is finished. + * From fuse_sync_release() the refcount is 1 and everything's + * synchronous, so we are fine with not doing igrab() here. + */ + ra->inode = sync ? NULL : igrab(&fi->inode); } void fuse_file_release(struct inode *inode, struct fuse_file *ff, unsigned int open_flags, fl_owner_t id, bool isdir) { struct fuse_inode *fi = get_fuse_inode(inode); - struct fuse_release_args *ra = ff->release_args; + struct fuse_release_args *ra = &ff->args->release_args; int opcode = isdir ? FUSE_RELEASEDIR : FUSE_RELEASE; - fuse_prepare_release(fi, ff, open_flags, opcode); + fuse_prepare_release(fi, ff, open_flags, opcode, false); - if (ff->flock) { + if (ra && ff->flock) { ra->inarg.release_flags |= FUSE_RELEASE_FLOCK_UNLOCK; ra->inarg.lock_owner = fuse_lock_owner_id(ff->fm->fc, id); } - /* Hold inode until release is finished */ - ra->inode = igrab(inode); /* * Normally this will send the RELEASE request, however if @@ -327,7 +356,7 @@ void fuse_file_release(struct inode *inode, struct fuse_file *ff, * synchronous RELEASE is allowed (and desirable) in this case * because the server can be trusted not to screw up. */ - fuse_file_put(ff, ff->fm->fc->destroy, isdir); + fuse_file_put(ff, ff->fm->fc->destroy); } void fuse_release_common(struct file *file, bool isdir) @@ -336,11 +365,6 @@ void fuse_release_common(struct file *file, bool isdir) (fl_owner_t) file, isdir); } -static int fuse_open(struct inode *inode, struct file *file) -{ - return fuse_open_common(inode, file, false); -} - static int fuse_release(struct inode *inode, struct file *file) { struct fuse_conn *fc = get_fuse_conn(inode); @@ -362,12 +386,8 @@ void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff, unsigned int flags) { WARN_ON(refcount_read(&ff->count) > 1); - fuse_prepare_release(fi, ff, flags, FUSE_RELEASE); - /* - * iput(NULL) is a no-op and since the refcount is 1 and everything's - * synchronous, we are fine with not doing igrab() here" - */ - fuse_file_put(ff, true, false); + fuse_prepare_release(fi, ff, flags, FUSE_RELEASE, true); + fuse_file_put(ff, true); } EXPORT_SYMBOL_GPL(fuse_sync_release); @@ -428,9 +448,6 @@ static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi, /* * Check if any page in a range is under writeback - * - * This is currently done by walking the list of writepage requests - * for the inode, which can be pretty inefficient. */ static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from, pgoff_t idx_to) @@ -438,6 +455,9 @@ static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from, struct fuse_inode *fi = get_fuse_inode(inode); bool found; + if (RB_EMPTY_ROOT(&fi->writepages)) + return false; + spin_lock(&fi->lock); found = fuse_find_writeback(fi, idx_from, idx_to); spin_unlock(&fi->lock); @@ -625,7 +645,7 @@ void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos, args->out_args[0].size = count; } -static void fuse_release_user_pages(struct fuse_args_pages *ap, +static void fuse_release_user_pages(struct fuse_args_pages *ap, ssize_t nres, bool should_dirty) { unsigned int i; @@ -633,8 +653,12 @@ static void fuse_release_user_pages(struct fuse_args_pages *ap, for (i = 0; i < ap->num_pages; i++) { if (should_dirty) set_page_dirty_lock(ap->pages[i]); - put_page(ap->pages[i]); + if (ap->args.is_pinned) + unpin_user_page(ap->pages[i]); } + + if (nres > 0 && ap->args.invalidate_vmap) + invalidate_kernel_vmap_range(ap->args.vmap_base, nres); } static void fuse_io_release(struct kref *kref) @@ -733,25 +757,29 @@ static void fuse_aio_complete_req(struct fuse_mount *fm, struct fuse_args *args, struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args); struct fuse_io_priv *io = ia->io; ssize_t pos = -1; - - fuse_release_user_pages(&ia->ap, io->should_dirty); + size_t nres; if (err) { /* Nothing */ } else if (io->write) { if (ia->write.out.size > ia->write.in.size) { err = -EIO; - } else if (ia->write.in.size != ia->write.out.size) { - pos = ia->write.in.offset - io->offset + - ia->write.out.size; + } else { + nres = ia->write.out.size; + if (ia->write.in.size != ia->write.out.size) + pos = ia->write.in.offset - io->offset + + ia->write.out.size; } } else { u32 outsize = args->out_args[0].size; + nres = outsize; if (ia->read.in.size != outsize) pos = ia->read.in.offset - io->offset + outsize; } + fuse_release_user_pages(&ia->ap, err ?: nres, io->should_dirty); + fuse_aio_complete(io, err, pos); fuse_io_free(ia); } @@ -914,17 +942,13 @@ static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args, } for (i = 0; i < ap->num_pages; i++) { - struct page *page = ap->pages[i]; + struct folio *folio = page_folio(ap->pages[i]); - if (!err) - SetPageUptodate(page); - else - SetPageError(page); - unlock_page(page); - put_page(page); + folio_end_read(folio, !err); + folio_put(folio); } if (ia->ff) - fuse_file_put(ia->ff, false, false); + fuse_file_put(ia->ff, false); fuse_io_free(ia); } @@ -1298,13 +1322,94 @@ static ssize_t fuse_perform_write(struct kiocb *iocb, struct iov_iter *ii) return res; } +static bool fuse_io_past_eof(struct kiocb *iocb, struct iov_iter *iter) +{ + struct inode *inode = file_inode(iocb->ki_filp); + + return iocb->ki_pos + iov_iter_count(iter) > i_size_read(inode); +} + +/* + * @return true if an exclusive lock for direct IO writes is needed + */ +static bool fuse_dio_wr_exclusive_lock(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct fuse_file *ff = file->private_data; + struct inode *inode = file_inode(iocb->ki_filp); + struct fuse_inode *fi = get_fuse_inode(inode); + + /* Server side has to advise that it supports parallel dio writes. */ + if (!(ff->open_flags & FOPEN_PARALLEL_DIRECT_WRITES)) + return true; + + /* + * Append will need to know the eventual EOF - always needs an + * exclusive lock. + */ + if (iocb->ki_flags & IOCB_APPEND) + return true; + + /* shared locks are not allowed with parallel page cache IO */ + if (test_bit(FUSE_I_CACHE_IO_MODE, &fi->state)) + return true; + + /* Parallel dio beyond EOF is not supported, at least for now. */ + if (fuse_io_past_eof(iocb, from)) + return true; + + return false; +} + +static void fuse_dio_lock(struct kiocb *iocb, struct iov_iter *from, + bool *exclusive) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct fuse_inode *fi = get_fuse_inode(inode); + + *exclusive = fuse_dio_wr_exclusive_lock(iocb, from); + if (*exclusive) { + inode_lock(inode); + } else { + inode_lock_shared(inode); + /* + * New parallal dio allowed only if inode is not in caching + * mode and denies new opens in caching mode. This check + * should be performed only after taking shared inode lock. + * Previous past eof check was without inode lock and might + * have raced, so check it again. + */ + if (fuse_io_past_eof(iocb, from) || + fuse_inode_uncached_io_start(fi, NULL) != 0) { + inode_unlock_shared(inode); + inode_lock(inode); + *exclusive = true; + } + } +} + +static void fuse_dio_unlock(struct kiocb *iocb, bool exclusive) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct fuse_inode *fi = get_fuse_inode(inode); + + if (exclusive) { + inode_unlock(inode); + } else { + /* Allow opens in caching mode after last parallel dio end */ + fuse_inode_uncached_io_end(fi); + inode_unlock_shared(inode); + } +} + static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; + struct mnt_idmap *idmap = file_mnt_idmap(file); struct address_space *mapping = file->f_mapping; ssize_t written = 0; struct inode *inode = mapping->host; - ssize_t err; + ssize_t err, count; struct fuse_conn *fc = get_fuse_conn(inode); if (fc->writeback_cache) { @@ -1315,7 +1420,7 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) return err; if (fc->handle_killpriv_v2 && - setattr_should_drop_suidgid(&nop_mnt_idmap, + setattr_should_drop_suidgid(idmap, file_inode(file))) { goto writethrough; } @@ -1326,10 +1431,12 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) writethrough: inode_lock(inode); - err = generic_write_checks(iocb, from); + err = count = generic_write_checks(iocb, from); if (err <= 0) goto out; + task_io_account_write(count); + err = file_remove_privs(file); if (err) goto out; @@ -1368,33 +1475,49 @@ static inline size_t fuse_get_frag_size(const struct iov_iter *ii, static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii, size_t *nbytesp, int write, - unsigned int max_pages) + unsigned int max_pages, + bool use_pages_for_kvec_io) { + bool flush_or_invalidate = false; size_t nbytes = 0; /* # bytes already packed in req */ ssize_t ret = 0; - /* Special case for kernel I/O: can copy directly into the buffer */ + /* Special case for kernel I/O: can copy directly into the buffer. + * However if the implementation of fuse_conn requires pages instead of + * pointer (e.g., virtio-fs), use iov_iter_extract_pages() instead. + */ if (iov_iter_is_kvec(ii)) { - unsigned long user_addr = fuse_get_user_addr(ii); - size_t frag_size = fuse_get_frag_size(ii, *nbytesp); + void *user_addr = (void *)fuse_get_user_addr(ii); - if (write) - ap->args.in_args[1].value = (void *) user_addr; - else - ap->args.out_args[0].value = (void *) user_addr; + if (!use_pages_for_kvec_io) { + size_t frag_size = fuse_get_frag_size(ii, *nbytesp); - iov_iter_advance(ii, frag_size); - *nbytesp = frag_size; - return 0; + if (write) + ap->args.in_args[1].value = user_addr; + else + ap->args.out_args[0].value = user_addr; + + iov_iter_advance(ii, frag_size); + *nbytesp = frag_size; + return 0; + } + + if (is_vmalloc_addr(user_addr)) { + ap->args.vmap_base = user_addr; + flush_or_invalidate = true; + } } while (nbytes < *nbytesp && ap->num_pages < max_pages) { unsigned npages; size_t start; - ret = iov_iter_get_pages2(ii, &ap->pages[ap->num_pages], - *nbytesp - nbytes, - max_pages - ap->num_pages, - &start); + struct page **pt_pages; + + pt_pages = &ap->pages[ap->num_pages]; + ret = iov_iter_extract_pages(ii, &pt_pages, + *nbytesp - nbytes, + max_pages - ap->num_pages, + 0, &start); if (ret < 0) break; @@ -1411,6 +1534,11 @@ static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii, (PAGE_SIZE - ret) & (PAGE_SIZE - 1); } + if (write && flush_or_invalidate) + flush_kernel_vmap_range(ap->args.vmap_base, nbytes); + + ap->args.invalidate_vmap = !write && flush_or_invalidate; + ap->args.is_pinned = iov_iter_extract_will_pin(ii); ap->args.user_pages = true; if (write) ap->args.in_pages = true; @@ -1478,7 +1606,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, size_t nbytes = min(count, nmax); err = fuse_get_user_pages(&ia->ap, iter, &nbytes, write, - max_pages); + max_pages, fc->use_pages_for_kvec_io); if (err && !nbytes) break; @@ -1492,7 +1620,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, } if (!io->async || nres < 0) { - fuse_release_user_pages(&ia->ap, io->should_dirty); + fuse_release_user_pages(&ia->ap, nres, io->should_dirty); fuse_io_free(ia); } ia = NULL; @@ -1557,51 +1685,17 @@ static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to) return res; } -static bool fuse_direct_write_extending_i_size(struct kiocb *iocb, - struct iov_iter *iter) -{ - struct inode *inode = file_inode(iocb->ki_filp); - - return iocb->ki_pos + iov_iter_count(iter) > i_size_read(inode); -} - static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); - struct file *file = iocb->ki_filp; - struct fuse_file *ff = file->private_data; struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); ssize_t res; - bool exclusive_lock = - !(ff->open_flags & FOPEN_PARALLEL_DIRECT_WRITES) || - get_fuse_conn(inode)->direct_io_allow_mmap || - iocb->ki_flags & IOCB_APPEND || - fuse_direct_write_extending_i_size(iocb, from); - - /* - * Take exclusive lock if - * - Parallel direct writes are disabled - a user space decision - * - Parallel direct writes are enabled and i_size is being extended. - * - Shared mmap on direct_io file is supported (FUSE_DIRECT_IO_ALLOW_MMAP). - * This might not be needed at all, but needs further investigation. - */ - if (exclusive_lock) - inode_lock(inode); - else { - inode_lock_shared(inode); - - /* A race with truncate might have come up as the decision for - * the lock type was done without holding the lock, check again. - */ - if (fuse_direct_write_extending_i_size(iocb, from)) { - inode_unlock_shared(inode); - inode_lock(inode); - exclusive_lock = true; - } - } + bool exclusive; + fuse_dio_lock(iocb, from, &exclusive); res = generic_write_checks(iocb, from); if (res > 0) { + task_io_account_write(res); if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) { res = fuse_direct_IO(iocb, from); } else { @@ -1610,10 +1704,7 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) fuse_write_update_attr(inode, iocb->ki_pos, res); } } - if (exclusive_lock) - inode_unlock(inode); - else - inode_unlock_shared(inode); + fuse_dio_unlock(iocb, exclusive); return res; } @@ -1630,10 +1721,13 @@ static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to) if (FUSE_IS_DAX(inode)) return fuse_dax_read_iter(iocb, to); - if (!(ff->open_flags & FOPEN_DIRECT_IO)) - return fuse_cache_read_iter(iocb, to); - else + /* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */ + if (ff->open_flags & FOPEN_DIRECT_IO) return fuse_direct_read_iter(iocb, to); + else if (fuse_file_passthrough(ff)) + return fuse_passthrough_read_iter(iocb, to); + else + return fuse_cache_read_iter(iocb, to); } static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) @@ -1648,10 +1742,38 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (FUSE_IS_DAX(inode)) return fuse_dax_write_iter(iocb, from); - if (!(ff->open_flags & FOPEN_DIRECT_IO)) + /* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */ + if (ff->open_flags & FOPEN_DIRECT_IO) + return fuse_direct_write_iter(iocb, from); + else if (fuse_file_passthrough(ff)) + return fuse_passthrough_write_iter(iocb, from); + else return fuse_cache_write_iter(iocb, from); +} + +static ssize_t fuse_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + struct fuse_file *ff = in->private_data; + + /* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */ + if (fuse_file_passthrough(ff) && !(ff->open_flags & FOPEN_DIRECT_IO)) + return fuse_passthrough_splice_read(in, ppos, pipe, len, flags); else - return fuse_direct_write_iter(iocb, from); + return filemap_splice_read(in, ppos, pipe, len, flags); +} + +static ssize_t fuse_splice_write(struct pipe_inode_info *pipe, struct file *out, + loff_t *ppos, size_t len, unsigned int flags) +{ + struct fuse_file *ff = out->private_data; + + /* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */ + if (fuse_file_passthrough(ff) && !(ff->open_flags & FOPEN_DIRECT_IO)) + return fuse_passthrough_splice_write(pipe, out, ppos, len, flags); + else + return iter_file_splice_write(pipe, out, ppos, len, flags); } static void fuse_writepage_free(struct fuse_writepage_args *wpa) @@ -1665,27 +1787,31 @@ static void fuse_writepage_free(struct fuse_writepage_args *wpa) for (i = 0; i < ap->num_pages; i++) __free_page(ap->pages[i]); - if (wpa->ia.ff) - fuse_file_put(wpa->ia.ff, false, false); + fuse_file_put(wpa->ia.ff, false); kfree(ap->pages); kfree(wpa); } -static void fuse_writepage_finish(struct fuse_mount *fm, - struct fuse_writepage_args *wpa) +static void fuse_writepage_finish_stat(struct inode *inode, struct page *page) +{ + struct backing_dev_info *bdi = inode_to_bdi(inode); + + dec_wb_stat(&bdi->wb, WB_WRITEBACK); + dec_node_page_state(page, NR_WRITEBACK_TEMP); + wb_writeout_inc(&bdi->wb); +} + +static void fuse_writepage_finish(struct fuse_writepage_args *wpa) { struct fuse_args_pages *ap = &wpa->ia.ap; struct inode *inode = wpa->inode; struct fuse_inode *fi = get_fuse_inode(inode); - struct backing_dev_info *bdi = inode_to_bdi(inode); int i; - for (i = 0; i < ap->num_pages; i++) { - dec_wb_stat(&bdi->wb, WB_WRITEBACK); - dec_node_page_state(ap->pages[i], NR_WRITEBACK_TEMP); - wb_writeout_inc(&bdi->wb); - } + for (i = 0; i < ap->num_pages; i++) + fuse_writepage_finish_stat(inode, ap->pages[i]); + wake_up(&fi->page_waitq); } @@ -1732,19 +1858,14 @@ __acquires(fi->lock) out_free: fi->writectr--; rb_erase(&wpa->writepages_entry, &fi->writepages); - fuse_writepage_finish(fm, wpa); + fuse_writepage_finish(wpa); spin_unlock(&fi->lock); /* After rb_erase() aux request list is private */ for (aux = wpa->next; aux; aux = next) { - struct backing_dev_info *bdi = inode_to_bdi(aux->inode); - next = aux->next; aux->next = NULL; - - dec_wb_stat(&bdi->wb, WB_WRITEBACK); - dec_node_page_state(aux->ia.ap.pages[0], NR_WRITEBACK_TEMP); - wb_writeout_inc(&bdi->wb); + fuse_writepage_finish_stat(aux->inode, aux->ia.ap.pages[0]); fuse_writepage_free(aux); } @@ -1839,7 +1960,6 @@ static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args, wpa->next = next->next; next->next = NULL; - next->ia.ff = fuse_file_get(wpa->ia.ff); tree_insert(&fi->writepages, next); /* @@ -1868,7 +1988,7 @@ static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args, fuse_send_writepage(fm, next, inarg->offset + inarg->size); } fi->writectr--; - fuse_writepage_finish(fm, wpa); + fuse_writepage_finish(wpa); spin_unlock(&fi->lock); fuse_writepage_free(wpa); } @@ -1914,7 +2034,7 @@ int fuse_write_inode(struct inode *inode, struct writeback_control *wbc) ff = __fuse_write_file_get(fi); err = fuse_flush_times(inode, ff); if (ff) - fuse_file_put(ff, false, false); + fuse_file_put(ff, false); return err; } @@ -1952,49 +2072,77 @@ static void fuse_writepage_add_to_bucket(struct fuse_conn *fc, rcu_read_unlock(); } -static int fuse_writepage_locked(struct page *page) +static void fuse_writepage_args_page_fill(struct fuse_writepage_args *wpa, struct folio *folio, + struct folio *tmp_folio, uint32_t page_index) { - struct address_space *mapping = page->mapping; - struct inode *inode = mapping->host; + struct inode *inode = folio->mapping->host; + struct fuse_args_pages *ap = &wpa->ia.ap; + + folio_copy(tmp_folio, folio); + + ap->pages[page_index] = &tmp_folio->page; + ap->descs[page_index].offset = 0; + ap->descs[page_index].length = PAGE_SIZE; + + inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); + inc_node_page_state(&tmp_folio->page, NR_WRITEBACK_TEMP); +} + +static struct fuse_writepage_args *fuse_writepage_args_setup(struct folio *folio, + struct fuse_file *ff) +{ + struct inode *inode = folio->mapping->host; struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_writepage_args *wpa; struct fuse_args_pages *ap; - struct page *tmp_page; - int error = -ENOMEM; - - set_page_writeback(page); wpa = fuse_writepage_args_alloc(); if (!wpa) - goto err; + return NULL; + + fuse_writepage_add_to_bucket(fc, wpa); + fuse_write_args_fill(&wpa->ia, ff, folio_pos(folio), 0); + wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE; + wpa->inode = inode; + wpa->ia.ff = ff; + ap = &wpa->ia.ap; + ap->args.in_pages = true; + ap->args.end = fuse_writepage_end; - tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); - if (!tmp_page) - goto err_free; + return wpa; +} + +static int fuse_writepage_locked(struct folio *folio) +{ + struct address_space *mapping = folio->mapping; + struct inode *inode = mapping->host; + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_writepage_args *wpa; + struct fuse_args_pages *ap; + struct folio *tmp_folio; + struct fuse_file *ff; + int error = -ENOMEM; + + tmp_folio = folio_alloc(GFP_NOFS | __GFP_HIGHMEM, 0); + if (!tmp_folio) + goto err; error = -EIO; - wpa->ia.ff = fuse_write_file_get(fi); - if (!wpa->ia.ff) + ff = fuse_write_file_get(fi); + if (!ff) goto err_nofile; - fuse_writepage_add_to_bucket(fc, wpa); - fuse_write_args_fill(&wpa->ia, wpa->ia.ff, page_offset(page), 0); + wpa = fuse_writepage_args_setup(folio, ff); + error = -ENOMEM; + if (!wpa) + goto err_writepage_args; - copy_highpage(tmp_page, page); - wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE; - wpa->next = NULL; - ap->args.in_pages = true; + ap = &wpa->ia.ap; ap->num_pages = 1; - ap->pages[0] = tmp_page; - ap->descs[0].offset = 0; - ap->descs[0].length = PAGE_SIZE; - ap->args.end = fuse_writepage_end; - wpa->inode = inode; - inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); - inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP); + folio_start_writeback(folio); + fuse_writepage_args_page_fill(wpa, folio, tmp_folio, 0); spin_lock(&fi->lock); tree_insert(&fi->writepages, wpa); @@ -2002,48 +2150,19 @@ static int fuse_writepage_locked(struct page *page) fuse_flush_writepages(inode); spin_unlock(&fi->lock); - end_page_writeback(page); + folio_end_writeback(folio); return 0; +err_writepage_args: + fuse_file_put(ff, false); err_nofile: - __free_page(tmp_page); -err_free: - kfree(wpa); + folio_put(tmp_folio); err: - mapping_set_error(page->mapping, error); - end_page_writeback(page); + mapping_set_error(folio->mapping, error); return error; } -static int fuse_writepage(struct page *page, struct writeback_control *wbc) -{ - struct fuse_conn *fc = get_fuse_conn(page->mapping->host); - int err; - - if (fuse_page_is_writeback(page->mapping->host, page->index)) { - /* - * ->writepages() should be called for sync() and friends. We - * should only get here on direct reclaim and then we are - * allowed to skip a page which is already in flight - */ - WARN_ON(wbc->sync_mode == WB_SYNC_ALL); - - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return 0; - } - - if (wbc->sync_mode == WB_SYNC_NONE && - fc->num_background >= fc->congestion_threshold) - return AOP_WRITEPAGE_ACTIVATE; - - err = fuse_writepage_locked(page); - unlock_page(page); - - return err; -} - struct fuse_fill_wb_data { struct fuse_writepage_args *wpa; struct fuse_file *ff; @@ -2086,7 +2205,6 @@ static void fuse_writepages_send(struct fuse_fill_wb_data *data) int num_pages = wpa->ia.ap.num_pages; int i; - wpa->ia.ff = fuse_file_get(data->ff); spin_lock(&fi->lock); list_add_tail(&wpa->queue_entry, &fi->queued_writes); fuse_flush_writepages(inode); @@ -2141,11 +2259,7 @@ static bool fuse_writepage_add(struct fuse_writepage_args *new_wpa, spin_unlock(&fi->lock); if (tmp) { - struct backing_dev_info *bdi = inode_to_bdi(new_wpa->inode); - - dec_wb_stat(&bdi->wb, WB_WRITEBACK); - dec_node_page_state(new_ap->pages[0], NR_WRITEBACK_TEMP); - wb_writeout_inc(&bdi->wb); + fuse_writepage_finish_stat(new_wpa->inode, new_ap->pages[0]); fuse_writepage_free(new_wpa); } @@ -2195,7 +2309,7 @@ static int fuse_writepages_fill(struct folio *folio, struct inode *inode = data->inode; struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_conn *fc = get_fuse_conn(inode); - struct page *tmp_page; + struct folio *tmp_folio; int err; if (!data->ff) { @@ -2211,8 +2325,8 @@ static int fuse_writepages_fill(struct folio *folio, } err = -ENOMEM; - tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); - if (!tmp_page) + tmp_folio = folio_alloc(GFP_NOFS | __GFP_HIGHMEM, 0); + if (!tmp_folio) goto out_unlock; /* @@ -2230,35 +2344,20 @@ static int fuse_writepages_fill(struct folio *folio, */ if (data->wpa == NULL) { err = -ENOMEM; - wpa = fuse_writepage_args_alloc(); + wpa = fuse_writepage_args_setup(folio, data->ff); if (!wpa) { - __free_page(tmp_page); + folio_put(tmp_folio); goto out_unlock; } - fuse_writepage_add_to_bucket(fc, wpa); - + fuse_file_get(wpa->ia.ff); data->max_pages = 1; - ap = &wpa->ia.ap; - fuse_write_args_fill(&wpa->ia, data->ff, folio_pos(folio), 0); - wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE; - wpa->next = NULL; - ap->args.in_pages = true; - ap->args.end = fuse_writepage_end; - ap->num_pages = 0; - wpa->inode = inode; } folio_start_writeback(folio); - copy_highpage(tmp_page, &folio->page); - ap->pages[ap->num_pages] = tmp_page; - ap->descs[ap->num_pages].offset = 0; - ap->descs[ap->num_pages].length = PAGE_SIZE; + fuse_writepage_args_page_fill(wpa, folio, tmp_folio, ap->num_pages); data->orig_pages[ap->num_pages] = &folio->page; - inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); - inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP); - err = 0; if (data->wpa) { /* @@ -2312,7 +2411,7 @@ static int fuse_writepages(struct address_space *mapping, fuse_writepages_send(&data); } if (data.ff) - fuse_file_put(data.ff, false, false); + fuse_file_put(data.ff, false); kfree(data.orig_pages); out: @@ -2324,76 +2423,77 @@ out: * but how to implement it without killing performance need more thinking. */ static int fuse_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, struct page **pagep, void **fsdata) + loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { pgoff_t index = pos >> PAGE_SHIFT; struct fuse_conn *fc = get_fuse_conn(file_inode(file)); - struct page *page; + struct folio *folio; loff_t fsize; int err = -ENOMEM; WARN_ON(!fc->writeback_cache); - page = grab_cache_page_write_begin(mapping, index); - if (!page) + folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, + mapping_gfp_mask(mapping)); + if (IS_ERR(folio)) goto error; - fuse_wait_on_page_writeback(mapping->host, page->index); + fuse_wait_on_page_writeback(mapping->host, folio->index); - if (PageUptodate(page) || len == PAGE_SIZE) + if (folio_test_uptodate(folio) || len >= folio_size(folio)) goto success; /* - * Check if the start this page comes after the end of file, in which - * case the readpage can be optimized away. + * Check if the start of this folio comes after the end of file, + * in which case the readpage can be optimized away. */ fsize = i_size_read(mapping->host); - if (fsize <= (pos & PAGE_MASK)) { - size_t off = pos & ~PAGE_MASK; + if (fsize <= folio_pos(folio)) { + size_t off = offset_in_folio(folio, pos); if (off) - zero_user_segment(page, 0, off); + folio_zero_segment(folio, 0, off); goto success; } - err = fuse_do_readpage(file, page); + err = fuse_do_readpage(file, &folio->page); if (err) goto cleanup; success: - *pagep = page; + *foliop = folio; return 0; cleanup: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); error: return err; } static int fuse_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct folio *folio, void *fsdata) { - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; /* Haven't copied anything? Skip zeroing, size extending, dirtying. */ if (!copied) goto unlock; pos += copied; - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { /* Zero any unwritten bytes at the end of the page */ size_t endoff = pos & ~PAGE_MASK; if (endoff) - zero_user_segment(page, endoff, PAGE_SIZE); - SetPageUptodate(page); + folio_zero_segment(folio, endoff, PAGE_SIZE); + folio_mark_uptodate(folio); } if (pos > inode->i_size) i_size_write(inode, pos); - set_page_dirty(page); + folio_mark_dirty(folio); unlock: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return copied; } @@ -2406,7 +2506,7 @@ static int fuse_launder_folio(struct folio *folio) /* Serialize with pending writeback for the same page */ fuse_wait_on_page_writeback(inode, folio->index); - err = fuse_writepage_locked(&folio->page); + err = fuse_writepage_locked(folio); if (!err) fuse_wait_on_page_writeback(inode, folio->index); } @@ -2467,11 +2567,27 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) { struct fuse_file *ff = file->private_data; struct fuse_conn *fc = ff->fm->fc; + struct inode *inode = file_inode(file); + int rc; /* DAX mmap is superior to direct_io mmap */ - if (FUSE_IS_DAX(file_inode(file))) + if (FUSE_IS_DAX(inode)) return fuse_dax_mmap(file, vma); + /* + * If inode is in passthrough io mode, because it has some file open + * in passthrough mode, either mmap to backing file or fail mmap, + * because mixing cached mmap and passthrough io mode is not allowed. + */ + if (fuse_file_passthrough(ff)) + return fuse_passthrough_mmap(file, vma); + else if (fuse_inode_backing(get_fuse_inode(inode))) + return -ENODEV; + + /* + * FOPEN_DIRECT_IO handling is special compared to O_DIRECT, + * as does not allow MAP_SHARED mmap without FUSE_DIRECT_IO_ALLOW_MMAP. + */ if (ff->open_flags & FOPEN_DIRECT_IO) { /* * Can't provide the coherency needed for MAP_SHARED @@ -2486,6 +2602,17 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) /* MAP_PRIVATE */ return generic_file_mmap(file, vma); } + + /* + * First mmap of direct_io file enters caching inode io mode. + * Also waits for parallel dio writers to go into serial mode + * (exclusive instead of shared lock). + * After first mmap, the inode stays in caching io mode until + * the direct_io file release. + */ + rc = fuse_file_cached_io_open(inode, ff); + if (rc) + return rc; } if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) @@ -2518,14 +2645,14 @@ static int convert_fuse_file_lock(struct fuse_conn *fc, * translate it into the caller's pid namespace. */ rcu_read_lock(); - fl->fl_pid = pid_nr_ns(find_pid_ns(ffl->pid, fc->pid_ns), &init_pid_ns); + fl->c.flc_pid = pid_nr_ns(find_pid_ns(ffl->pid, fc->pid_ns), &init_pid_ns); rcu_read_unlock(); break; default: return -EIO; } - fl->fl_type = ffl->type; + fl->c.flc_type = ffl->type; return 0; } @@ -2539,10 +2666,10 @@ static void fuse_lk_fill(struct fuse_args *args, struct file *file, memset(inarg, 0, sizeof(*inarg)); inarg->fh = ff->fh; - inarg->owner = fuse_lock_owner_id(fc, fl->fl_owner); + inarg->owner = fuse_lock_owner_id(fc, fl->c.flc_owner); inarg->lk.start = fl->fl_start; inarg->lk.end = fl->fl_end; - inarg->lk.type = fl->fl_type; + inarg->lk.type = fl->c.flc_type; inarg->lk.pid = pid; if (flock) inarg->lk_flags |= FUSE_LK_FLOCK; @@ -2579,8 +2706,8 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock) struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); struct fuse_lk_in inarg; - int opcode = (fl->fl_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK; - struct pid *pid = fl->fl_type != F_UNLCK ? task_tgid(current) : NULL; + int opcode = (fl->c.flc_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK; + struct pid *pid = fl->c.flc_type != F_UNLCK ? task_tgid(current) : NULL; pid_t pid_nr = pid_nr_ns(pid, fm->fc->pid_ns); int err; @@ -2589,10 +2716,6 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock) return -ENOLCK; } - /* Unlock on close is handled by the flush method */ - if ((fl->fl_flags & FL_CLOSE_POSIX) == FL_CLOSE_POSIX) - return 0; - fuse_lk_fill(&args, file, fl, opcode, pid_nr, flock, &inarg); err = fuse_simple_request(fm, &args); @@ -2880,7 +3003,7 @@ static void fuse_do_truncate(struct file *file) attr.ia_file = file; attr.ia_valid |= ATTR_FILE; - fuse_do_setattr(file_dentry(file), &attr, file); + fuse_do_setattr(file_mnt_idmap(file), file_dentry(file), &attr, file); } static inline loff_t fuse_round_up(struct fuse_conn *fc, loff_t off) @@ -3023,7 +3146,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, inode_lock(inode); if (block_faults) { filemap_invalidate_lock(inode->i_mapping); - err = fuse_dax_break_layouts(inode, 0, 0); + err = fuse_dax_break_layouts(inode, 0, -1); if (err) goto out; } @@ -3205,8 +3328,8 @@ static ssize_t fuse_copy_file_range(struct file *src_file, loff_t src_off, len, flags); if (ret == -EOPNOTSUPP || ret == -EXDEV) - ret = generic_copy_file_range(src_file, src_off, dst_file, - dst_off, len, flags); + ret = splice_copy_file_range(src_file, src_off, dst_file, + dst_off, len); return ret; } @@ -3222,8 +3345,8 @@ static const struct file_operations fuse_file_operations = { .lock = fuse_file_lock, .get_unmapped_area = thp_get_unmapped_area, .flock = fuse_file_flock, - .splice_read = filemap_splice_read, - .splice_write = iter_file_splice_write, + .splice_read = fuse_splice_read, + .splice_write = fuse_splice_write, .unlocked_ioctl = fuse_file_ioctl, .compat_ioctl = fuse_file_compat_ioctl, .poll = fuse_file_poll, @@ -3234,10 +3357,10 @@ static const struct file_operations fuse_file_operations = { static const struct address_space_operations fuse_file_aops = { .read_folio = fuse_read_folio, .readahead = fuse_readahead, - .writepage = fuse_writepage, .writepages = fuse_writepages, .launder_folio = fuse_launder_folio, .dirty_folio = filemap_dirty_folio, + .migrate_folio = filemap_migrate_folio, .bmap = fuse_bmap, .direct_IO = fuse_direct_IO, .write_begin = fuse_write_begin, @@ -3254,7 +3377,9 @@ void fuse_init_file_inode(struct inode *inode, unsigned int flags) INIT_LIST_HEAD(&fi->write_files); INIT_LIST_HEAD(&fi->queued_writes); fi->writectr = 0; + fi->iocachectr = 0; init_waitqueue_head(&fi->page_waitq); + init_waitqueue_head(&fi->direct_io_waitq); fi->writepages = RB_ROOT; if (IS_ENABLED(CONFIG_FUSE_DAX)) diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 4ce1a6fdc94f..28cf319c1c25 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -76,6 +76,16 @@ struct fuse_submount_lookup { struct fuse_forget_link *forget; }; +/** Container for data related to mapping to backing file */ +struct fuse_backing { + struct file *file; + struct cred *cred; + + /** refcount */ + refcount_t count; + struct rcu_head rcu; +}; + /** FUSE inode */ struct fuse_inode { /** Inode data */ @@ -111,7 +121,7 @@ struct fuse_inode { u64 attr_version; union { - /* Write related fields (regular file only) */ + /* read/write io cache (regular file only) */ struct { /* Files usable in writepage. Protected by fi->lock */ struct list_head write_files; @@ -123,9 +133,15 @@ struct fuse_inode { * (FUSE_NOWRITE) means more writes are blocked */ int writectr; + /** Number of files/maps using page cache */ + int iocachectr; + /* Waitq for writepage completion */ wait_queue_head_t page_waitq; + /* waitq for direct-io completion */ + wait_queue_head_t direct_io_waitq; + /* List of writepage requestst (pending or sent) */ struct rb_root writepages; }; @@ -173,6 +189,10 @@ struct fuse_inode { #endif /** Submount specific lookup tracking */ struct fuse_submount_lookup *submount_lookup; +#ifdef CONFIG_FUSE_PASSTHROUGH + /** Reference to backing file in passthrough mode */ + struct fuse_backing *fb; +#endif }; /** FUSE inode state bits */ @@ -187,19 +207,21 @@ enum { FUSE_I_BAD, /* Has btime */ FUSE_I_BTIME, + /* Wants or already has page cache IO */ + FUSE_I_CACHE_IO_MODE, }; struct fuse_conn; struct fuse_mount; -struct fuse_release_args; +union fuse_file_args; /** FUSE specific file data */ struct fuse_file { /** Fuse connection for this file */ struct fuse_mount *fm; - /* Argument space reserved for release */ - struct fuse_release_args *release_args; + /* Argument space reserved for open/release */ + union fuse_file_args *args; /** Kernel file handle guaranteed to be unique */ u64 kh; @@ -221,12 +243,6 @@ struct fuse_file { /* Readdir related */ struct { - /* - * Protects below fields against (crazy) parallel readdir on - * same open file. Uncontended in the normal case. - */ - struct mutex lock; - /* Dir stream position */ loff_t pos; @@ -244,6 +260,15 @@ struct fuse_file { /** Wait queue head for poll */ wait_queue_head_t poll_wait; + /** Does file hold a fi->iocachectr refcount? */ + enum { IOM_NONE, IOM_CACHED, IOM_UNCACHED } iomode; + +#ifdef CONFIG_FUSE_PASSTHROUGH + /** Reference to backing file in passthrough mode */ + struct file *passthrough; + const struct cred *cred; +#endif + /** Has flock been performed on this file? */ bool flock:1; }; @@ -283,9 +308,13 @@ struct fuse_args { bool page_replace:1; bool may_block:1; bool is_ext:1; + bool is_pinned:1; + bool invalidate_vmap:1; struct fuse_in_arg in_args[3]; struct fuse_arg out_args[2]; void (*end)(struct fuse_mount *fm, struct fuse_args *args, int error); + /* Used for kvec iter backed by vmalloc address */ + void *vmap_base; }; struct fuse_args_pages { @@ -295,6 +324,19 @@ struct fuse_args_pages { unsigned int num_pages; }; +struct fuse_release_args { + struct fuse_args args; + struct fuse_release_in inarg; + struct inode *inode; +}; + +union fuse_file_args { + /* Used during open() */ + struct fuse_open_out open_outarg; + /* Used during release() */ + struct fuse_release_args release_args; +}; + #define FUSE_ARGS(args) struct fuse_args args = {} /** The request IO state (for asynchronous processing) */ @@ -410,22 +452,19 @@ struct fuse_iqueue; */ struct fuse_iqueue_ops { /** - * Signal that a forget has been queued + * Send one forget */ - void (*wake_forget_and_unlock)(struct fuse_iqueue *fiq) - __releases(fiq->lock); + void (*send_forget)(struct fuse_iqueue *fiq, struct fuse_forget_link *link); /** - * Signal that an INTERRUPT request has been queued + * Send interrupt for request */ - void (*wake_interrupt_and_unlock)(struct fuse_iqueue *fiq) - __releases(fiq->lock); + void (*send_interrupt)(struct fuse_iqueue *fiq, struct fuse_req *req); /** - * Signal that a request has been queued + * Send one request */ - void (*wake_pending_and_unlock)(struct fuse_iqueue *fiq) - __releases(fiq->lock); + void (*send_req)(struct fuse_iqueue *fiq, struct fuse_req *req); /** * Clean up when fuse_iqueue is destroyed @@ -818,13 +857,22 @@ struct fuse_conn { /* Is statx not implemented by fs? */ unsigned int no_statx:1; + /** Passthrough support for read/write IO */ + unsigned int passthrough:1; + + /* Use pages instead of pointer for kernel I/O */ + unsigned int use_pages_for_kvec_io:1; + + /** Maximum stack depth for passthrough backing files */ + int max_stack_depth; + /** The number of requests waiting for completion */ atomic_t num_waiting; /** Negotiated minor version */ unsigned minor; - /** Entry on the fuse_mount_list */ + /** Entry on the fuse_conn_list */ struct list_head entry; /** Device ID from the root super block */ @@ -867,6 +915,11 @@ struct fuse_conn { /* New writepages go into this bucket */ struct fuse_sync_bucket __rcu *curr_bucket; + +#ifdef CONFIG_FUSE_PASSTHROUGH + /** IDR for backing files ids */ + struct idr backing_files_map; +#endif }; /* @@ -1003,10 +1056,6 @@ void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, struct fuse_forget_link *fuse_alloc_forget(void); -struct fuse_forget_link *fuse_dequeue_forget(struct fuse_iqueue *fiq, - unsigned int max, - unsigned int *countp); - /* * Initialize READ or READDIR request */ @@ -1031,14 +1080,9 @@ void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos, size_t count, int opcode); -/** - * Send OPEN or OPENDIR request - */ -int fuse_open_common(struct inode *inode, struct file *file, bool isdir); - -struct fuse_file *fuse_file_alloc(struct fuse_mount *fm); +struct fuse_file *fuse_file_alloc(struct fuse_mount *fm, bool release); void fuse_file_free(struct fuse_file *ff); -void fuse_finish_open(struct inode *inode, struct file *file); +int fuse_finish_open(struct inode *inode, struct file *file); void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff, unsigned int flags); @@ -1109,7 +1153,22 @@ void __exit fuse_ctl_cleanup(void); /** * Simple request sending that does request allocation and freeing */ -ssize_t fuse_simple_request(struct fuse_mount *fm, struct fuse_args *args); +ssize_t __fuse_simple_request(struct mnt_idmap *idmap, + struct fuse_mount *fm, + struct fuse_args *args); + +static inline ssize_t fuse_simple_request(struct fuse_mount *fm, struct fuse_args *args) +{ + return __fuse_simple_request(&invalid_mnt_idmap, fm, args); +} + +static inline ssize_t fuse_simple_idmap_request(struct mnt_idmap *idmap, + struct fuse_mount *fm, + struct fuse_args *args) +{ + return __fuse_simple_request(idmap, fm, args); +} + int fuse_simple_background(struct fuse_mount *fm, struct fuse_args *args, gfp_t gfp_flags); @@ -1285,8 +1344,8 @@ bool fuse_write_update_attr(struct inode *inode, loff_t pos, ssize_t written); int fuse_flush_times(struct inode *inode, struct fuse_file *ff); int fuse_write_inode(struct inode *inode, struct writeback_control *wbc); -int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, - struct file *file); +int fuse_do_setattr(struct mnt_idmap *idmap, struct dentry *dentry, + struct iattr *attr, struct file *file); void fuse_set_initialized(struct fuse_conn *fc); @@ -1299,7 +1358,7 @@ ssize_t fuse_getxattr(struct inode *inode, const char *name, void *value, size_t size); ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size); int fuse_removexattr(struct inode *inode, const char *name); -extern const struct xattr_handler *fuse_xattr_handlers[]; +extern const struct xattr_handler * const fuse_xattr_handlers[]; struct posix_acl; struct posix_acl *fuse_get_inode_acl(struct inode *inode, int type, bool rcu); @@ -1348,11 +1407,83 @@ int fuse_fileattr_get(struct dentry *dentry, struct fileattr *fa); int fuse_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); -/* file.c */ +/* iomode.c */ +int fuse_file_cached_io_open(struct inode *inode, struct fuse_file *ff); +int fuse_inode_uncached_io_start(struct fuse_inode *fi, + struct fuse_backing *fb); +void fuse_inode_uncached_io_end(struct fuse_inode *fi); + +int fuse_file_io_open(struct file *file, struct inode *inode); +void fuse_file_io_release(struct fuse_file *ff, struct inode *inode); +/* file.c */ struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid, unsigned int open_flags, bool isdir); void fuse_file_release(struct inode *inode, struct fuse_file *ff, unsigned int open_flags, fl_owner_t id, bool isdir); +/* passthrough.c */ +static inline struct fuse_backing *fuse_inode_backing(struct fuse_inode *fi) +{ +#ifdef CONFIG_FUSE_PASSTHROUGH + return READ_ONCE(fi->fb); +#else + return NULL; +#endif +} + +static inline struct fuse_backing *fuse_inode_backing_set(struct fuse_inode *fi, + struct fuse_backing *fb) +{ +#ifdef CONFIG_FUSE_PASSTHROUGH + return xchg(&fi->fb, fb); +#else + return NULL; +#endif +} + +#ifdef CONFIG_FUSE_PASSTHROUGH +struct fuse_backing *fuse_backing_get(struct fuse_backing *fb); +void fuse_backing_put(struct fuse_backing *fb); +#else + +static inline struct fuse_backing *fuse_backing_get(struct fuse_backing *fb) +{ + return NULL; +} + +static inline void fuse_backing_put(struct fuse_backing *fb) +{ +} +#endif + +void fuse_backing_files_init(struct fuse_conn *fc); +void fuse_backing_files_free(struct fuse_conn *fc); +int fuse_backing_open(struct fuse_conn *fc, struct fuse_backing_map *map); +int fuse_backing_close(struct fuse_conn *fc, int backing_id); + +struct fuse_backing *fuse_passthrough_open(struct file *file, + struct inode *inode, + int backing_id); +void fuse_passthrough_release(struct fuse_file *ff, struct fuse_backing *fb); + +static inline struct file *fuse_file_passthrough(struct fuse_file *ff) +{ +#ifdef CONFIG_FUSE_PASSTHROUGH + return ff->passthrough; +#else + return NULL; +#endif +} + +ssize_t fuse_passthrough_read_iter(struct kiocb *iocb, struct iov_iter *iter); +ssize_t fuse_passthrough_write_iter(struct kiocb *iocb, struct iov_iter *iter); +ssize_t fuse_passthrough_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, unsigned int flags); +ssize_t fuse_passthrough_splice_write(struct pipe_inode_info *pipe, + struct file *out, loff_t *ppos, + size_t len, unsigned int flags); +ssize_t fuse_passthrough_mmap(struct file *file, struct vm_area_struct *vma); + #endif /* _FS_FUSE_I_H */ diff --git a/fs/fuse/fuse_trace.h b/fs/fuse/fuse_trace.h new file mode 100644 index 000000000000..bbe9ddd8c716 --- /dev/null +++ b/fs/fuse/fuse_trace.h @@ -0,0 +1,132 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM fuse + +#if !defined(_TRACE_FUSE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_FUSE_H + +#include <linux/tracepoint.h> + +#define OPCODES \ + EM( FUSE_LOOKUP, "FUSE_LOOKUP") \ + EM( FUSE_FORGET, "FUSE_FORGET") \ + EM( FUSE_GETATTR, "FUSE_GETATTR") \ + EM( FUSE_SETATTR, "FUSE_SETATTR") \ + EM( FUSE_READLINK, "FUSE_READLINK") \ + EM( FUSE_SYMLINK, "FUSE_SYMLINK") \ + EM( FUSE_MKNOD, "FUSE_MKNOD") \ + EM( FUSE_MKDIR, "FUSE_MKDIR") \ + EM( FUSE_UNLINK, "FUSE_UNLINK") \ + EM( FUSE_RMDIR, "FUSE_RMDIR") \ + EM( FUSE_RENAME, "FUSE_RENAME") \ + EM( FUSE_LINK, "FUSE_LINK") \ + EM( FUSE_OPEN, "FUSE_OPEN") \ + EM( FUSE_READ, "FUSE_READ") \ + EM( FUSE_WRITE, "FUSE_WRITE") \ + EM( FUSE_STATFS, "FUSE_STATFS") \ + EM( FUSE_RELEASE, "FUSE_RELEASE") \ + EM( FUSE_FSYNC, "FUSE_FSYNC") \ + EM( FUSE_SETXATTR, "FUSE_SETXATTR") \ + EM( FUSE_GETXATTR, "FUSE_GETXATTR") \ + EM( FUSE_LISTXATTR, "FUSE_LISTXATTR") \ + EM( FUSE_REMOVEXATTR, "FUSE_REMOVEXATTR") \ + EM( FUSE_FLUSH, "FUSE_FLUSH") \ + EM( FUSE_INIT, "FUSE_INIT") \ + EM( FUSE_OPENDIR, "FUSE_OPENDIR") \ + EM( FUSE_READDIR, "FUSE_READDIR") \ + EM( FUSE_RELEASEDIR, "FUSE_RELEASEDIR") \ + EM( FUSE_FSYNCDIR, "FUSE_FSYNCDIR") \ + EM( FUSE_GETLK, "FUSE_GETLK") \ + EM( FUSE_SETLK, "FUSE_SETLK") \ + EM( FUSE_SETLKW, "FUSE_SETLKW") \ + EM( FUSE_ACCESS, "FUSE_ACCESS") \ + EM( FUSE_CREATE, "FUSE_CREATE") \ + EM( FUSE_INTERRUPT, "FUSE_INTERRUPT") \ + EM( FUSE_BMAP, "FUSE_BMAP") \ + EM( FUSE_DESTROY, "FUSE_DESTROY") \ + EM( FUSE_IOCTL, "FUSE_IOCTL") \ + EM( FUSE_POLL, "FUSE_POLL") \ + EM( FUSE_NOTIFY_REPLY, "FUSE_NOTIFY_REPLY") \ + EM( FUSE_BATCH_FORGET, "FUSE_BATCH_FORGET") \ + EM( FUSE_FALLOCATE, "FUSE_FALLOCATE") \ + EM( FUSE_READDIRPLUS, "FUSE_READDIRPLUS") \ + EM( FUSE_RENAME2, "FUSE_RENAME2") \ + EM( FUSE_LSEEK, "FUSE_LSEEK") \ + EM( FUSE_COPY_FILE_RANGE, "FUSE_COPY_FILE_RANGE") \ + EM( FUSE_SETUPMAPPING, "FUSE_SETUPMAPPING") \ + EM( FUSE_REMOVEMAPPING, "FUSE_REMOVEMAPPING") \ + EM( FUSE_SYNCFS, "FUSE_SYNCFS") \ + EM( FUSE_TMPFILE, "FUSE_TMPFILE") \ + EM( FUSE_STATX, "FUSE_STATX") \ + EMe(CUSE_INIT, "CUSE_INIT") + +/* + * This will turn the above table into TRACE_DEFINE_ENUM() for each of the + * entries. + */ +#undef EM +#undef EMe +#define EM(a, b) TRACE_DEFINE_ENUM(a); +#define EMe(a, b) TRACE_DEFINE_ENUM(a); + +OPCODES + +/* Now we redfine it with the table that __print_symbolic needs. */ +#undef EM +#undef EMe +#define EM(a, b) {a, b}, +#define EMe(a, b) {a, b} + +TRACE_EVENT(fuse_request_send, + TP_PROTO(const struct fuse_req *req), + + TP_ARGS(req), + + TP_STRUCT__entry( + __field(dev_t, connection) + __field(uint64_t, unique) + __field(enum fuse_opcode, opcode) + __field(uint32_t, len) + ), + + TP_fast_assign( + __entry->connection = req->fm->fc->dev; + __entry->unique = req->in.h.unique; + __entry->opcode = req->in.h.opcode; + __entry->len = req->in.h.len; + ), + + TP_printk("connection %u req %llu opcode %u (%s) len %u ", + __entry->connection, __entry->unique, __entry->opcode, + __print_symbolic(__entry->opcode, OPCODES), __entry->len) +); + +TRACE_EVENT(fuse_request_end, + TP_PROTO(const struct fuse_req *req), + + TP_ARGS(req), + + TP_STRUCT__entry( + __field(dev_t, connection) + __field(uint64_t, unique) + __field(uint32_t, len) + __field(int32_t, error) + ), + + TP_fast_assign( + __entry->connection = req->fm->fc->dev; + __entry->unique = req->in.h.unique; + __entry->len = req->out.h.len; + __entry->error = req->out.h.error; + ), + + TP_printk("connection %u req %llu len %u error %d", __entry->connection, + __entry->unique, __entry->len, __entry->error) +); + +#endif /* _TRACE_FUSE_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE fuse_trace +#include <trace/define_trace.h> diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 735abf426a06..fd3321e29a3e 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -111,6 +111,9 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) if (IS_ENABLED(CONFIG_FUSE_DAX) && !fuse_dax_inode_alloc(sb, fi)) goto out_free_forget; + if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) + fuse_inode_backing_set(fi, NULL); + return &fi->inode; out_free_forget: @@ -129,6 +132,9 @@ static void fuse_free_inode(struct inode *inode) #ifdef CONFIG_FUSE_DAX kfree(fi->dax); #endif + if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) + fuse_backing_put(fuse_inode_backing(fi)); + kmem_cache_free(fuse_inode_cachep, fi); } @@ -169,6 +175,7 @@ static void fuse_evict_inode(struct inode *inode) } } if (S_ISREG(inode->i_mode) && !fuse_is_bad(inode)) { + WARN_ON(fi->iocachectr != 0); WARN_ON(!list_empty(&fi->write_files)); WARN_ON(!list_empty(&fi->queued_writes)); } @@ -223,12 +230,10 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, attr->mtimensec = min_t(u32, attr->mtimensec, NSEC_PER_SEC - 1); attr->ctimensec = min_t(u32, attr->ctimensec, NSEC_PER_SEC - 1); - inode->i_atime.tv_sec = attr->atime; - inode->i_atime.tv_nsec = attr->atimensec; + inode_set_atime(inode, attr->atime, attr->atimensec); /* mtime from server may be stale due to local buffered write */ if (!(cache_mask & STATX_MTIME)) { - inode->i_mtime.tv_sec = attr->mtime; - inode->i_mtime.tv_nsec = attr->mtimensec; + inode_set_mtime(inode, attr->mtime, attr->mtimensec); } if (!(cache_mask & STATX_CTIME)) { inode_set_ctime(inode, attr->ctime, attr->ctimensec); @@ -311,12 +316,12 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, attr->size = i_size_read(inode); if (cache_mask & STATX_MTIME) { - attr->mtime = inode->i_mtime.tv_sec; - attr->mtimensec = inode->i_mtime.tv_nsec; + attr->mtime = inode_get_mtime_sec(inode); + attr->mtimensec = inode_get_mtime_nsec(inode); } if (cache_mask & STATX_CTIME) { - attr->ctime = inode_get_ctime(inode).tv_sec; - attr->ctimensec = inode_get_ctime(inode).tv_nsec; + attr->ctime = inode_get_ctime_sec(inode); + attr->ctimensec = inode_get_ctime_nsec(inode); } if ((attr_version != 0 && fi->attr_version > attr_version) || @@ -325,7 +330,7 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, return; } - old_mtime = inode->i_mtime; + old_mtime = inode_get_mtime(inode); fuse_change_attributes_common(inode, attr, sx, attr_valid, cache_mask); oldsize = inode->i_size; @@ -379,8 +384,7 @@ static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr, { inode->i_mode = attr->mode & S_IFMT; inode->i_size = attr->size; - inode->i_mtime.tv_sec = attr->mtime; - inode->i_mtime.tv_nsec = attr->mtimensec; + inode_set_mtime(inode, attr->mtime, attr->mtimensec); inode_set_ctime(inode, attr->ctime, attr->ctimensec); if (S_ISREG(inode->i_mode)) { fuse_init_common(inode); @@ -736,8 +740,8 @@ static const struct fs_parameter_spec fuse_fs_parameters[] = { fsparam_string ("source", OPT_SOURCE), fsparam_u32 ("fd", OPT_FD), fsparam_u32oct ("rootmode", OPT_ROOTMODE), - fsparam_u32 ("user_id", OPT_USER_ID), - fsparam_u32 ("group_id", OPT_GROUP_ID), + fsparam_uid ("user_id", OPT_USER_ID), + fsparam_gid ("group_id", OPT_GROUP_ID), fsparam_flag ("default_permissions", OPT_DEFAULT_PERMISSIONS), fsparam_flag ("allow_other", OPT_ALLOW_OTHER), fsparam_u32 ("max_read", OPT_MAX_READ), @@ -797,9 +801,7 @@ static int fuse_parse_param(struct fs_context *fsc, struct fs_parameter *param) break; case OPT_USER_ID: - kuid = make_kuid(fsc->user_ns, result.uint_32); - if (!uid_valid(kuid)) - return invalfc(fsc, "Invalid user_id"); + kuid = result.uid; /* * The requested uid must be representable in the * filesystem's idmapping. @@ -811,9 +813,7 @@ static int fuse_parse_param(struct fs_context *fsc, struct fs_parameter *param) break; case OPT_GROUP_ID: - kgid = make_kgid(fsc->user_ns, result.uint_32);; - if (!gid_valid(kgid)) - return invalfc(fsc, "Invalid group_id"); + kgid = result.gid; /* * The requested gid must be representable in the * filesystem's idmapping. @@ -946,6 +946,9 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, fc->max_pages = FUSE_DEFAULT_MAX_PAGES_PER_REQ; fc->max_pages_limit = FUSE_MAX_MAX_PAGES; + if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) + fuse_backing_files_init(fc); + INIT_LIST_HEAD(&fc->mounts); list_add(&fm->fc_entry, &fc->mounts); fm->fc = fc; @@ -976,6 +979,8 @@ void fuse_conn_put(struct fuse_conn *fc) WARN_ON(atomic_read(&bucket->count) != 1); kfree(bucket); } + if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) + fuse_backing_files_free(fc); call_rcu(&fc->rcu, delayed_release); } } @@ -996,7 +1001,7 @@ static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode) attr.mode = mode; attr.ino = FUSE_ROOT_ID; attr.nlink = 1; - return fuse_iget(sb, 1, 0, &attr, 0, 0); + return fuse_iget(sb, FUSE_ROOT_ID, 0, &attr, 0, 0); } struct fuse_inode_handle { @@ -1080,7 +1085,7 @@ static int fuse_encode_fh(struct inode *inode, u32 *fh, int *max_len, } *max_len = len; - return parent ? 0x82 : 0x81; + return parent ? FILEID_INO64_GEN_PARENT : FILEID_INO64_GEN; } static struct dentry *fuse_fh_to_dentry(struct super_block *sb, @@ -1088,7 +1093,8 @@ static struct dentry *fuse_fh_to_dentry(struct super_block *sb, { struct fuse_inode_handle handle; - if ((fh_type != 0x81 && fh_type != 0x82) || fh_len < 3) + if ((fh_type != FILEID_INO64_GEN && + fh_type != FILEID_INO64_GEN_PARENT) || fh_len < 3) return NULL; handle.nodeid = (u64) fid->raw[0] << 32; @@ -1102,7 +1108,7 @@ static struct dentry *fuse_fh_to_parent(struct super_block *sb, { struct fuse_inode_handle parent; - if (fh_type != 0x82 || fh_len < 6) + if (fh_type != FILEID_INO64_GEN_PARENT || fh_len < 6) return NULL; parent.nodeid = (u64) fid->raw[3] << 32; @@ -1138,6 +1144,11 @@ static struct dentry *fuse_get_parent(struct dentry *child) return parent; } +/* only for fid encoding; no support for file handle */ +static const struct export_operations fuse_export_fid_operations = { + .encode_fh = fuse_encode_fh, +}; + static const struct export_operations fuse_export_operations = { .fh_to_dentry = fuse_fh_to_dentry, .fh_to_parent = fuse_fh_to_parent, @@ -1312,6 +1323,37 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, fc->create_supp_group = 1; if (flags & FUSE_DIRECT_IO_ALLOW_MMAP) fc->direct_io_allow_mmap = 1; + /* + * max_stack_depth is the max stack depth of FUSE fs, + * so it has to be at least 1 to support passthrough + * to backing files. + * + * with max_stack_depth > 1, the backing files can be + * on a stacked fs (e.g. overlayfs) themselves and with + * max_stack_depth == 1, FUSE fs can be stacked as the + * underlying fs of a stacked fs (e.g. overlayfs). + * + * Also don't allow the combination of FUSE_PASSTHROUGH + * and FUSE_WRITEBACK_CACHE, current design doesn't handle + * them together. + */ + if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH) && + (flags & FUSE_PASSTHROUGH) && + arg->max_stack_depth > 0 && + arg->max_stack_depth <= FILESYSTEM_MAX_STACK_DEPTH && + !(flags & FUSE_WRITEBACK_CACHE)) { + fc->passthrough = 1; + fc->max_stack_depth = arg->max_stack_depth; + fm->sb->s_stack_depth = arg->max_stack_depth; + } + if (flags & FUSE_NO_EXPORT_SUPPORT) + fm->sb->s_export_op = &fuse_export_fid_operations; + if (flags & FUSE_ALLOW_IDMAP) { + if (fc->default_permissions) + fm->sb->s_iflags &= ~SB_I_NOIDMAP; + else + ok = false; + } } else { ra_pages = fc->max_read / PAGE_SIZE; fc->no_lock = 1; @@ -1358,7 +1400,8 @@ void fuse_send_init(struct fuse_mount *fm) FUSE_NO_OPENDIR_SUPPORT | FUSE_EXPLICIT_INVAL_DATA | FUSE_HANDLE_KILLPRIV_V2 | FUSE_SETXATTR_EXT | FUSE_INIT_EXT | FUSE_SECURITY_CTX | FUSE_CREATE_SUPP_GROUP | - FUSE_HAS_EXPIRE_ONLY | FUSE_DIRECT_IO_ALLOW_MMAP; + FUSE_HAS_EXPIRE_ONLY | FUSE_DIRECT_IO_ALLOW_MMAP | + FUSE_NO_EXPORT_SUPPORT | FUSE_HAS_RESEND | FUSE_ALLOW_IDMAP; #ifdef CONFIG_FUSE_DAX if (fm->fc->dax) flags |= FUSE_MAP_ALIGNMENT; @@ -1367,6 +1410,8 @@ void fuse_send_init(struct fuse_mount *fm) #endif if (fm->fc->auto_submounts) flags |= FUSE_SUBMOUNTS; + if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) + flags |= FUSE_PASSTHROUGH; ia->in.flags = flags; ia->in.flags2 = flags >> 32; @@ -1501,22 +1546,24 @@ EXPORT_SYMBOL_GPL(fuse_dev_free); static void fuse_fill_attr_from_inode(struct fuse_attr *attr, const struct fuse_inode *fi) { + struct timespec64 atime = inode_get_atime(&fi->inode); + struct timespec64 mtime = inode_get_mtime(&fi->inode); struct timespec64 ctime = inode_get_ctime(&fi->inode); *attr = (struct fuse_attr){ .ino = fi->inode.i_ino, .size = fi->inode.i_size, .blocks = fi->inode.i_blocks, - .atime = fi->inode.i_atime.tv_sec, - .mtime = fi->inode.i_mtime.tv_sec, + .atime = atime.tv_sec, + .mtime = mtime.tv_sec, .ctime = ctime.tv_sec, - .atimensec = fi->inode.i_atime.tv_nsec, - .mtimensec = fi->inode.i_mtime.tv_nsec, + .atimensec = atime.tv_nsec, + .mtimensec = mtime.tv_nsec, .ctimensec = ctime.tv_nsec, .mode = fi->inode.i_mode, .nlink = fi->inode.i_nlink, - .uid = fi->inode.i_uid.val, - .gid = fi->inode.i_gid.val, + .uid = __kuid_val(fi->inode.i_uid), + .gid = __kgid_val(fi->inode.i_gid), .rdev = fi->inode.i_rdev, .blksize = 1u << fi->inode.i_blkbits, }; @@ -1531,6 +1578,7 @@ static void fuse_sb_defaults(struct super_block *sb) sb->s_time_gran = 1; sb->s_export_op = &fuse_export_operations; sb->s_iflags |= SB_I_IMA_UNVERIFIABLE_SIGNATURE; + sb->s_iflags |= SB_I_NOIDMAP; if (sb->s_user_ns != &init_user_ns) sb->s_iflags |= SB_I_UNTRUSTED_MOUNTER; sb->s_flags &= ~(SB_NOSEC | SB_I_VERSION); @@ -1553,6 +1601,7 @@ static int fuse_fill_super_submount(struct super_block *sb, sb->s_bdi = bdi_get(parent_sb->s_bdi); sb->s_xattr = parent_sb->s_xattr; + sb->s_export_op = parent_sb->s_export_op; sb->s_time_gran = parent_sb->s_time_gran; sb->s_blocksize = parent_sb->s_blocksize; sb->s_blocksize_bits = parent_sb->s_blocksize_bits; @@ -1942,7 +1991,7 @@ static void fuse_kill_sb_anon(struct super_block *sb) static struct file_system_type fuse_fs_type = { .owner = THIS_MODULE, .name = "fuse", - .fs_flags = FS_HAS_SUBTYPE | FS_USERNS_MOUNT, + .fs_flags = FS_HAS_SUBTYPE | FS_USERNS_MOUNT | FS_ALLOW_IDMAP, .init_fs_context = fuse_init_fs_context, .parameters = fuse_fs_parameters, .kill_sb = fuse_kill_sb_anon, @@ -1963,7 +2012,7 @@ static struct file_system_type fuseblk_fs_type = { .init_fs_context = fuse_init_fs_context, .parameters = fuse_fs_parameters, .kill_sb = fuse_kill_sb_blk, - .fs_flags = FS_REQUIRES_DEV | FS_HAS_SUBTYPE, + .fs_flags = FS_REQUIRES_DEV | FS_HAS_SUBTYPE | FS_ALLOW_IDMAP, }; MODULE_ALIAS_FS("fuseblk"); diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c index 726640fa439e..572ce8a82ceb 100644 --- a/fs/fuse/ioctl.c +++ b/fs/fuse/ioctl.c @@ -8,6 +8,7 @@ #include <linux/uio.h> #include <linux/compat.h> #include <linux/fileattr.h> +#include <linux/fsverity.h> static ssize_t fuse_send_ioctl(struct fuse_mount *fm, struct fuse_args *args, struct fuse_ioctl_out *outarg) @@ -117,6 +118,53 @@ static int fuse_copy_ioctl_iovec(struct fuse_conn *fc, struct iovec *dst, return 0; } +/* For fs-verity, determine iov lengths from input */ +static int fuse_setup_measure_verity(unsigned long arg, struct iovec *iov) +{ + __u16 digest_size; + struct fsverity_digest __user *uarg = (void __user *)arg; + + if (copy_from_user(&digest_size, &uarg->digest_size, sizeof(digest_size))) + return -EFAULT; + + if (digest_size > SIZE_MAX - sizeof(struct fsverity_digest)) + return -EINVAL; + + iov->iov_len = sizeof(struct fsverity_digest) + digest_size; + + return 0; +} + +static int fuse_setup_enable_verity(unsigned long arg, struct iovec *iov, + unsigned int *in_iovs) +{ + struct fsverity_enable_arg enable; + struct fsverity_enable_arg __user *uarg = (void __user *)arg; + const __u32 max_buffer_len = FUSE_MAX_MAX_PAGES * PAGE_SIZE; + + if (copy_from_user(&enable, uarg, sizeof(enable))) + return -EFAULT; + + if (enable.salt_size > max_buffer_len || enable.sig_size > max_buffer_len) + return -ENOMEM; + + if (enable.salt_size > 0) { + iov++; + (*in_iovs)++; + + iov->iov_base = u64_to_user_ptr(enable.salt_ptr); + iov->iov_len = enable.salt_size; + } + + if (enable.sig_size > 0) { + iov++; + (*in_iovs)++; + + iov->iov_base = u64_to_user_ptr(enable.sig_ptr); + iov->iov_len = enable.sig_size; + } + return 0; +} /* * For ioctls, there is no generic way to determine how much memory @@ -227,6 +275,18 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, out_iov = iov; out_iovs = 1; } + + err = 0; + switch (cmd) { + case FS_IOC_MEASURE_VERITY: + err = fuse_setup_measure_verity(arg, iov); + break; + case FS_IOC_ENABLE_VERITY: + err = fuse_setup_enable_verity(arg, iov, &in_iovs); + break; + } + if (err) + goto out; } retry: diff --git a/fs/fuse/iomode.c b/fs/fuse/iomode.c new file mode 100644 index 000000000000..c99e285f3183 --- /dev/null +++ b/fs/fuse/iomode.c @@ -0,0 +1,276 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * FUSE inode io modes. + * + * Copyright (c) 2024 CTERA Networks. + */ + +#include "fuse_i.h" + +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/file.h> +#include <linux/fs.h> + +/* + * Return true if need to wait for new opens in caching mode. + */ +static inline bool fuse_is_io_cache_wait(struct fuse_inode *fi) +{ + return READ_ONCE(fi->iocachectr) < 0 && !fuse_inode_backing(fi); +} + +/* + * Called on cached file open() and on first mmap() of direct_io file. + * Takes cached_io inode mode reference to be dropped on file release. + * + * Blocks new parallel dio writes and waits for the in-progress parallel dio + * writes to complete. + */ +int fuse_file_cached_io_open(struct inode *inode, struct fuse_file *ff) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + + /* There are no io modes if server does not implement open */ + if (!ff->args) + return 0; + + spin_lock(&fi->lock); + /* + * Setting the bit advises new direct-io writes to use an exclusive + * lock - without it the wait below might be forever. + */ + while (fuse_is_io_cache_wait(fi)) { + set_bit(FUSE_I_CACHE_IO_MODE, &fi->state); + spin_unlock(&fi->lock); + wait_event(fi->direct_io_waitq, !fuse_is_io_cache_wait(fi)); + spin_lock(&fi->lock); + } + + /* + * Check if inode entered passthrough io mode while waiting for parallel + * dio write completion. + */ + if (fuse_inode_backing(fi)) { + clear_bit(FUSE_I_CACHE_IO_MODE, &fi->state); + spin_unlock(&fi->lock); + return -ETXTBSY; + } + + WARN_ON(ff->iomode == IOM_UNCACHED); + if (ff->iomode == IOM_NONE) { + ff->iomode = IOM_CACHED; + if (fi->iocachectr == 0) + set_bit(FUSE_I_CACHE_IO_MODE, &fi->state); + fi->iocachectr++; + } + spin_unlock(&fi->lock); + return 0; +} + +static void fuse_file_cached_io_release(struct fuse_file *ff, + struct fuse_inode *fi) +{ + spin_lock(&fi->lock); + WARN_ON(fi->iocachectr <= 0); + WARN_ON(ff->iomode != IOM_CACHED); + ff->iomode = IOM_NONE; + fi->iocachectr--; + if (fi->iocachectr == 0) + clear_bit(FUSE_I_CACHE_IO_MODE, &fi->state); + spin_unlock(&fi->lock); +} + +/* Start strictly uncached io mode where cache access is not allowed */ +int fuse_inode_uncached_io_start(struct fuse_inode *fi, struct fuse_backing *fb) +{ + struct fuse_backing *oldfb; + int err = 0; + + spin_lock(&fi->lock); + /* deny conflicting backing files on same fuse inode */ + oldfb = fuse_inode_backing(fi); + if (fb && oldfb && oldfb != fb) { + err = -EBUSY; + goto unlock; + } + if (fi->iocachectr > 0) { + err = -ETXTBSY; + goto unlock; + } + fi->iocachectr--; + + /* fuse inode holds a single refcount of backing file */ + if (fb && !oldfb) { + oldfb = fuse_inode_backing_set(fi, fb); + WARN_ON_ONCE(oldfb != NULL); + } else { + fuse_backing_put(fb); + } +unlock: + spin_unlock(&fi->lock); + return err; +} + +/* Takes uncached_io inode mode reference to be dropped on file release */ +static int fuse_file_uncached_io_open(struct inode *inode, + struct fuse_file *ff, + struct fuse_backing *fb) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + int err; + + err = fuse_inode_uncached_io_start(fi, fb); + if (err) + return err; + + WARN_ON(ff->iomode != IOM_NONE); + ff->iomode = IOM_UNCACHED; + return 0; +} + +void fuse_inode_uncached_io_end(struct fuse_inode *fi) +{ + struct fuse_backing *oldfb = NULL; + + spin_lock(&fi->lock); + WARN_ON(fi->iocachectr >= 0); + fi->iocachectr++; + if (!fi->iocachectr) { + wake_up(&fi->direct_io_waitq); + oldfb = fuse_inode_backing_set(fi, NULL); + } + spin_unlock(&fi->lock); + if (oldfb) + fuse_backing_put(oldfb); +} + +/* Drop uncached_io reference from passthrough open */ +static void fuse_file_uncached_io_release(struct fuse_file *ff, + struct fuse_inode *fi) +{ + WARN_ON(ff->iomode != IOM_UNCACHED); + ff->iomode = IOM_NONE; + fuse_inode_uncached_io_end(fi); +} + +/* + * Open flags that are allowed in combination with FOPEN_PASSTHROUGH. + * A combination of FOPEN_PASSTHROUGH and FOPEN_DIRECT_IO means that read/write + * operations go directly to the server, but mmap is done on the backing file. + * FOPEN_PASSTHROUGH mode should not co-exist with any users of the fuse inode + * page cache, so FOPEN_KEEP_CACHE is a strange and undesired combination. + */ +#define FOPEN_PASSTHROUGH_MASK \ + (FOPEN_PASSTHROUGH | FOPEN_DIRECT_IO | FOPEN_PARALLEL_DIRECT_WRITES | \ + FOPEN_NOFLUSH) + +static int fuse_file_passthrough_open(struct inode *inode, struct file *file) +{ + struct fuse_file *ff = file->private_data; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_backing *fb; + int err; + + /* Check allowed conditions for file open in passthrough mode */ + if (!IS_ENABLED(CONFIG_FUSE_PASSTHROUGH) || !fc->passthrough || + (ff->open_flags & ~FOPEN_PASSTHROUGH_MASK)) + return -EINVAL; + + fb = fuse_passthrough_open(file, inode, + ff->args->open_outarg.backing_id); + if (IS_ERR(fb)) + return PTR_ERR(fb); + + /* First passthrough file open denies caching inode io mode */ + err = fuse_file_uncached_io_open(inode, ff, fb); + if (!err) + return 0; + + fuse_passthrough_release(ff, fb); + fuse_backing_put(fb); + + return err; +} + +/* Request access to submit new io to inode via open file */ +int fuse_file_io_open(struct file *file, struct inode *inode) +{ + struct fuse_file *ff = file->private_data; + struct fuse_inode *fi = get_fuse_inode(inode); + int err; + + /* + * io modes are not relevant with DAX and with server that does not + * implement open. + */ + if (FUSE_IS_DAX(inode) || !ff->args) + return 0; + + /* + * Server is expected to use FOPEN_PASSTHROUGH for all opens of an inode + * which is already open for passthrough. + */ + err = -EINVAL; + if (fuse_inode_backing(fi) && !(ff->open_flags & FOPEN_PASSTHROUGH)) + goto fail; + + /* + * FOPEN_PARALLEL_DIRECT_WRITES requires FOPEN_DIRECT_IO. + */ + if (!(ff->open_flags & FOPEN_DIRECT_IO)) + ff->open_flags &= ~FOPEN_PARALLEL_DIRECT_WRITES; + + /* + * First passthrough file open denies caching inode io mode. + * First caching file open enters caching inode io mode. + * + * Note that if user opens a file open with O_DIRECT, but server did + * not specify FOPEN_DIRECT_IO, a later fcntl() could remove O_DIRECT, + * so we put the inode in caching mode to prevent parallel dio. + */ + if ((ff->open_flags & FOPEN_DIRECT_IO) && + !(ff->open_flags & FOPEN_PASSTHROUGH)) + return 0; + + if (ff->open_flags & FOPEN_PASSTHROUGH) + err = fuse_file_passthrough_open(inode, file); + else + err = fuse_file_cached_io_open(inode, ff); + if (err) + goto fail; + + return 0; + +fail: + pr_debug("failed to open file in requested io mode (open_flags=0x%x, err=%i).\n", + ff->open_flags, err); + /* + * The file open mode determines the inode io mode. + * Using incorrect open mode is a server mistake, which results in + * user visible failure of open() with EIO error. + */ + return -EIO; +} + +/* No more pending io and no new io possible to inode via open/mmapped file */ +void fuse_file_io_release(struct fuse_file *ff, struct inode *inode) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + + /* + * Last passthrough file close allows caching inode io mode. + * Last caching file close exits caching inode io mode. + */ + switch (ff->iomode) { + case IOM_NONE: + /* Nothing to do */ + break; + case IOM_UNCACHED: + fuse_file_uncached_io_release(ff, fi); + break; + case IOM_CACHED: + fuse_file_cached_io_release(ff, fi); + break; + } +} diff --git a/fs/fuse/passthrough.c b/fs/fuse/passthrough.c new file mode 100644 index 000000000000..bbac547dfcb3 --- /dev/null +++ b/fs/fuse/passthrough.c @@ -0,0 +1,351 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * FUSE passthrough to backing file. + * + * Copyright (c) 2023 CTERA Networks. + */ + +#include "fuse_i.h" + +#include <linux/file.h> +#include <linux/backing-file.h> +#include <linux/splice.h> + +static void fuse_file_accessed(struct file *file) +{ + struct inode *inode = file_inode(file); + + fuse_invalidate_atime(inode); +} + +static void fuse_passthrough_end_write(struct file *file, loff_t pos, ssize_t ret) +{ + struct inode *inode = file_inode(file); + + fuse_write_update_attr(inode, pos, ret); +} + +ssize_t fuse_passthrough_read_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + struct file *file = iocb->ki_filp; + struct fuse_file *ff = file->private_data; + struct file *backing_file = fuse_file_passthrough(ff); + size_t count = iov_iter_count(iter); + ssize_t ret; + struct backing_file_ctx ctx = { + .cred = ff->cred, + .user_file = file, + .accessed = fuse_file_accessed, + }; + + + pr_debug("%s: backing_file=0x%p, pos=%lld, len=%zu\n", __func__, + backing_file, iocb->ki_pos, count); + + if (!count) + return 0; + + ret = backing_file_read_iter(backing_file, iter, iocb, iocb->ki_flags, + &ctx); + + return ret; +} + +ssize_t fuse_passthrough_write_iter(struct kiocb *iocb, + struct iov_iter *iter) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + struct fuse_file *ff = file->private_data; + struct file *backing_file = fuse_file_passthrough(ff); + size_t count = iov_iter_count(iter); + ssize_t ret; + struct backing_file_ctx ctx = { + .cred = ff->cred, + .user_file = file, + .end_write = fuse_passthrough_end_write, + }; + + pr_debug("%s: backing_file=0x%p, pos=%lld, len=%zu\n", __func__, + backing_file, iocb->ki_pos, count); + + if (!count) + return 0; + + inode_lock(inode); + ret = backing_file_write_iter(backing_file, iter, iocb, iocb->ki_flags, + &ctx); + inode_unlock(inode); + + return ret; +} + +ssize_t fuse_passthrough_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, unsigned int flags) +{ + struct fuse_file *ff = in->private_data; + struct file *backing_file = fuse_file_passthrough(ff); + struct backing_file_ctx ctx = { + .cred = ff->cred, + .user_file = in, + .accessed = fuse_file_accessed, + }; + + pr_debug("%s: backing_file=0x%p, pos=%lld, len=%zu, flags=0x%x\n", __func__, + backing_file, ppos ? *ppos : 0, len, flags); + + return backing_file_splice_read(backing_file, ppos, pipe, len, flags, + &ctx); +} + +ssize_t fuse_passthrough_splice_write(struct pipe_inode_info *pipe, + struct file *out, loff_t *ppos, + size_t len, unsigned int flags) +{ + struct fuse_file *ff = out->private_data; + struct file *backing_file = fuse_file_passthrough(ff); + struct inode *inode = file_inode(out); + ssize_t ret; + struct backing_file_ctx ctx = { + .cred = ff->cred, + .user_file = out, + .end_write = fuse_passthrough_end_write, + }; + + pr_debug("%s: backing_file=0x%p, pos=%lld, len=%zu, flags=0x%x\n", __func__, + backing_file, ppos ? *ppos : 0, len, flags); + + inode_lock(inode); + ret = backing_file_splice_write(pipe, backing_file, ppos, len, flags, + &ctx); + inode_unlock(inode); + + return ret; +} + +ssize_t fuse_passthrough_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct fuse_file *ff = file->private_data; + struct file *backing_file = fuse_file_passthrough(ff); + struct backing_file_ctx ctx = { + .cred = ff->cred, + .user_file = file, + .accessed = fuse_file_accessed, + }; + + pr_debug("%s: backing_file=0x%p, start=%lu, end=%lu\n", __func__, + backing_file, vma->vm_start, vma->vm_end); + + return backing_file_mmap(backing_file, vma, &ctx); +} + +struct fuse_backing *fuse_backing_get(struct fuse_backing *fb) +{ + if (fb && refcount_inc_not_zero(&fb->count)) + return fb; + return NULL; +} + +static void fuse_backing_free(struct fuse_backing *fb) +{ + pr_debug("%s: fb=0x%p\n", __func__, fb); + + if (fb->file) + fput(fb->file); + put_cred(fb->cred); + kfree_rcu(fb, rcu); +} + +void fuse_backing_put(struct fuse_backing *fb) +{ + if (fb && refcount_dec_and_test(&fb->count)) + fuse_backing_free(fb); +} + +void fuse_backing_files_init(struct fuse_conn *fc) +{ + idr_init(&fc->backing_files_map); +} + +static int fuse_backing_id_alloc(struct fuse_conn *fc, struct fuse_backing *fb) +{ + int id; + + idr_preload(GFP_KERNEL); + spin_lock(&fc->lock); + /* FIXME: xarray might be space inefficient */ + id = idr_alloc_cyclic(&fc->backing_files_map, fb, 1, 0, GFP_ATOMIC); + spin_unlock(&fc->lock); + idr_preload_end(); + + WARN_ON_ONCE(id == 0); + return id; +} + +static struct fuse_backing *fuse_backing_id_remove(struct fuse_conn *fc, + int id) +{ + struct fuse_backing *fb; + + spin_lock(&fc->lock); + fb = idr_remove(&fc->backing_files_map, id); + spin_unlock(&fc->lock); + + return fb; +} + +static int fuse_backing_id_free(int id, void *p, void *data) +{ + struct fuse_backing *fb = p; + + WARN_ON_ONCE(refcount_read(&fb->count) != 1); + fuse_backing_free(fb); + return 0; +} + +void fuse_backing_files_free(struct fuse_conn *fc) +{ + idr_for_each(&fc->backing_files_map, fuse_backing_id_free, NULL); + idr_destroy(&fc->backing_files_map); +} + +int fuse_backing_open(struct fuse_conn *fc, struct fuse_backing_map *map) +{ + struct file *file; + struct super_block *backing_sb; + struct fuse_backing *fb = NULL; + int res; + + pr_debug("%s: fd=%d flags=0x%x\n", __func__, map->fd, map->flags); + + /* TODO: relax CAP_SYS_ADMIN once backing files are visible to lsof */ + res = -EPERM; + if (!fc->passthrough || !capable(CAP_SYS_ADMIN)) + goto out; + + res = -EINVAL; + if (map->flags || map->padding) + goto out; + + file = fget_raw(map->fd); + res = -EBADF; + if (!file) + goto out; + + backing_sb = file_inode(file)->i_sb; + res = -ELOOP; + if (backing_sb->s_stack_depth >= fc->max_stack_depth) + goto out_fput; + + fb = kmalloc(sizeof(struct fuse_backing), GFP_KERNEL); + res = -ENOMEM; + if (!fb) + goto out_fput; + + fb->file = file; + fb->cred = prepare_creds(); + refcount_set(&fb->count, 1); + + res = fuse_backing_id_alloc(fc, fb); + if (res < 0) { + fuse_backing_free(fb); + fb = NULL; + } + +out: + pr_debug("%s: fb=0x%p, ret=%i\n", __func__, fb, res); + + return res; + +out_fput: + fput(file); + goto out; +} + +int fuse_backing_close(struct fuse_conn *fc, int backing_id) +{ + struct fuse_backing *fb = NULL; + int err; + + pr_debug("%s: backing_id=%d\n", __func__, backing_id); + + /* TODO: relax CAP_SYS_ADMIN once backing files are visible to lsof */ + err = -EPERM; + if (!fc->passthrough || !capable(CAP_SYS_ADMIN)) + goto out; + + err = -EINVAL; + if (backing_id <= 0) + goto out; + + err = -ENOENT; + fb = fuse_backing_id_remove(fc, backing_id); + if (!fb) + goto out; + + fuse_backing_put(fb); + err = 0; +out: + pr_debug("%s: fb=0x%p, err=%i\n", __func__, fb, err); + + return err; +} + +/* + * Setup passthrough to a backing file. + * + * Returns an fb object with elevated refcount to be stored in fuse inode. + */ +struct fuse_backing *fuse_passthrough_open(struct file *file, + struct inode *inode, + int backing_id) +{ + struct fuse_file *ff = file->private_data; + struct fuse_conn *fc = ff->fm->fc; + struct fuse_backing *fb = NULL; + struct file *backing_file; + int err; + + err = -EINVAL; + if (backing_id <= 0) + goto out; + + rcu_read_lock(); + fb = idr_find(&fc->backing_files_map, backing_id); + fb = fuse_backing_get(fb); + rcu_read_unlock(); + + err = -ENOENT; + if (!fb) + goto out; + + /* Allocate backing file per fuse file to store fuse path */ + backing_file = backing_file_open(&file->f_path, file->f_flags, + &fb->file->f_path, fb->cred); + err = PTR_ERR(backing_file); + if (IS_ERR(backing_file)) { + fuse_backing_put(fb); + goto out; + } + + err = 0; + ff->passthrough = backing_file; + ff->cred = get_cred(fb->cred); +out: + pr_debug("%s: backing_id=%d, fb=0x%p, backing_file=0x%p, err=%i\n", __func__, + backing_id, fb, ff->passthrough, err); + + return err ? ERR_PTR(err) : fb; +} + +void fuse_passthrough_release(struct fuse_file *ff, struct fuse_backing *fb) +{ + pr_debug("%s: fb=0x%p, backing_file=0x%p\n", __func__, + fb, ff->passthrough); + + fput(ff->passthrough); + ff->passthrough = NULL; + put_cred(ff->cred); + ff->cred = NULL; +} diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index 9e6d587b3e67..0377b6dc24c8 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -476,7 +476,7 @@ retry_locked: if (!fi->rdc.cached) { /* Starting cache? Set cache mtime. */ if (!ctx->pos && !fi->rdc.size) { - fi->rdc.mtime = inode->i_mtime; + fi->rdc.mtime = inode_get_mtime(inode); fi->rdc.iversion = inode_query_iversion(inode); } spin_unlock(&fi->rdc.lock); @@ -488,8 +488,10 @@ retry_locked: * changed, and reset the cache if so. */ if (!ctx->pos) { + struct timespec64 mtime = inode_get_mtime(inode); + if (inode_peek_iversion(inode) != fi->rdc.iversion || - !timespec64_equal(&fi->rdc.mtime, &inode->i_mtime)) { + !timespec64_equal(&fi->rdc.mtime, &mtime)) { fuse_rdc_reset(inode); goto retry_locked; } @@ -590,15 +592,11 @@ int fuse_readdir(struct file *file, struct dir_context *ctx) if (fuse_is_bad(inode)) return -EIO; - mutex_lock(&ff->readdir.lock); - err = UNCACHED; if (ff->open_flags & FOPEN_CACHE_DIR) err = fuse_readdir_cached(file, ctx); if (err == UNCACHED) err = fuse_readdir_uncached(file, ctx); - mutex_unlock(&ff->readdir.lock); - return err; } diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index d84dacbdce2c..749c9f66d74c 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -7,6 +7,8 @@ #include <linux/fs.h> #include <linux/dax.h> #include <linux/pci.h> +#include <linux/interrupt.h> +#include <linux/group_cpus.h> #include <linux/pfn_t.h> #include <linux/memremap.h> #include <linux/module.h> @@ -16,6 +18,7 @@ #include <linux/fs_context.h> #include <linux/fs_parser.h> #include <linux/highmem.h> +#include <linux/cleanup.h> #include <linux/uio.h> #include "fuse_i.h" @@ -31,6 +34,9 @@ static DEFINE_MUTEX(virtio_fs_mutex); static LIST_HEAD(virtio_fs_instances); +/* The /sys/fs/virtio_fs/ kset */ +static struct kset *virtio_fs_kset; + enum { VQ_HIPRIO, VQ_REQUEST @@ -45,17 +51,19 @@ struct virtio_fs_vq { struct work_struct done_work; struct list_head queued_reqs; struct list_head end_reqs; /* End these requests */ - struct delayed_work dispatch_work; + struct work_struct dispatch_work; struct fuse_dev *fud; bool connected; long in_flight; struct completion in_flight_zero; /* No inflight requests */ + struct kobject *kobj; char name[VQ_NAME_LEN]; } ____cacheline_aligned_in_smp; /* A virtio-fs device instance */ struct virtio_fs { - struct kref refcount; + struct kobject kobj; + struct kobject *mqs_kobj; struct list_head list; /* on virtio_fs_instances */ char *tag; struct virtio_fs_vq *vqs; @@ -63,6 +71,8 @@ struct virtio_fs { unsigned int num_request_queues; /* number of request queues */ struct dax_device *dax_dev; + unsigned int *mq_map; /* index = cpu id, value = request vq id */ + /* DAX memory window where file contents are mapped */ void *window_kaddr; phys_addr_t window_phys_addr; @@ -161,27 +171,125 @@ static inline void dec_in_flight_req(struct virtio_fs_vq *fsvq) complete(&fsvq->in_flight_zero); } -static void release_virtio_fs_obj(struct kref *ref) +static ssize_t tag_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct virtio_fs *fs = container_of(kobj, struct virtio_fs, kobj); + + return sysfs_emit(buf, "%s\n", fs->tag); +} + +static struct kobj_attribute virtio_fs_tag_attr = __ATTR_RO(tag); + +static struct attribute *virtio_fs_attrs[] = { + &virtio_fs_tag_attr.attr, + NULL +}; +ATTRIBUTE_GROUPS(virtio_fs); + +static void virtio_fs_ktype_release(struct kobject *kobj) { - struct virtio_fs *vfs = container_of(ref, struct virtio_fs, refcount); + struct virtio_fs *vfs = container_of(kobj, struct virtio_fs, kobj); + kfree(vfs->mq_map); kfree(vfs->vqs); kfree(vfs); } +static const struct kobj_type virtio_fs_ktype = { + .release = virtio_fs_ktype_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = virtio_fs_groups, +}; + +static struct virtio_fs_vq *virtio_fs_kobj_to_vq(struct virtio_fs *fs, + struct kobject *kobj) +{ + int i; + + for (i = 0; i < fs->nvqs; i++) { + if (kobj == fs->vqs[i].kobj) + return &fs->vqs[i]; + } + return NULL; +} + +static ssize_t name_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct virtio_fs *fs = container_of(kobj->parent->parent, struct virtio_fs, kobj); + struct virtio_fs_vq *fsvq = virtio_fs_kobj_to_vq(fs, kobj); + + if (!fsvq) + return -EINVAL; + return sysfs_emit(buf, "%s\n", fsvq->name); +} + +static struct kobj_attribute virtio_fs_vq_name_attr = __ATTR_RO(name); + +static ssize_t cpu_list_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct virtio_fs *fs = container_of(kobj->parent->parent, struct virtio_fs, kobj); + struct virtio_fs_vq *fsvq = virtio_fs_kobj_to_vq(fs, kobj); + unsigned int cpu, qid; + const size_t size = PAGE_SIZE - 1; + bool first = true; + int ret = 0, pos = 0; + + if (!fsvq) + return -EINVAL; + + qid = fsvq->vq->index; + for (cpu = 0; cpu < nr_cpu_ids; cpu++) { + if (qid < VQ_REQUEST || (fs->mq_map[cpu] == qid - VQ_REQUEST)) { + if (first) + ret = snprintf(buf + pos, size - pos, "%u", cpu); + else + ret = snprintf(buf + pos, size - pos, ", %u", cpu); + + if (ret >= size - pos) + break; + first = false; + pos += ret; + } + } + ret = snprintf(buf + pos, size + 1 - pos, "\n"); + return pos + ret; +} + +static struct kobj_attribute virtio_fs_vq_cpu_list_attr = __ATTR_RO(cpu_list); + +static struct attribute *virtio_fs_vq_attrs[] = { + &virtio_fs_vq_name_attr.attr, + &virtio_fs_vq_cpu_list_attr.attr, + NULL +}; + +static struct attribute_group virtio_fs_vq_attr_group = { + .attrs = virtio_fs_vq_attrs, +}; + /* Make sure virtiofs_mutex is held */ +static void virtio_fs_put_locked(struct virtio_fs *fs) +{ + lockdep_assert_held(&virtio_fs_mutex); + + kobject_put(&fs->kobj); +} + static void virtio_fs_put(struct virtio_fs *fs) { - kref_put(&fs->refcount, release_virtio_fs_obj); + mutex_lock(&virtio_fs_mutex); + virtio_fs_put_locked(fs); + mutex_unlock(&virtio_fs_mutex); } static void virtio_fs_fiq_release(struct fuse_iqueue *fiq) { struct virtio_fs *vfs = fiq->priv; - mutex_lock(&virtio_fs_mutex); virtio_fs_put(vfs); - mutex_unlock(&virtio_fs_mutex); } static void virtio_fs_drain_queue(struct virtio_fs_vq *fsvq) @@ -202,7 +310,7 @@ static void virtio_fs_drain_queue(struct virtio_fs_vq *fsvq) } flush_work(&fsvq->done_work); - flush_delayed_work(&fsvq->dispatch_work); + flush_work(&fsvq->dispatch_work); } static void virtio_fs_drain_all_queues_locked(struct virtio_fs *fs) @@ -242,27 +350,107 @@ static void virtio_fs_start_all_queues(struct virtio_fs *fs) } } +static void virtio_fs_delete_queues_sysfs(struct virtio_fs *fs) +{ + struct virtio_fs_vq *fsvq; + int i; + + for (i = 0; i < fs->nvqs; i++) { + fsvq = &fs->vqs[i]; + kobject_put(fsvq->kobj); + } +} + +static int virtio_fs_add_queues_sysfs(struct virtio_fs *fs) +{ + struct virtio_fs_vq *fsvq; + char buff[12]; + int i, j, ret; + + for (i = 0; i < fs->nvqs; i++) { + fsvq = &fs->vqs[i]; + + sprintf(buff, "%d", i); + fsvq->kobj = kobject_create_and_add(buff, fs->mqs_kobj); + if (!fs->mqs_kobj) { + ret = -ENOMEM; + goto out_del; + } + + ret = sysfs_create_group(fsvq->kobj, &virtio_fs_vq_attr_group); + if (ret) { + kobject_put(fsvq->kobj); + goto out_del; + } + } + + return 0; + +out_del: + for (j = 0; j < i; j++) { + fsvq = &fs->vqs[j]; + kobject_put(fsvq->kobj); + } + return ret; +} + /* Add a new instance to the list or return -EEXIST if tag name exists*/ -static int virtio_fs_add_instance(struct virtio_fs *fs) +static int virtio_fs_add_instance(struct virtio_device *vdev, + struct virtio_fs *fs) { struct virtio_fs *fs2; - bool duplicate = false; + int ret; mutex_lock(&virtio_fs_mutex); list_for_each_entry(fs2, &virtio_fs_instances, list) { - if (strcmp(fs->tag, fs2->tag) == 0) - duplicate = true; + if (strcmp(fs->tag, fs2->tag) == 0) { + mutex_unlock(&virtio_fs_mutex); + return -EEXIST; + } } - if (!duplicate) - list_add_tail(&fs->list, &virtio_fs_instances); + /* Use the virtio_device's index as a unique identifier, there is no + * need to allocate our own identifiers because the virtio_fs instance + * is only visible to userspace as long as the underlying virtio_device + * exists. + */ + fs->kobj.kset = virtio_fs_kset; + ret = kobject_add(&fs->kobj, NULL, "%d", vdev->index); + if (ret < 0) + goto out_unlock; + + fs->mqs_kobj = kobject_create_and_add("mqs", &fs->kobj); + if (!fs->mqs_kobj) { + ret = -ENOMEM; + goto out_del; + } + + ret = sysfs_create_link(&fs->kobj, &vdev->dev.kobj, "device"); + if (ret < 0) + goto out_put; + + ret = virtio_fs_add_queues_sysfs(fs); + if (ret) + goto out_remove; + + list_add_tail(&fs->list, &virtio_fs_instances); mutex_unlock(&virtio_fs_mutex); - if (duplicate) - return -EEXIST; + kobject_uevent(&fs->kobj, KOBJ_ADD); + return 0; + +out_remove: + sysfs_remove_link(&fs->kobj, "device"); +out_put: + kobject_put(fs->mqs_kobj); +out_del: + kobject_del(&fs->kobj); +out_unlock: + mutex_unlock(&virtio_fs_mutex); + return ret; } /* Return the virtio_fs with a given tag, or NULL */ @@ -274,7 +462,7 @@ static struct virtio_fs *virtio_fs_find_instance(const char *tag) list_for_each_entry(fs, &virtio_fs_instances, list) { if (strcmp(fs->tag, tag) == 0) { - kref_get(&fs->refcount); + kobject_get(&fs->kobj); goto found; } } @@ -355,7 +543,11 @@ static void virtio_fs_hiprio_done_work(struct work_struct *work) kfree(req); dec_in_flight_req(fsvq); } - } while (!virtqueue_enable_cb(vq) && likely(!virtqueue_is_broken(vq))); + } while (!virtqueue_enable_cb(vq)); + + if (!list_empty(&fsvq->queued_reqs)) + schedule_work(&fsvq->dispatch_work); + spin_unlock(&fsvq->lock); } @@ -363,7 +555,7 @@ static void virtio_fs_request_dispatch_work(struct work_struct *work) { struct fuse_req *req; struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq, - dispatch_work.work); + dispatch_work); int ret; pr_debug("virtio-fs: worker %s called.\n", __func__); @@ -395,11 +587,9 @@ static void virtio_fs_request_dispatch_work(struct work_struct *work) ret = virtio_fs_enqueue_req(fsvq, req, true); if (ret < 0) { - if (ret == -ENOMEM || ret == -ENOSPC) { + if (ret == -ENOSPC) { spin_lock(&fsvq->lock); list_add_tail(&req->list, &fsvq->queued_reqs); - schedule_delayed_work(&fsvq->dispatch_work, - msecs_to_jiffies(1)); spin_unlock(&fsvq->lock); return; } @@ -442,12 +632,10 @@ static int send_forget_request(struct virtio_fs_vq *fsvq, ret = virtqueue_add_outbuf(vq, &sg, 1, forget, GFP_ATOMIC); if (ret < 0) { - if (ret == -ENOMEM || ret == -ENOSPC) { + if (ret == -ENOSPC) { pr_debug("virtio-fs: Could not queue FORGET: err=%d. Will try later\n", ret); list_add_tail(&forget->list, &fsvq->queued_reqs); - schedule_delayed_work(&fsvq->dispatch_work, - msecs_to_jiffies(1)); if (!in_flight) inc_in_flight_req(fsvq); /* Queue is full */ @@ -479,7 +667,7 @@ static void virtio_fs_hiprio_dispatch_work(struct work_struct *work) { struct virtio_fs_forget *forget; struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq, - dispatch_work.work); + dispatch_work); pr_debug("virtio-fs: worker %s called.\n", __func__); while (1) { spin_lock(&fsvq->lock); @@ -637,7 +825,7 @@ static void virtio_fs_requests_done_work(struct work_struct *work) list_move_tail(&req->list, &reqs); spin_unlock(&fpq->lock); } - } while (!virtqueue_enable_cb(vq) && likely(!virtqueue_is_broken(vq))); + } while (!virtqueue_enable_cb(vq)); spin_unlock(&fsvq->lock); /* End requests */ @@ -657,6 +845,50 @@ static void virtio_fs_requests_done_work(struct work_struct *work) virtio_fs_request_complete(req, fsvq); } } + + /* Try to push previously queued requests, as the queue might no longer be full */ + spin_lock(&fsvq->lock); + if (!list_empty(&fsvq->queued_reqs)) + schedule_work(&fsvq->dispatch_work); + spin_unlock(&fsvq->lock); +} + +static void virtio_fs_map_queues(struct virtio_device *vdev, struct virtio_fs *fs) +{ + const struct cpumask *mask, *masks; + unsigned int q, cpu; + + /* First attempt to map using existing transport layer affinities + * e.g. PCIe MSI-X + */ + if (!vdev->config->get_vq_affinity) + goto fallback; + + for (q = 0; q < fs->num_request_queues; q++) { + mask = vdev->config->get_vq_affinity(vdev, VQ_REQUEST + q); + if (!mask) + goto fallback; + + for_each_cpu(cpu, mask) + fs->mq_map[cpu] = q; + } + + return; +fallback: + /* Attempt to map evenly in groups over the CPUs */ + masks = group_cpus_evenly(fs->num_request_queues); + /* If even this fails we default to all CPUs use queue zero */ + if (!masks) { + for_each_possible_cpu(cpu) + fs->mq_map[cpu] = 0; + return; + } + + for (q = 0; q < fs->num_request_queues; q++) { + for_each_cpu(cpu, &masks[q]) + fs->mq_map[cpu] = q; + } + kfree(masks); } /* Virtqueue interrupt handler */ @@ -680,12 +912,12 @@ static void virtio_fs_init_vq(struct virtio_fs_vq *fsvq, char *name, if (vq_type == VQ_REQUEST) { INIT_WORK(&fsvq->done_work, virtio_fs_requests_done_work); - INIT_DELAYED_WORK(&fsvq->dispatch_work, - virtio_fs_request_dispatch_work); + INIT_WORK(&fsvq->dispatch_work, + virtio_fs_request_dispatch_work); } else { INIT_WORK(&fsvq->done_work, virtio_fs_hiprio_done_work); - INIT_DELAYED_WORK(&fsvq->dispatch_work, - virtio_fs_hiprio_dispatch_work); + INIT_WORK(&fsvq->dispatch_work, + virtio_fs_hiprio_dispatch_work); } } @@ -693,9 +925,13 @@ static void virtio_fs_init_vq(struct virtio_fs_vq *fsvq, char *name, static int virtio_fs_setup_vqs(struct virtio_device *vdev, struct virtio_fs *fs) { + struct virtqueue_info *vqs_info; struct virtqueue **vqs; - vq_callback_t **callbacks; - const char **names; + /* Specify pre_vectors to ensure that the queues before the + * request queues (e.g. hiprio) don't claim any of the CPUs in + * the multi-queue mapping and interrupt affinities + */ + struct irq_affinity desc = { .pre_vectors = VQ_REQUEST }; unsigned int i; int ret = 0; @@ -704,24 +940,27 @@ static int virtio_fs_setup_vqs(struct virtio_device *vdev, if (fs->num_request_queues == 0) return -EINVAL; + /* Truncate nr of request queues to nr_cpu_id */ + fs->num_request_queues = min_t(unsigned int, fs->num_request_queues, + nr_cpu_ids); fs->nvqs = VQ_REQUEST + fs->num_request_queues; fs->vqs = kcalloc(fs->nvqs, sizeof(fs->vqs[VQ_HIPRIO]), GFP_KERNEL); if (!fs->vqs) return -ENOMEM; vqs = kmalloc_array(fs->nvqs, sizeof(vqs[VQ_HIPRIO]), GFP_KERNEL); - callbacks = kmalloc_array(fs->nvqs, sizeof(callbacks[VQ_HIPRIO]), - GFP_KERNEL); - names = kmalloc_array(fs->nvqs, sizeof(names[VQ_HIPRIO]), GFP_KERNEL); - if (!vqs || !callbacks || !names) { + fs->mq_map = kcalloc_node(nr_cpu_ids, sizeof(*fs->mq_map), GFP_KERNEL, + dev_to_node(&vdev->dev)); + vqs_info = kcalloc(fs->nvqs, sizeof(*vqs_info), GFP_KERNEL); + if (!vqs || !vqs_info || !fs->mq_map) { ret = -ENOMEM; goto out; } /* Initialize the hiprio/forget request virtqueue */ - callbacks[VQ_HIPRIO] = virtio_fs_vq_done; + vqs_info[VQ_HIPRIO].callback = virtio_fs_vq_done; virtio_fs_init_vq(&fs->vqs[VQ_HIPRIO], "hiprio", VQ_HIPRIO); - names[VQ_HIPRIO] = fs->vqs[VQ_HIPRIO].name; + vqs_info[VQ_HIPRIO].name = fs->vqs[VQ_HIPRIO].name; /* Initialize the requests virtqueues */ for (i = VQ_REQUEST; i < fs->nvqs; i++) { @@ -729,11 +968,11 @@ static int virtio_fs_setup_vqs(struct virtio_device *vdev, snprintf(vq_name, VQ_NAME_LEN, "requests.%u", i - VQ_REQUEST); virtio_fs_init_vq(&fs->vqs[i], vq_name, VQ_REQUEST); - callbacks[i] = virtio_fs_vq_done; - names[i] = fs->vqs[i].name; + vqs_info[i].callback = virtio_fs_vq_done; + vqs_info[i].name = fs->vqs[i].name; } - ret = virtio_find_vqs(vdev, fs->nvqs, vqs, callbacks, names, NULL); + ret = virtio_find_vqs(vdev, fs->nvqs, vqs, vqs_info, &desc); if (ret < 0) goto out; @@ -742,11 +981,12 @@ static int virtio_fs_setup_vqs(struct virtio_device *vdev, virtio_fs_start_all_queues(fs); out: - kfree(names); - kfree(callbacks); + kfree(vqs_info); kfree(vqs); - if (ret) + if (ret) { kfree(fs->vqs); + kfree(fs->mq_map); + } return ret; } @@ -805,8 +1045,11 @@ static void virtio_fs_cleanup_dax(void *data) put_dax(dax_dev); } +DEFINE_FREE(cleanup_dax, struct dax_dev *, if (!IS_ERR_OR_NULL(_T)) virtio_fs_cleanup_dax(_T)) + static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs) { + struct dax_device *dax_dev __free(cleanup_dax) = NULL; struct virtio_shm_region cache_reg; struct dev_pagemap *pgmap; bool have_cache; @@ -814,6 +1057,12 @@ static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs) if (!IS_ENABLED(CONFIG_FUSE_DAX)) return 0; + dax_dev = alloc_dax(fs, &virtio_fs_dax_ops); + if (IS_ERR(dax_dev)) { + int rc = PTR_ERR(dax_dev); + return rc == -EOPNOTSUPP ? 0 : rc; + } + /* Get cache region */ have_cache = virtio_get_shm_region(vdev, &cache_reg, (u8)VIRTIO_FS_SHMCAP_ID_CACHE); @@ -859,10 +1108,7 @@ static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs) dev_dbg(&vdev->dev, "%s: window kaddr 0x%px phys_addr 0x%llx len 0x%llx\n", __func__, fs->window_kaddr, cache_reg.addr, cache_reg.len); - fs->dax_dev = alloc_dax(fs, &virtio_fs_dax_ops); - if (IS_ERR(fs->dax_dev)) - return PTR_ERR(fs->dax_dev); - + fs->dax_dev = no_free_ptr(dax_dev); return devm_add_action_or_reset(&vdev->dev, virtio_fs_cleanup_dax, fs->dax_dev); } @@ -875,7 +1121,7 @@ static int virtio_fs_probe(struct virtio_device *vdev) fs = kzalloc(sizeof(*fs), GFP_KERNEL); if (!fs) return -ENOMEM; - kref_init(&fs->refcount); + kobject_init(&fs->kobj, &virtio_fs_ktype); vdev->priv = fs; ret = virtio_fs_read_tag(vdev, fs); @@ -886,7 +1132,7 @@ static int virtio_fs_probe(struct virtio_device *vdev) if (ret < 0) goto out; - /* TODO vq affinity */ + virtio_fs_map_queues(vdev, fs); ret = virtio_fs_setup_dax(vdev, fs); if (ret < 0) @@ -897,7 +1143,7 @@ static int virtio_fs_probe(struct virtio_device *vdev) */ virtio_device_ready(vdev); - ret = virtio_fs_add_instance(fs); + ret = virtio_fs_add_instance(vdev, fs); if (ret < 0) goto out_vqs; @@ -906,11 +1152,10 @@ static int virtio_fs_probe(struct virtio_device *vdev) out_vqs: virtio_reset_device(vdev); virtio_fs_cleanup_vqs(vdev); - kfree(fs->vqs); out: vdev->priv = NULL; - kfree(fs); + kobject_put(&fs->kobj); return ret; } @@ -934,6 +1179,10 @@ static void virtio_fs_remove(struct virtio_device *vdev) mutex_lock(&virtio_fs_mutex); /* This device is going away. No one should get new reference */ list_del_init(&fs->list); + virtio_fs_delete_queues_sysfs(fs); + sysfs_remove_link(&fs->kobj, "device"); + kobject_put(fs->mqs_kobj); + kobject_del(&fs->kobj); virtio_fs_stop_all_queues(fs); virtio_fs_drain_all_queues_locked(fs); virtio_reset_device(vdev); @@ -941,7 +1190,7 @@ static void virtio_fs_remove(struct virtio_device *vdev) vdev->priv = NULL; /* Put device reference on virtio_fs object */ - virtio_fs_put(fs); + virtio_fs_put_locked(fs); mutex_unlock(&virtio_fs_mutex); } @@ -969,7 +1218,6 @@ static const unsigned int feature_table[] = {}; static struct virtio_driver virtio_fs_driver = { .driver.name = KBUILD_MODNAME, - .driver.owner = THIS_MODULE, .id_table = id_table, .feature_table = feature_table, .feature_table_size = ARRAY_SIZE(feature_table), @@ -981,22 +1229,13 @@ static struct virtio_driver virtio_fs_driver = { #endif }; -static void virtio_fs_wake_forget_and_unlock(struct fuse_iqueue *fiq) -__releases(fiq->lock) +static void virtio_fs_send_forget(struct fuse_iqueue *fiq, struct fuse_forget_link *link) { - struct fuse_forget_link *link; struct virtio_fs_forget *forget; struct virtio_fs_forget_req *req; - struct virtio_fs *fs; - struct virtio_fs_vq *fsvq; - u64 unique; - - link = fuse_dequeue_forget(fiq, 1, NULL); - unique = fuse_get_unique(fiq); - - fs = fiq->priv; - fsvq = &fs->vqs[VQ_HIPRIO]; - spin_unlock(&fiq->lock); + struct virtio_fs *fs = fiq->priv; + struct virtio_fs_vq *fsvq = &fs->vqs[VQ_HIPRIO]; + u64 unique = fuse_get_unique(fiq); /* Allocate a buffer for the request */ forget = kmalloc(sizeof(*forget), GFP_NOFS | __GFP_NOFAIL); @@ -1016,8 +1255,7 @@ __releases(fiq->lock) kfree(link); } -static void virtio_fs_wake_interrupt_and_unlock(struct fuse_iqueue *fiq) -__releases(fiq->lock) +static void virtio_fs_send_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req) { /* * TODO interrupts. @@ -1026,7 +1264,6 @@ __releases(fiq->lock) * Exceptions are blocking lock operations; for example fcntl(F_SETLKW) * with shared lock between host and guest. */ - spin_unlock(&fiq->lock); } /* Count number of scatter-gather elements required */ @@ -1231,33 +1468,31 @@ out: return ret; } -static void virtio_fs_wake_pending_and_unlock(struct fuse_iqueue *fiq) -__releases(fiq->lock) +static void virtio_fs_send_req(struct fuse_iqueue *fiq, struct fuse_req *req) { - unsigned int queue_id = VQ_REQUEST; /* TODO multiqueue */ + unsigned int queue_id; struct virtio_fs *fs; - struct fuse_req *req; struct virtio_fs_vq *fsvq; int ret; - WARN_ON(list_empty(&fiq->pending)); - req = list_last_entry(&fiq->pending, struct fuse_req, list); + if (req->in.h.opcode != FUSE_NOTIFY_REPLY) + req->in.h.unique = fuse_get_unique(fiq); + clear_bit(FR_PENDING, &req->flags); - list_del_init(&req->list); - WARN_ON(!list_empty(&fiq->pending)); - spin_unlock(&fiq->lock); fs = fiq->priv; + queue_id = VQ_REQUEST + fs->mq_map[raw_smp_processor_id()]; - pr_debug("%s: opcode %u unique %#llx nodeid %#llx in.len %u out.len %u\n", - __func__, req->in.h.opcode, req->in.h.unique, + pr_debug("%s: opcode %u unique %#llx nodeid %#llx in.len %u out.len %u queue_id %u\n", + __func__, req->in.h.opcode, req->in.h.unique, req->in.h.nodeid, req->in.h.len, - fuse_len_args(req->args->out_numargs, req->args->out_args)); + fuse_len_args(req->args->out_numargs, req->args->out_args), + queue_id); fsvq = &fs->vqs[queue_id]; ret = virtio_fs_enqueue_req(fsvq, req, false); if (ret < 0) { - if (ret == -ENOMEM || ret == -ENOSPC) { + if (ret == -ENOSPC) { /* * Virtqueue full. Retry submission from worker * context as we might be holding fc->bg_lock. @@ -1265,8 +1500,6 @@ __releases(fiq->lock) spin_lock(&fsvq->lock); list_add_tail(&req->list, &fsvq->queued_reqs); inc_in_flight_req(fsvq); - schedule_delayed_work(&fsvq->dispatch_work, - msecs_to_jiffies(1)); spin_unlock(&fsvq->lock); return; } @@ -1276,17 +1509,17 @@ __releases(fiq->lock) /* Can't end request in submission context. Use a worker */ spin_lock(&fsvq->lock); list_add_tail(&req->list, &fsvq->end_reqs); - schedule_delayed_work(&fsvq->dispatch_work, 0); + schedule_work(&fsvq->dispatch_work); spin_unlock(&fsvq->lock); return; } } static const struct fuse_iqueue_ops virtio_fs_fiq_ops = { - .wake_forget_and_unlock = virtio_fs_wake_forget_and_unlock, - .wake_interrupt_and_unlock = virtio_fs_wake_interrupt_and_unlock, - .wake_pending_and_unlock = virtio_fs_wake_pending_and_unlock, - .release = virtio_fs_fiq_release, + .send_forget = virtio_fs_send_forget, + .send_interrupt = virtio_fs_send_interrupt, + .send_req = virtio_fs_send_req, + .release = virtio_fs_fiq_release, }; static inline void virtio_fs_ctx_set_defaults(struct fuse_fs_context *ctx) @@ -1430,6 +1663,9 @@ static int virtio_fs_get_tree(struct fs_context *fsc) unsigned int virtqueue_size; int err = -EIO; + if (!fsc->source) + return invalf(fsc, "No source specified"); + /* This gets a reference on virtio_fs object. This ptr gets installed * in fc->iq->priv. Once fuse_conn is going away, it calls ->put() * to drop the reference to this object. @@ -1458,6 +1694,7 @@ static int virtio_fs_get_tree(struct fs_context *fsc) fc->delete_stale = true; fc->auto_submounts = true; fc->sync_fs = true; + fc->use_pages_for_kvec_io = true; /* Tell FUSE to split requests that exceed the virtqueue's size */ fc->max_pages_limit = min_t(unsigned int, fc->max_pages_limit, @@ -1486,9 +1723,7 @@ static int virtio_fs_get_tree(struct fs_context *fsc) out_err: kfree(fc); - mutex_lock(&virtio_fs_mutex); virtio_fs_put(fs); - mutex_unlock(&virtio_fs_mutex); return err; } @@ -1518,23 +1753,59 @@ static struct file_system_type virtio_fs_type = { .name = "virtiofs", .init_fs_context = virtio_fs_init_fs_context, .kill_sb = virtio_kill_sb, + .fs_flags = FS_ALLOW_IDMAP, }; +static int virtio_fs_uevent(const struct kobject *kobj, struct kobj_uevent_env *env) +{ + const struct virtio_fs *fs = container_of(kobj, struct virtio_fs, kobj); + + add_uevent_var(env, "TAG=%s", fs->tag); + return 0; +} + +static const struct kset_uevent_ops virtio_fs_uevent_ops = { + .uevent = virtio_fs_uevent, +}; + +static int __init virtio_fs_sysfs_init(void) +{ + virtio_fs_kset = kset_create_and_add("virtiofs", &virtio_fs_uevent_ops, + fs_kobj); + if (!virtio_fs_kset) + return -ENOMEM; + return 0; +} + +static void virtio_fs_sysfs_exit(void) +{ + kset_unregister(virtio_fs_kset); + virtio_fs_kset = NULL; +} + static int __init virtio_fs_init(void) { int ret; - ret = register_virtio_driver(&virtio_fs_driver); + ret = virtio_fs_sysfs_init(); if (ret < 0) return ret; + ret = register_virtio_driver(&virtio_fs_driver); + if (ret < 0) + goto sysfs_exit; + ret = register_filesystem(&virtio_fs_type); - if (ret < 0) { - unregister_virtio_driver(&virtio_fs_driver); - return ret; - } + if (ret < 0) + goto unregister_virtio_driver; return 0; + +unregister_virtio_driver: + unregister_virtio_driver(&virtio_fs_driver); +sysfs_exit: + virtio_fs_sysfs_exit(); + return ret; } module_init(virtio_fs_init); @@ -1542,6 +1813,7 @@ static void __exit virtio_fs_exit(void) { unregister_filesystem(&virtio_fs_type); unregister_virtio_driver(&virtio_fs_driver); + virtio_fs_sysfs_exit(); } module_exit(virtio_fs_exit); diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c index 690b9aadceaa..9f568d345c51 100644 --- a/fs/fuse/xattr.c +++ b/fs/fuse/xattr.c @@ -209,7 +209,7 @@ static const struct xattr_handler fuse_xattr_handler = { .set = fuse_xattr_set, }; -const struct xattr_handler *fuse_xattr_handlers[] = { +const struct xattr_handler * const fuse_xattr_handlers[] = { &fuse_xattr_handler, NULL }; |