diff options
Diffstat (limited to 'fs/fuse')
-rw-r--r-- | fs/fuse/Kconfig | 13 | ||||
-rw-r--r-- | fs/fuse/Makefile | 1 | ||||
-rw-r--r-- | fs/fuse/control.c | 30 | ||||
-rw-r--r-- | fs/fuse/dax.c | 44 | ||||
-rw-r--r-- | fs/fuse/dev.c | 480 | ||||
-rw-r--r-- | fs/fuse/dev_uring.c | 1368 | ||||
-rw-r--r-- | fs/fuse/dev_uring_i.h | 211 | ||||
-rw-r--r-- | fs/fuse/dir.c | 198 | ||||
-rw-r--r-- | fs/fuse/file.c | 843 | ||||
-rw-r--r-- | fs/fuse/fuse_dev_i.h | 69 | ||||
-rw-r--r-- | fs/fuse/fuse_i.h | 100 | ||||
-rw-r--r-- | fs/fuse/inode.c | 91 | ||||
-rw-r--r-- | fs/fuse/ioctl.c | 8 | ||||
-rw-r--r-- | fs/fuse/readdir.c | 40 | ||||
-rw-r--r-- | fs/fuse/sysctl.c | 26 | ||||
-rw-r--r-- | fs/fuse/virtio_fs.c | 15 | ||||
-rw-r--r-- | fs/fuse/xattr.c | 7 |
17 files changed, 2625 insertions, 919 deletions
diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig index 8674dbfbe59d..a774166264de 100644 --- a/fs/fuse/Kconfig +++ b/fs/fuse/Kconfig @@ -2,6 +2,7 @@ config FUSE_FS tristate "FUSE (Filesystem in Userspace) support" select FS_POSIX_ACL + select FS_IOMAP help With FUSE it is possible to implement a fully functional filesystem in a userspace program. @@ -63,3 +64,15 @@ config FUSE_PASSTHROUGH to be performed directly on a backing file. If you want to allow passthrough operations, answer Y. + +config FUSE_IO_URING + bool "FUSE communication over io-uring" + default y + depends on FUSE_FS + depends on IO_URING + help + This allows sending FUSE requests over the io-uring interface and + also adds request core affinity. + + If you want to allow fuse server/client communication through io-uring, + answer Y diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile index 2c372180d631..3f0f312a31c1 100644 --- a/fs/fuse/Makefile +++ b/fs/fuse/Makefile @@ -15,5 +15,6 @@ fuse-y += iomode.o fuse-$(CONFIG_FUSE_DAX) += dax.o fuse-$(CONFIG_FUSE_PASSTHROUGH) += passthrough.o fuse-$(CONFIG_SYSCTL) += sysctl.o +fuse-$(CONFIG_FUSE_IO_URING) += dev_uring.o virtiofs-y := virtio_fs.o diff --git a/fs/fuse/control.c b/fs/fuse/control.c index 2a730d88cc3b..bb407705603c 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -11,6 +11,7 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/fs_context.h> +#include <linux/namei.h> #define FUSE_CTL_SUPER_MAGIC 0x65735543 @@ -212,7 +213,6 @@ static struct dentry *fuse_ctl_add_dentry(struct dentry *parent, struct dentry *dentry; struct inode *inode; - BUG_ON(fc->ctl_ndents >= FUSE_CTL_NUM_DENTRIES); dentry = d_alloc_name(parent, name); if (!dentry) return NULL; @@ -236,8 +236,6 @@ static struct dentry *fuse_ctl_add_dentry(struct dentry *parent, inode->i_private = fc; d_add(dentry, inode); - fc->ctl_dentry[fc->ctl_ndents++] = dentry; - return dentry; } @@ -280,27 +278,29 @@ int fuse_ctl_add_conn(struct fuse_conn *fc) return -ENOMEM; } +static void remove_one(struct dentry *dentry) +{ + d_inode(dentry)->i_private = NULL; +} + /* * Remove a connection from the control filesystem (if it exists). * Caller must hold fuse_mutex */ void fuse_ctl_remove_conn(struct fuse_conn *fc) { - int i; + struct dentry *dentry; + char name[32]; if (!fuse_control_sb || fc->no_control) return; - for (i = fc->ctl_ndents - 1; i >= 0; i--) { - struct dentry *dentry = fc->ctl_dentry[i]; - d_inode(dentry)->i_private = NULL; - if (!i) { - /* Get rid of submounts: */ - d_invalidate(dentry); - } - dput(dentry); + sprintf(name, "%u", fc->dev); + dentry = lookup_noperm_positive_unlocked(&QSTR(name), fuse_control_sb->s_root); + if (!IS_ERR(dentry)) { + simple_recursive_removal(dentry, remove_one); + dput(dentry); // paired with lookup_noperm_positive_unlocked() } - drop_nlink(d_inode(fuse_control_sb->s_root)); } static int fuse_ctl_fill_super(struct super_block *sb, struct fs_context *fsc) @@ -346,12 +346,8 @@ static int fuse_ctl_init_fs_context(struct fs_context *fsc) static void fuse_ctl_kill_sb(struct super_block *sb) { - struct fuse_conn *fc; - mutex_lock(&fuse_mutex); fuse_control_sb = NULL; - list_for_each_entry(fc, &fuse_conn_list, entry) - fc->ctl_ndents = 0; mutex_unlock(&fuse_mutex); kill_litter_super(sb); diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c index 9abbc2f2894f..ac6d4c1064cc 100644 --- a/fs/fuse/dax.c +++ b/fs/fuse/dax.c @@ -10,7 +10,6 @@ #include <linux/dax.h> #include <linux/uio.h> #include <linux/pagemap.h> -#include <linux/pfn_t.h> #include <linux/iomap.h> #include <linux/interval_tree.h> @@ -240,11 +239,12 @@ static int fuse_send_removemapping(struct inode *inode, args.opcode = FUSE_REMOVEMAPPING; args.nodeid = fi->nodeid; - args.in_numargs = 2; - args.in_args[0].size = sizeof(*inargp); - args.in_args[0].value = inargp; - args.in_args[1].size = inargp->count * sizeof(*remove_one); - args.in_args[1].value = remove_one; + args.in_numargs = 3; + fuse_set_zero_arg0(&args); + args.in_args[1].size = sizeof(*inargp); + args.in_args[1].value = inargp; + args.in_args[2].size = inargp->count * sizeof(*remove_one); + args.in_args[2].value = remove_one; return fuse_simple_request(fm, &args); } @@ -665,36 +665,12 @@ static void fuse_wait_dax_page(struct inode *inode) filemap_invalidate_lock(inode->i_mapping); } -/* Should be called with mapping->invalidate_lock held exclusively */ -static int __fuse_dax_break_layouts(struct inode *inode, bool *retry, - loff_t start, loff_t end) -{ - struct page *page; - - page = dax_layout_busy_page_range(inode->i_mapping, start, end); - if (!page) - return 0; - - *retry = true; - return ___wait_var_event(&page->_refcount, - atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE, - 0, 0, fuse_wait_dax_page(inode)); -} - -/* dmap_end == 0 leads to unmapping of whole file */ +/* Should be called with mapping->invalidate_lock held exclusively. */ int fuse_dax_break_layouts(struct inode *inode, u64 dmap_start, u64 dmap_end) { - bool retry; - int ret; - - do { - retry = false; - ret = __fuse_dax_break_layouts(inode, &retry, dmap_start, - dmap_end); - } while (ret == 0 && retry); - - return ret; + return dax_break_layout(inode, dmap_start, dmap_end, + fuse_wait_dax_page); } ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) @@ -780,7 +756,7 @@ static vm_fault_t __fuse_dax_fault(struct vm_fault *vmf, unsigned int order, vm_fault_t ret; struct inode *inode = file_inode(vmf->vma->vm_file); struct super_block *sb = inode->i_sb; - pfn_t pfn; + unsigned long pfn; int error = 0; struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_conn_dax *fcd = fc->dax; diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 27ccae63495d..e80cd8f2c049 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -6,7 +6,9 @@ See the file COPYING. */ +#include "dev_uring_i.h" #include "fuse_i.h" +#include "fuse_dev_i.h" #include <linux/init.h> #include <linux/module.h> @@ -21,6 +23,7 @@ #include <linux/swap.h> #include <linux/splice.h> #include <linux/sched.h> +#include <linux/seq_file.h> #define CREATE_TRACE_POINTS #include "fuse_trace.h" @@ -28,21 +31,100 @@ MODULE_ALIAS_MISCDEV(FUSE_MINOR); MODULE_ALIAS("devname:fuse"); -/* Ordinary requests have even IDs, while interrupts IDs are odd */ -#define FUSE_INT_REQ_BIT (1ULL << 0) -#define FUSE_REQ_ID_STEP (1ULL << 1) - static struct kmem_cache *fuse_req_cachep; -static void end_requests(struct list_head *head); +const unsigned long fuse_timeout_timer_freq = + secs_to_jiffies(FUSE_TIMEOUT_TIMER_FREQ); -static struct fuse_dev *fuse_get_dev(struct file *file) +bool fuse_request_expired(struct fuse_conn *fc, struct list_head *list) { - /* - * Lockless access is OK, because file->private data is set - * once during mount and is valid until the file is released. - */ - return READ_ONCE(file->private_data); + struct fuse_req *req; + + req = list_first_entry_or_null(list, struct fuse_req, list); + if (!req) + return false; + return time_is_before_jiffies(req->create_time + fc->timeout.req_timeout); +} + +static bool fuse_fpq_processing_expired(struct fuse_conn *fc, struct list_head *processing) +{ + int i; + + for (i = 0; i < FUSE_PQ_HASH_SIZE; i++) + if (fuse_request_expired(fc, &processing[i])) + return true; + + return false; +} + +/* + * Check if any requests aren't being completed by the time the request timeout + * elapses. To do so, we: + * - check the fiq pending list + * - check the bg queue + * - check the fpq io and processing lists + * + * To make this fast, we only check against the head request on each list since + * these are generally queued in order of creation time (eg newer requests get + * queued to the tail). We might miss a few edge cases (eg requests transitioning + * between lists, re-sent requests at the head of the pending list having a + * later creation time than other requests on that list, etc.) but that is fine + * since if the request never gets fulfilled, it will eventually be caught. + */ +void fuse_check_timeout(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct fuse_conn *fc = container_of(dwork, struct fuse_conn, + timeout.work); + struct fuse_iqueue *fiq = &fc->iq; + struct fuse_dev *fud; + struct fuse_pqueue *fpq; + bool expired = false; + + if (!atomic_read(&fc->num_waiting)) + goto out; + + spin_lock(&fiq->lock); + expired = fuse_request_expired(fc, &fiq->pending); + spin_unlock(&fiq->lock); + if (expired) + goto abort_conn; + + spin_lock(&fc->bg_lock); + expired = fuse_request_expired(fc, &fc->bg_queue); + spin_unlock(&fc->bg_lock); + if (expired) + goto abort_conn; + + spin_lock(&fc->lock); + if (!fc->connected) { + spin_unlock(&fc->lock); + return; + } + list_for_each_entry(fud, &fc->devices, entry) { + fpq = &fud->pq; + spin_lock(&fpq->lock); + if (fuse_request_expired(fc, &fpq->io) || + fuse_fpq_processing_expired(fc, fpq->processing)) { + spin_unlock(&fpq->lock); + spin_unlock(&fc->lock); + goto abort_conn; + } + + spin_unlock(&fpq->lock); + } + spin_unlock(&fc->lock); + + if (fuse_uring_request_expired(fc)) + goto abort_conn; + +out: + queue_delayed_work(system_wq, &fc->timeout.work, + fuse_timeout_timer_freq); + return; + +abort_conn: + fuse_abort_conn(fc); } static void fuse_request_init(struct fuse_mount *fm, struct fuse_req *req) @@ -53,6 +135,7 @@ static void fuse_request_init(struct fuse_mount *fm, struct fuse_req *req) refcount_set(&req->count, 1); __set_bit(FR_PENDING, &req->flags); req->fm = fm; + req->create_time = jiffies; } static struct fuse_req *fuse_request_alloc(struct fuse_mount *fm, gfp_t flags) @@ -89,7 +172,8 @@ void fuse_set_initialized(struct fuse_conn *fc) static bool fuse_block_alloc(struct fuse_conn *fc, bool for_background) { - return !fc->initialized || (for_background && fc->blocked); + return !fc->initialized || (for_background && fc->blocked) || + (fc->io_uring && fc->connected && !fuse_uring_ready(fc)); } static void fuse_drop_waiting(struct fuse_conn *fc) @@ -234,7 +318,7 @@ u64 fuse_get_unique(struct fuse_iqueue *fiq) } EXPORT_SYMBOL_GPL(fuse_get_unique); -static unsigned int fuse_req_hash(u64 unique) +unsigned int fuse_req_hash(u64 unique) { return hash_long(unique & ~FUSE_INT_REQ_BIT, FUSE_PQ_HASH_BITS); } @@ -250,7 +334,8 @@ __releases(fiq->lock) spin_unlock(&fiq->lock); } -static void fuse_dev_queue_forget(struct fuse_iqueue *fiq, struct fuse_forget_link *forget) +void fuse_dev_queue_forget(struct fuse_iqueue *fiq, + struct fuse_forget_link *forget) { spin_lock(&fiq->lock); if (fiq->connected) { @@ -263,7 +348,7 @@ static void fuse_dev_queue_forget(struct fuse_iqueue *fiq, struct fuse_forget_li } } -static void fuse_dev_queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req) +void fuse_dev_queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req) { spin_lock(&fiq->lock); if (list_empty(&req->intr_entry)) { @@ -418,6 +503,24 @@ static int queue_interrupt(struct fuse_req *req) return 0; } +bool fuse_remove_pending_req(struct fuse_req *req, spinlock_t *lock) +{ + spin_lock(lock); + if (test_bit(FR_PENDING, &req->flags)) { + /* + * FR_PENDING does not get cleared as the request will end + * up in destruction anyway. + */ + list_del(&req->list); + spin_unlock(lock); + __fuse_put_request(req); + req->out.h.error = -EINTR; + return true; + } + spin_unlock(lock); + return false; +} + static void request_wait_answer(struct fuse_req *req) { struct fuse_conn *fc = req->fm->fc; @@ -439,22 +542,20 @@ static void request_wait_answer(struct fuse_req *req) } if (!test_bit(FR_FORCE, &req->flags)) { + bool removed; + /* Only fatal signals may interrupt this */ err = wait_event_killable(req->waitq, test_bit(FR_FINISHED, &req->flags)); if (!err) return; - spin_lock(&fiq->lock); - /* Request is not yet in userspace, bail out */ - if (test_bit(FR_PENDING, &req->flags)) { - list_del(&req->list); - spin_unlock(&fiq->lock); - __fuse_put_request(req); - req->out.h.error = -EINTR; + if (test_bit(FR_URING, &req->flags)) + removed = fuse_uring_remove_pending_req(req); + else + removed = fuse_remove_pending_req(req, &fiq->lock); + if (removed) return; - } - spin_unlock(&fiq->lock); } /* @@ -580,7 +681,25 @@ ssize_t __fuse_simple_request(struct mnt_idmap *idmap, return ret; } -static bool fuse_request_queue_background(struct fuse_req *req) +#ifdef CONFIG_FUSE_IO_URING +static bool fuse_request_queue_background_uring(struct fuse_conn *fc, + struct fuse_req *req) +{ + struct fuse_iqueue *fiq = &fc->iq; + + req->in.h.unique = fuse_get_unique(fiq); + req->in.h.len = sizeof(struct fuse_in_header) + + fuse_len_args(req->args->in_numargs, + (struct fuse_arg *) req->args->in_args); + + return fuse_uring_queue_bq_req(req); +} +#endif + +/* + * @return true if queued + */ +static int fuse_request_queue_background(struct fuse_req *req) { struct fuse_mount *fm = req->fm; struct fuse_conn *fc = fm->fc; @@ -592,6 +711,12 @@ static bool fuse_request_queue_background(struct fuse_req *req) atomic_inc(&fc->num_waiting); } __set_bit(FR_ISREPLY, &req->flags); + +#ifdef CONFIG_FUSE_IO_URING + if (fuse_uring_ready(fc)) + return fuse_request_queue_background_uring(fc, req); +#endif + spin_lock(&fc->bg_lock); if (likely(fc->connected)) { fc->num_background++; @@ -692,22 +817,8 @@ static int unlock_request(struct fuse_req *req) return err; } -struct fuse_copy_state { - int write; - struct fuse_req *req; - struct iov_iter *iter; - struct pipe_buffer *pipebufs; - struct pipe_buffer *currbuf; - struct pipe_inode_info *pipe; - unsigned long nr_segs; - struct page *pg; - unsigned len; - unsigned offset; - unsigned move_pages:1; -}; - -static void fuse_copy_init(struct fuse_copy_state *cs, int write, - struct iov_iter *iter) +void fuse_copy_init(struct fuse_copy_state *cs, bool write, + struct iov_iter *iter) { memset(cs, 0, sizeof(*cs)); cs->write = write; @@ -814,6 +925,9 @@ static int fuse_copy_do(struct fuse_copy_state *cs, void **val, unsigned *size) *size -= ncpy; cs->len -= ncpy; cs->offset += ncpy; + if (cs->is_uring) + cs->ring.copied_sz += ncpy; + return ncpy; } @@ -836,10 +950,16 @@ static int fuse_check_folio(struct folio *folio) return 0; } -static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) +/* + * Attempt to steal a page from the splice() pipe and move it into the + * pagecache. If successful, the pointer in @pagep will be updated. The + * folio that was originally in @pagep will lose a reference and the new + * folio returned in @pagep will carry a reference. + */ +static int fuse_try_move_folio(struct fuse_copy_state *cs, struct folio **foliop) { int err; - struct folio *oldfolio = page_folio(*pagep); + struct folio *oldfolio = *foliop; struct folio *newfolio; struct pipe_buffer *buf = cs->pipebufs; @@ -860,7 +980,7 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) cs->pipebufs++; cs->nr_segs--; - if (cs->len != PAGE_SIZE) + if (cs->len != folio_size(oldfolio)) goto out_fallback; if (!pipe_buf_try_steal(cs->pipe, buf)) @@ -906,7 +1026,7 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) if (test_bit(FR_ABORTED, &cs->req->flags)) err = -ENOENT; else - *pagep = &newfolio->page; + *foliop = newfolio; spin_unlock(&cs->req->waitq.lock); if (err) { @@ -939,8 +1059,8 @@ out_fallback: goto out_put_old; } -static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page, - unsigned offset, unsigned count) +static int fuse_ref_folio(struct fuse_copy_state *cs, struct folio *folio, + unsigned offset, unsigned count) { struct pipe_buffer *buf; int err; @@ -948,17 +1068,17 @@ static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page, if (cs->nr_segs >= cs->pipe->max_usage) return -EIO; - get_page(page); + folio_get(folio); err = unlock_request(cs->req); if (err) { - put_page(page); + folio_put(folio); return err; } fuse_copy_finish(cs); buf = cs->pipebufs; - buf->page = page; + buf->page = &folio->page; buf->offset = offset; buf->len = count; @@ -970,20 +1090,24 @@ static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page, } /* - * Copy a page in the request to/from the userspace buffer. Must be + * Copy a folio in the request to/from the userspace buffer. Must be * done atomically */ -static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep, - unsigned offset, unsigned count, int zeroing) +static int fuse_copy_folio(struct fuse_copy_state *cs, struct folio **foliop, + unsigned offset, unsigned count, int zeroing) { int err; - struct page *page = *pagep; + struct folio *folio = *foliop; + size_t size; - if (page && zeroing && count < PAGE_SIZE) - clear_highpage(page); + if (folio) { + size = folio_size(folio); + if (zeroing && count < size) + folio_zero_range(folio, 0, size); + } while (count) { - if (cs->write && cs->pipebufs && page) { + if (cs->write && cs->pipebufs && folio) { /* * Can't control lifetime of pipe buffers, so always * copy user pages. @@ -993,12 +1117,12 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep, if (err) return err; } else { - return fuse_ref_page(cs, page, offset, count); + return fuse_ref_folio(cs, folio, offset, count); } } else if (!cs->len) { - if (cs->move_pages && page && - offset == 0 && count == PAGE_SIZE) { - err = fuse_try_move_page(cs, pagep); + if (cs->move_folios && folio && + offset == 0 && count == size) { + err = fuse_try_move_folio(cs, foliop); if (err <= 0) return err; } else { @@ -1007,22 +1131,30 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep, return err; } } - if (page) { - void *mapaddr = kmap_local_page(page); - void *buf = mapaddr + offset; - offset += fuse_copy_do(cs, &buf, &count); + if (folio) { + void *mapaddr = kmap_local_folio(folio, offset); + void *buf = mapaddr; + unsigned int copy = count; + unsigned int bytes_copied; + + if (folio_test_highmem(folio) && count > PAGE_SIZE - offset_in_page(offset)) + copy = PAGE_SIZE - offset_in_page(offset); + + bytes_copied = fuse_copy_do(cs, &buf, ©); kunmap_local(mapaddr); + offset += bytes_copied; + count -= bytes_copied; } else offset += fuse_copy_do(cs, NULL, &count); } - if (page && !cs->write) - flush_dcache_page(page); + if (folio && !cs->write) + flush_dcache_folio(folio); return 0; } -/* Copy pages in the request to/from userspace buffer */ -static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes, - int zeroing) +/* Copy folios in the request to/from userspace buffer */ +static int fuse_copy_folios(struct fuse_copy_state *cs, unsigned nbytes, + int zeroing) { unsigned i; struct fuse_req *req = cs->req; @@ -1032,23 +1164,12 @@ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes, int err; unsigned int offset = ap->descs[i].offset; unsigned int count = min(nbytes, ap->descs[i].length); - struct page *orig, *pagep; - orig = pagep = &ap->folios[i]->page; - - err = fuse_copy_page(cs, &pagep, offset, count, zeroing); + err = fuse_copy_folio(cs, &ap->folios[i], offset, count, zeroing); if (err) return err; nbytes -= count; - - /* - * fuse_copy_page may have moved a page from a pipe instead of - * copying into our given page, so update the folios if it was - * replaced. - */ - if (pagep != orig) - ap->folios[i] = page_folio(pagep); } return 0; } @@ -1068,9 +1189,9 @@ static int fuse_copy_one(struct fuse_copy_state *cs, void *val, unsigned size) } /* Copy request arguments to/from userspace buffer */ -static int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs, - unsigned argpages, struct fuse_arg *args, - int zeroing) +int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs, + unsigned argpages, struct fuse_arg *args, + int zeroing) { int err = 0; unsigned i; @@ -1078,7 +1199,7 @@ static int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs, for (i = 0; !err && i < numargs; i++) { struct fuse_arg *arg = &args[i]; if (i == numargs - 1 && argpages) - err = fuse_copy_pages(cs, arg->size, zeroing); + err = fuse_copy_folios(cs, arg->size, zeroing); else err = fuse_copy_one(cs, arg->value, arg->size); } @@ -1419,7 +1540,7 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, struct iov_iter *to) if (!user_backed_iter(to)) return -EINVAL; - fuse_copy_init(&cs, 1, to); + fuse_copy_init(&cs, true, to); return fuse_dev_do_read(fud, file, &cs, iov_iter_count(to)); } @@ -1442,14 +1563,14 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos, if (!bufs) return -ENOMEM; - fuse_copy_init(&cs, 1, NULL); + fuse_copy_init(&cs, true, NULL); cs.pipebufs = bufs; cs.pipe = pipe; ret = fuse_dev_do_read(fud, in, &cs, len); if (ret < 0) goto out; - if (pipe_occupancy(pipe->head, pipe->tail) + cs.nr_segs > pipe->max_usage) { + if (pipe_buf_usage(pipe) + cs.nr_segs > pipe->max_usage) { ret = -EIO; goto out; } @@ -1525,14 +1646,10 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size, struct fuse_copy_state *cs) { struct fuse_notify_inval_entry_out outarg; - int err = -ENOMEM; - char *buf; + int err; + char *buf = NULL; struct qstr name; - buf = kzalloc(FUSE_NAME_MAX + 1, GFP_KERNEL); - if (!buf) - goto err; - err = -EINVAL; if (size < sizeof(outarg)) goto err; @@ -1542,13 +1659,18 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size, goto err; err = -ENAMETOOLONG; - if (outarg.namelen > FUSE_NAME_MAX) + if (outarg.namelen > fc->name_max) goto err; err = -EINVAL; if (size != sizeof(outarg) + outarg.namelen + 1) goto err; + err = -ENOMEM; + buf = kzalloc(outarg.namelen + 1, GFP_KERNEL); + if (!buf) + goto err; + name.name = buf; name.len = outarg.namelen; err = fuse_copy_one(cs, buf, outarg.namelen + 1); @@ -1573,14 +1695,10 @@ static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size, struct fuse_copy_state *cs) { struct fuse_notify_delete_out outarg; - int err = -ENOMEM; - char *buf; + int err; + char *buf = NULL; struct qstr name; - buf = kzalloc(FUSE_NAME_MAX + 1, GFP_KERNEL); - if (!buf) - goto err; - err = -EINVAL; if (size < sizeof(outarg)) goto err; @@ -1590,13 +1708,18 @@ static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size, goto err; err = -ENAMETOOLONG; - if (outarg.namelen > FUSE_NAME_MAX) + if (outarg.namelen > fc->name_max) goto err; err = -EINVAL; if (size != sizeof(outarg) + outarg.namelen + 1) goto err; + err = -ENOMEM; + buf = kzalloc(outarg.namelen + 1, GFP_KERNEL); + if (!buf) + goto err; + name.name = buf; name.len = outarg.namelen; err = fuse_copy_one(cs, buf, outarg.namelen + 1); @@ -1665,20 +1788,23 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size, num = outarg.size; while (num) { struct folio *folio; - struct page *page; - unsigned int this_num; + unsigned int folio_offset; + unsigned int nr_bytes; + unsigned int nr_pages; folio = filemap_grab_folio(mapping, index); err = PTR_ERR(folio); if (IS_ERR(folio)) goto out_iput; - page = &folio->page; - this_num = min_t(unsigned, num, folio_size(folio) - offset); - err = fuse_copy_page(cs, &page, offset, this_num, 0); + folio_offset = ((index - folio->index) << PAGE_SHIFT) + offset; + nr_bytes = min_t(unsigned, num, folio_size(folio) - folio_offset); + nr_pages = (offset + nr_bytes + PAGE_SIZE - 1) >> PAGE_SHIFT; + + err = fuse_copy_folio(cs, &folio, folio_offset, nr_bytes, 0); if (!folio_test_uptodate(folio) && !err && offset == 0 && - (this_num == folio_size(folio) || file_size == end)) { - folio_zero_segment(folio, this_num, folio_size(folio)); + (nr_bytes == folio_size(folio) || file_size == end)) { + folio_zero_segment(folio, nr_bytes, folio_size(folio)); folio_mark_uptodate(folio); } folio_unlock(folio); @@ -1687,9 +1813,9 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size, if (err) goto out_iput; - num -= this_num; + num -= nr_bytes; offset = 0; - index++; + index += nr_pages; } err = 0; @@ -1728,7 +1854,7 @@ static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode, unsigned int num; unsigned int offset; size_t total_len = 0; - unsigned int num_pages, cur_pages = 0; + unsigned int num_pages; struct fuse_conn *fc = fm->fc; struct fuse_retrieve_args *ra; size_t args_size = sizeof(*ra); @@ -1746,6 +1872,7 @@ static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode, num_pages = (num + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; num_pages = min(num_pages, fc->max_pages); + num = min(num, num_pages << PAGE_SHIFT); args_size += num_pages * (sizeof(ap->folios[0]) + sizeof(ap->descs[0])); @@ -1760,37 +1887,42 @@ static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode, args = &ap->args; args->nodeid = outarg->nodeid; args->opcode = FUSE_NOTIFY_REPLY; - args->in_numargs = 2; + args->in_numargs = 3; args->in_pages = true; args->end = fuse_retrieve_end; index = outarg->offset >> PAGE_SHIFT; - while (num && cur_pages < num_pages) { + while (num) { struct folio *folio; - unsigned int this_num; + unsigned int folio_offset; + unsigned int nr_bytes; + unsigned int nr_pages; folio = filemap_get_folio(mapping, index); if (IS_ERR(folio)) break; - this_num = min_t(unsigned, num, PAGE_SIZE - offset); + folio_offset = ((index - folio->index) << PAGE_SHIFT) + offset; + nr_bytes = min(folio_size(folio) - folio_offset, num); + nr_pages = (offset + nr_bytes + PAGE_SIZE - 1) >> PAGE_SHIFT; + ap->folios[ap->num_folios] = folio; - ap->descs[ap->num_folios].offset = offset; - ap->descs[ap->num_folios].length = this_num; + ap->descs[ap->num_folios].offset = folio_offset; + ap->descs[ap->num_folios].length = nr_bytes; ap->num_folios++; - cur_pages++; offset = 0; - num -= this_num; - total_len += this_num; - index++; + num -= nr_bytes; + total_len += nr_bytes; + index += nr_pages; } ra->inarg.offset = outarg->offset; ra->inarg.size = total_len; - args->in_args[0].size = sizeof(ra->inarg); - args->in_args[0].value = &ra->inarg; - args->in_args[1].size = total_len; + fuse_set_zero_arg0(args); + args->in_args[1].size = sizeof(ra->inarg); + args->in_args[1].value = &ra->inarg; + args->in_args[2].size = total_len; err = fuse_simple_notify_reply(fm, args, outarg->notify_unique); if (err) @@ -1885,7 +2017,7 @@ static void fuse_resend(struct fuse_conn *fc) spin_unlock(&fiq->lock); list_for_each_entry(req, &to_queue, list) clear_bit(FR_PENDING, &req->flags); - end_requests(&to_queue); + fuse_dev_end_requests(&to_queue); return; } /* iq and pq requests are both oldest to newest */ @@ -1899,11 +2031,24 @@ static int fuse_notify_resend(struct fuse_conn *fc) return 0; } +/* + * Increments the fuse connection epoch. This will result of dentries from + * previous epochs to be invalidated. + * + * XXX optimization: add call to shrink_dcache_sb()? + */ +static int fuse_notify_inc_epoch(struct fuse_conn *fc) +{ + atomic_inc(&fc->epoch); + + return 0; +} + static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, unsigned int size, struct fuse_copy_state *cs) { - /* Don't try to move pages (yet) */ - cs->move_pages = 0; + /* Don't try to move folios (yet) */ + cs->move_folios = false; switch (code) { case FUSE_NOTIFY_POLL: @@ -1927,6 +2072,9 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, case FUSE_NOTIFY_RESEND: return fuse_notify_resend(fc); + case FUSE_NOTIFY_INC_EPOCH: + return fuse_notify_inc_epoch(fc); + default: fuse_copy_finish(cs); return -EINVAL; @@ -1934,7 +2082,7 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, } /* Look up request on processing list by unique ID */ -static struct fuse_req *request_find(struct fuse_pqueue *fpq, u64 unique) +struct fuse_req *fuse_request_find(struct fuse_pqueue *fpq, u64 unique) { unsigned int hash = fuse_req_hash(unique); struct fuse_req *req; @@ -1946,10 +2094,17 @@ static struct fuse_req *request_find(struct fuse_pqueue *fpq, u64 unique) return NULL; } -static int copy_out_args(struct fuse_copy_state *cs, struct fuse_args *args, - unsigned nbytes) +int fuse_copy_out_args(struct fuse_copy_state *cs, struct fuse_args *args, + unsigned nbytes) { - unsigned reqsize = sizeof(struct fuse_out_header); + + unsigned int reqsize = 0; + + /* + * Uring has all headers separated from args - args is payload only + */ + if (!cs->is_uring) + reqsize = sizeof(struct fuse_out_header); reqsize += fuse_len_args(args->out_numargs, args->out_args); @@ -2011,7 +2166,7 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, spin_lock(&fpq->lock); req = NULL; if (fpq->connected) - req = request_find(fpq, oh.unique & ~FUSE_INT_REQ_BIT); + req = fuse_request_find(fpq, oh.unique & ~FUSE_INT_REQ_BIT); err = -ENOENT; if (!req) { @@ -2044,12 +2199,12 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, spin_unlock(&fpq->lock); cs->req = req; if (!req->args->page_replace) - cs->move_pages = 0; + cs->move_folios = false; if (oh.error) err = nbytes != sizeof(oh) ? -EINVAL : 0; else - err = copy_out_args(cs, req->args, nbytes); + err = fuse_copy_out_args(cs, req->args, nbytes); fuse_copy_finish(cs); spin_lock(&fpq->lock); @@ -2082,7 +2237,7 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, struct iov_iter *from) if (!user_backed_iter(from)) return -EINVAL; - fuse_copy_init(&cs, 0, from); + fuse_copy_init(&cs, false, from); return fuse_dev_do_write(fud, &cs, iov_iter_count(from)); } @@ -2091,7 +2246,7 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags) { - unsigned int head, tail, mask, count; + unsigned int head, tail, count; unsigned nbuf; unsigned idx; struct pipe_buffer *bufs; @@ -2108,8 +2263,7 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, head = pipe->head; tail = pipe->tail; - mask = pipe->ring_size - 1; - count = head - tail; + count = pipe_occupancy(head, tail); bufs = kvmalloc_array(count, sizeof(struct pipe_buffer), GFP_KERNEL); if (!bufs) { @@ -2119,8 +2273,8 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, nbuf = 0; rem = 0; - for (idx = tail; idx != head && rem < len; idx++) - rem += pipe->bufs[idx & mask].len; + for (idx = tail; !pipe_empty(head, idx) && rem < len; idx++) + rem += pipe_buf(pipe, idx)->len; ret = -EINVAL; if (rem < len) @@ -2131,10 +2285,10 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, struct pipe_buffer *ibuf; struct pipe_buffer *obuf; - if (WARN_ON(nbuf >= count || tail == head)) + if (WARN_ON(nbuf >= count || pipe_empty(head, tail))) goto out_free; - ibuf = &pipe->bufs[tail & mask]; + ibuf = pipe_buf(pipe, tail); obuf = &bufs[nbuf]; if (rem >= ibuf->len) { @@ -2157,13 +2311,13 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, } pipe_unlock(pipe); - fuse_copy_init(&cs, 0, NULL); + fuse_copy_init(&cs, false, NULL); cs.pipebufs = bufs; cs.nr_segs = nbuf; cs.pipe = pipe; if (flags & SPLICE_F_MOVE) - cs.move_pages = 1; + cs.move_folios = true; ret = fuse_dev_do_write(fud, &cs, len); @@ -2204,7 +2358,7 @@ static __poll_t fuse_dev_poll(struct file *file, poll_table *wait) } /* Abort all requests on the given list (pending or processing) */ -static void end_requests(struct list_head *head) +void fuse_dev_end_requests(struct list_head *head) { while (!list_empty(head)) { struct fuse_req *req; @@ -2260,6 +2414,9 @@ void fuse_abort_conn(struct fuse_conn *fc) LIST_HEAD(to_end); unsigned int i; + if (fc->timeout.req_timeout) + cancel_delayed_work(&fc->timeout.work); + /* Background queuing checks fc->connected under bg_lock */ spin_lock(&fc->bg_lock); fc->connected = 0; @@ -2307,7 +2464,13 @@ void fuse_abort_conn(struct fuse_conn *fc) wake_up_all(&fc->blocked_waitq); spin_unlock(&fc->lock); - end_requests(&to_end); + fuse_dev_end_requests(&to_end); + + /* + * fc->lock must not be taken to avoid conflicts with io-uring + * locks + */ + fuse_uring_abort(fc); } else { spin_unlock(&fc->lock); } @@ -2319,6 +2482,8 @@ void fuse_wait_aborted(struct fuse_conn *fc) /* matches implicit memory barrier in fuse_drop_waiting() */ smp_mb(); wait_event(fc->blocked_waitq, atomic_read(&fc->num_waiting) == 0); + + fuse_uring_wait_stopped_queues(fc); } int fuse_dev_release(struct inode *inode, struct file *file) @@ -2337,7 +2502,7 @@ int fuse_dev_release(struct inode *inode, struct file *file) list_splice_init(&fpq->processing[i], &to_end); spin_unlock(&fpq->lock); - end_requests(&to_end); + fuse_dev_end_requests(&to_end); /* Are we the last open device? */ if (atomic_dec_and_test(&fc->dev_count)) { @@ -2463,6 +2628,17 @@ static long fuse_dev_ioctl(struct file *file, unsigned int cmd, } } +#ifdef CONFIG_PROC_FS +static void fuse_dev_show_fdinfo(struct seq_file *seq, struct file *file) +{ + struct fuse_dev *fud = fuse_get_dev(file); + if (!fud) + return; + + seq_printf(seq, "fuse_connection:\t%u\n", fud->fc->dev); +} +#endif + const struct file_operations fuse_dev_operations = { .owner = THIS_MODULE, .open = fuse_dev_open, @@ -2475,6 +2651,12 @@ const struct file_operations fuse_dev_operations = { .fasync = fuse_dev_fasync, .unlocked_ioctl = fuse_dev_ioctl, .compat_ioctl = compat_ptr_ioctl, +#ifdef CONFIG_FUSE_IO_URING + .uring_cmd = fuse_uring_cmd, +#endif +#ifdef CONFIG_PROC_FS + .show_fdinfo = fuse_dev_show_fdinfo, +#endif }; EXPORT_SYMBOL_GPL(fuse_dev_operations); diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c new file mode 100644 index 000000000000..249b210becb1 --- /dev/null +++ b/fs/fuse/dev_uring.c @@ -0,0 +1,1368 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * FUSE: Filesystem in Userspace + * Copyright (c) 2023-2024 DataDirect Networks. + */ + +#include "fuse_i.h" +#include "dev_uring_i.h" +#include "fuse_dev_i.h" + +#include <linux/fs.h> +#include <linux/io_uring/cmd.h> + +static bool __read_mostly enable_uring; +module_param(enable_uring, bool, 0644); +MODULE_PARM_DESC(enable_uring, + "Enable userspace communication through io-uring"); + +#define FUSE_URING_IOV_SEGS 2 /* header and payload */ + + +bool fuse_uring_enabled(void) +{ + return enable_uring; +} + +struct fuse_uring_pdu { + struct fuse_ring_ent *ent; +}; + +static const struct fuse_iqueue_ops fuse_io_uring_ops; + +static void uring_cmd_set_ring_ent(struct io_uring_cmd *cmd, + struct fuse_ring_ent *ring_ent) +{ + struct fuse_uring_pdu *pdu = + io_uring_cmd_to_pdu(cmd, struct fuse_uring_pdu); + + pdu->ent = ring_ent; +} + +static struct fuse_ring_ent *uring_cmd_to_ring_ent(struct io_uring_cmd *cmd) +{ + struct fuse_uring_pdu *pdu = + io_uring_cmd_to_pdu(cmd, struct fuse_uring_pdu); + + return pdu->ent; +} + +static void fuse_uring_flush_bg(struct fuse_ring_queue *queue) +{ + struct fuse_ring *ring = queue->ring; + struct fuse_conn *fc = ring->fc; + + lockdep_assert_held(&queue->lock); + lockdep_assert_held(&fc->bg_lock); + + /* + * Allow one bg request per queue, ignoring global fc limits. + * This prevents a single queue from consuming all resources and + * eliminates the need for remote queue wake-ups when global + * limits are met but this queue has no more waiting requests. + */ + while ((fc->active_background < fc->max_background || + !queue->active_background) && + (!list_empty(&queue->fuse_req_bg_queue))) { + struct fuse_req *req; + + req = list_first_entry(&queue->fuse_req_bg_queue, + struct fuse_req, list); + fc->active_background++; + queue->active_background++; + + list_move_tail(&req->list, &queue->fuse_req_queue); + } +} + +static void fuse_uring_req_end(struct fuse_ring_ent *ent, struct fuse_req *req, + int error) +{ + struct fuse_ring_queue *queue = ent->queue; + struct fuse_ring *ring = queue->ring; + struct fuse_conn *fc = ring->fc; + + lockdep_assert_not_held(&queue->lock); + spin_lock(&queue->lock); + ent->fuse_req = NULL; + if (test_bit(FR_BACKGROUND, &req->flags)) { + queue->active_background--; + spin_lock(&fc->bg_lock); + fuse_uring_flush_bg(queue); + spin_unlock(&fc->bg_lock); + } + + spin_unlock(&queue->lock); + + if (error) + req->out.h.error = error; + + clear_bit(FR_SENT, &req->flags); + fuse_request_end(req); +} + +/* Abort all list queued request on the given ring queue */ +static void fuse_uring_abort_end_queue_requests(struct fuse_ring_queue *queue) +{ + struct fuse_req *req; + LIST_HEAD(req_list); + + spin_lock(&queue->lock); + list_for_each_entry(req, &queue->fuse_req_queue, list) + clear_bit(FR_PENDING, &req->flags); + list_splice_init(&queue->fuse_req_queue, &req_list); + spin_unlock(&queue->lock); + + /* must not hold queue lock to avoid order issues with fi->lock */ + fuse_dev_end_requests(&req_list); +} + +void fuse_uring_abort_end_requests(struct fuse_ring *ring) +{ + int qid; + struct fuse_ring_queue *queue; + struct fuse_conn *fc = ring->fc; + + for (qid = 0; qid < ring->nr_queues; qid++) { + queue = READ_ONCE(ring->queues[qid]); + if (!queue) + continue; + + queue->stopped = true; + + WARN_ON_ONCE(ring->fc->max_background != UINT_MAX); + spin_lock(&queue->lock); + spin_lock(&fc->bg_lock); + fuse_uring_flush_bg(queue); + spin_unlock(&fc->bg_lock); + spin_unlock(&queue->lock); + fuse_uring_abort_end_queue_requests(queue); + } +} + +static bool ent_list_request_expired(struct fuse_conn *fc, struct list_head *list) +{ + struct fuse_ring_ent *ent; + struct fuse_req *req; + + ent = list_first_entry_or_null(list, struct fuse_ring_ent, list); + if (!ent) + return false; + + req = ent->fuse_req; + + return time_is_before_jiffies(req->create_time + + fc->timeout.req_timeout); +} + +bool fuse_uring_request_expired(struct fuse_conn *fc) +{ + struct fuse_ring *ring = fc->ring; + struct fuse_ring_queue *queue; + int qid; + + if (!ring) + return false; + + for (qid = 0; qid < ring->nr_queues; qid++) { + queue = READ_ONCE(ring->queues[qid]); + if (!queue) + continue; + + spin_lock(&queue->lock); + if (fuse_request_expired(fc, &queue->fuse_req_queue) || + fuse_request_expired(fc, &queue->fuse_req_bg_queue) || + ent_list_request_expired(fc, &queue->ent_w_req_queue) || + ent_list_request_expired(fc, &queue->ent_in_userspace)) { + spin_unlock(&queue->lock); + return true; + } + spin_unlock(&queue->lock); + } + + return false; +} + +void fuse_uring_destruct(struct fuse_conn *fc) +{ + struct fuse_ring *ring = fc->ring; + int qid; + + if (!ring) + return; + + for (qid = 0; qid < ring->nr_queues; qid++) { + struct fuse_ring_queue *queue = ring->queues[qid]; + struct fuse_ring_ent *ent, *next; + + if (!queue) + continue; + + WARN_ON(!list_empty(&queue->ent_avail_queue)); + WARN_ON(!list_empty(&queue->ent_w_req_queue)); + WARN_ON(!list_empty(&queue->ent_commit_queue)); + WARN_ON(!list_empty(&queue->ent_in_userspace)); + + list_for_each_entry_safe(ent, next, &queue->ent_released, + list) { + list_del_init(&ent->list); + kfree(ent); + } + + kfree(queue->fpq.processing); + kfree(queue); + ring->queues[qid] = NULL; + } + + kfree(ring->queues); + kfree(ring); + fc->ring = NULL; +} + +/* + * Basic ring setup for this connection based on the provided configuration + */ +static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc) +{ + struct fuse_ring *ring; + size_t nr_queues = num_possible_cpus(); + struct fuse_ring *res = NULL; + size_t max_payload_size; + + ring = kzalloc(sizeof(*fc->ring), GFP_KERNEL_ACCOUNT); + if (!ring) + return NULL; + + ring->queues = kcalloc(nr_queues, sizeof(struct fuse_ring_queue *), + GFP_KERNEL_ACCOUNT); + if (!ring->queues) + goto out_err; + + max_payload_size = max(FUSE_MIN_READ_BUFFER, fc->max_write); + max_payload_size = max(max_payload_size, fc->max_pages * PAGE_SIZE); + + spin_lock(&fc->lock); + if (fc->ring) { + /* race, another thread created the ring in the meantime */ + spin_unlock(&fc->lock); + res = fc->ring; + goto out_err; + } + + init_waitqueue_head(&ring->stop_waitq); + + ring->nr_queues = nr_queues; + ring->fc = fc; + ring->max_payload_sz = max_payload_size; + smp_store_release(&fc->ring, ring); + + spin_unlock(&fc->lock); + return ring; + +out_err: + kfree(ring->queues); + kfree(ring); + return res; +} + +static struct fuse_ring_queue *fuse_uring_create_queue(struct fuse_ring *ring, + int qid) +{ + struct fuse_conn *fc = ring->fc; + struct fuse_ring_queue *queue; + struct list_head *pq; + + queue = kzalloc(sizeof(*queue), GFP_KERNEL_ACCOUNT); + if (!queue) + return NULL; + pq = kcalloc(FUSE_PQ_HASH_SIZE, sizeof(struct list_head), GFP_KERNEL); + if (!pq) { + kfree(queue); + return NULL; + } + + queue->qid = qid; + queue->ring = ring; + spin_lock_init(&queue->lock); + + INIT_LIST_HEAD(&queue->ent_avail_queue); + INIT_LIST_HEAD(&queue->ent_commit_queue); + INIT_LIST_HEAD(&queue->ent_w_req_queue); + INIT_LIST_HEAD(&queue->ent_in_userspace); + INIT_LIST_HEAD(&queue->fuse_req_queue); + INIT_LIST_HEAD(&queue->fuse_req_bg_queue); + INIT_LIST_HEAD(&queue->ent_released); + + queue->fpq.processing = pq; + fuse_pqueue_init(&queue->fpq); + + spin_lock(&fc->lock); + if (ring->queues[qid]) { + spin_unlock(&fc->lock); + kfree(queue->fpq.processing); + kfree(queue); + return ring->queues[qid]; + } + + /* + * write_once and lock as the caller mostly doesn't take the lock at all + */ + WRITE_ONCE(ring->queues[qid], queue); + spin_unlock(&fc->lock); + + return queue; +} + +static void fuse_uring_stop_fuse_req_end(struct fuse_req *req) +{ + clear_bit(FR_SENT, &req->flags); + req->out.h.error = -ECONNABORTED; + fuse_request_end(req); +} + +/* + * Release a request/entry on connection tear down + */ +static void fuse_uring_entry_teardown(struct fuse_ring_ent *ent) +{ + struct fuse_req *req; + struct io_uring_cmd *cmd; + + struct fuse_ring_queue *queue = ent->queue; + + spin_lock(&queue->lock); + cmd = ent->cmd; + ent->cmd = NULL; + req = ent->fuse_req; + ent->fuse_req = NULL; + if (req) { + /* remove entry from queue->fpq->processing */ + list_del_init(&req->list); + } + + /* + * The entry must not be freed immediately, due to access of direct + * pointer access of entries through IO_URING_F_CANCEL - there is a risk + * of race between daemon termination (which triggers IO_URING_F_CANCEL + * and accesses entries without checking the list state first + */ + list_move(&ent->list, &queue->ent_released); + ent->state = FRRS_RELEASED; + spin_unlock(&queue->lock); + + if (cmd) + io_uring_cmd_done(cmd, -ENOTCONN, 0, IO_URING_F_UNLOCKED); + + if (req) + fuse_uring_stop_fuse_req_end(req); +} + +static void fuse_uring_stop_list_entries(struct list_head *head, + struct fuse_ring_queue *queue, + enum fuse_ring_req_state exp_state) +{ + struct fuse_ring *ring = queue->ring; + struct fuse_ring_ent *ent, *next; + ssize_t queue_refs = SSIZE_MAX; + LIST_HEAD(to_teardown); + + spin_lock(&queue->lock); + list_for_each_entry_safe(ent, next, head, list) { + if (ent->state != exp_state) { + pr_warn("entry teardown qid=%d state=%d expected=%d", + queue->qid, ent->state, exp_state); + continue; + } + + ent->state = FRRS_TEARDOWN; + list_move(&ent->list, &to_teardown); + } + spin_unlock(&queue->lock); + + /* no queue lock to avoid lock order issues */ + list_for_each_entry_safe(ent, next, &to_teardown, list) { + fuse_uring_entry_teardown(ent); + queue_refs = atomic_dec_return(&ring->queue_refs); + WARN_ON_ONCE(queue_refs < 0); + } +} + +static void fuse_uring_teardown_entries(struct fuse_ring_queue *queue) +{ + fuse_uring_stop_list_entries(&queue->ent_in_userspace, queue, + FRRS_USERSPACE); + fuse_uring_stop_list_entries(&queue->ent_avail_queue, queue, + FRRS_AVAILABLE); +} + +/* + * Log state debug info + */ +static void fuse_uring_log_ent_state(struct fuse_ring *ring) +{ + int qid; + struct fuse_ring_ent *ent; + + for (qid = 0; qid < ring->nr_queues; qid++) { + struct fuse_ring_queue *queue = ring->queues[qid]; + + if (!queue) + continue; + + spin_lock(&queue->lock); + /* + * Log entries from the intermediate queue, the other queues + * should be empty + */ + list_for_each_entry(ent, &queue->ent_w_req_queue, list) { + pr_info(" ent-req-queue ring=%p qid=%d ent=%p state=%d\n", + ring, qid, ent, ent->state); + } + list_for_each_entry(ent, &queue->ent_commit_queue, list) { + pr_info(" ent-commit-queue ring=%p qid=%d ent=%p state=%d\n", + ring, qid, ent, ent->state); + } + spin_unlock(&queue->lock); + } + ring->stop_debug_log = 1; +} + +static void fuse_uring_async_stop_queues(struct work_struct *work) +{ + int qid; + struct fuse_ring *ring = + container_of(work, struct fuse_ring, async_teardown_work.work); + + /* XXX code dup */ + for (qid = 0; qid < ring->nr_queues; qid++) { + struct fuse_ring_queue *queue = READ_ONCE(ring->queues[qid]); + + if (!queue) + continue; + + fuse_uring_teardown_entries(queue); + } + + /* + * Some ring entries might be in the middle of IO operations, + * i.e. in process to get handled by file_operations::uring_cmd + * or on the way to userspace - we could handle that with conditions in + * run time code, but easier/cleaner to have an async tear down handler + * If there are still queue references left + */ + if (atomic_read(&ring->queue_refs) > 0) { + if (time_after(jiffies, + ring->teardown_time + FUSE_URING_TEARDOWN_TIMEOUT)) + fuse_uring_log_ent_state(ring); + + schedule_delayed_work(&ring->async_teardown_work, + FUSE_URING_TEARDOWN_INTERVAL); + } else { + wake_up_all(&ring->stop_waitq); + } +} + +/* + * Stop the ring queues + */ +void fuse_uring_stop_queues(struct fuse_ring *ring) +{ + int qid; + + for (qid = 0; qid < ring->nr_queues; qid++) { + struct fuse_ring_queue *queue = READ_ONCE(ring->queues[qid]); + + if (!queue) + continue; + + fuse_uring_teardown_entries(queue); + } + + if (atomic_read(&ring->queue_refs) > 0) { + ring->teardown_time = jiffies; + INIT_DELAYED_WORK(&ring->async_teardown_work, + fuse_uring_async_stop_queues); + schedule_delayed_work(&ring->async_teardown_work, + FUSE_URING_TEARDOWN_INTERVAL); + } else { + wake_up_all(&ring->stop_waitq); + } +} + +/* + * Handle IO_URING_F_CANCEL, typically should come on daemon termination. + * + * Releasing the last entry should trigger fuse_dev_release() if + * the daemon was terminated + */ +static void fuse_uring_cancel(struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + struct fuse_ring_ent *ent = uring_cmd_to_ring_ent(cmd); + struct fuse_ring_queue *queue; + bool need_cmd_done = false; + + /* + * direct access on ent - it must not be destructed as long as + * IO_URING_F_CANCEL might come up + */ + queue = ent->queue; + spin_lock(&queue->lock); + if (ent->state == FRRS_AVAILABLE) { + ent->state = FRRS_USERSPACE; + list_move_tail(&ent->list, &queue->ent_in_userspace); + need_cmd_done = true; + ent->cmd = NULL; + } + spin_unlock(&queue->lock); + + if (need_cmd_done) { + /* no queue lock to avoid lock order issues */ + io_uring_cmd_done(cmd, -ENOTCONN, 0, issue_flags); + } +} + +static void fuse_uring_prepare_cancel(struct io_uring_cmd *cmd, int issue_flags, + struct fuse_ring_ent *ring_ent) +{ + uring_cmd_set_ring_ent(cmd, ring_ent); + io_uring_cmd_mark_cancelable(cmd, issue_flags); +} + +/* + * Checks for errors and stores it into the request + */ +static int fuse_uring_out_header_has_err(struct fuse_out_header *oh, + struct fuse_req *req, + struct fuse_conn *fc) +{ + int err; + + err = -EINVAL; + if (oh->unique == 0) { + /* Not supported through io-uring yet */ + pr_warn_once("notify through fuse-io-uring not supported\n"); + goto err; + } + + if (oh->error <= -ERESTARTSYS || oh->error > 0) + goto err; + + if (oh->error) { + err = oh->error; + goto err; + } + + err = -ENOENT; + if ((oh->unique & ~FUSE_INT_REQ_BIT) != req->in.h.unique) { + pr_warn_ratelimited("unique mismatch, expected: %llu got %llu\n", + req->in.h.unique, + oh->unique & ~FUSE_INT_REQ_BIT); + goto err; + } + + /* + * Is it an interrupt reply ID? + * XXX: Not supported through fuse-io-uring yet, it should not even + * find the request - should not happen. + */ + WARN_ON_ONCE(oh->unique & FUSE_INT_REQ_BIT); + + err = 0; +err: + return err; +} + +static int fuse_uring_copy_from_ring(struct fuse_ring *ring, + struct fuse_req *req, + struct fuse_ring_ent *ent) +{ + struct fuse_copy_state cs; + struct fuse_args *args = req->args; + struct iov_iter iter; + int err; + struct fuse_uring_ent_in_out ring_in_out; + + err = copy_from_user(&ring_in_out, &ent->headers->ring_ent_in_out, + sizeof(ring_in_out)); + if (err) + return -EFAULT; + + err = import_ubuf(ITER_SOURCE, ent->payload, ring->max_payload_sz, + &iter); + if (err) + return err; + + fuse_copy_init(&cs, false, &iter); + cs.is_uring = true; + cs.req = req; + + return fuse_copy_out_args(&cs, args, ring_in_out.payload_sz); +} + + /* + * Copy data from the req to the ring buffer + */ +static int fuse_uring_args_to_ring(struct fuse_ring *ring, struct fuse_req *req, + struct fuse_ring_ent *ent) +{ + struct fuse_copy_state cs; + struct fuse_args *args = req->args; + struct fuse_in_arg *in_args = args->in_args; + int num_args = args->in_numargs; + int err; + struct iov_iter iter; + struct fuse_uring_ent_in_out ent_in_out = { + .flags = 0, + .commit_id = req->in.h.unique, + }; + + err = import_ubuf(ITER_DEST, ent->payload, ring->max_payload_sz, &iter); + if (err) { + pr_info_ratelimited("fuse: Import of user buffer failed\n"); + return err; + } + + fuse_copy_init(&cs, true, &iter); + cs.is_uring = true; + cs.req = req; + + if (num_args > 0) { + /* + * Expectation is that the first argument is the per op header. + * Some op code have that as zero size. + */ + if (args->in_args[0].size > 0) { + err = copy_to_user(&ent->headers->op_in, in_args->value, + in_args->size); + if (err) { + pr_info_ratelimited( + "Copying the header failed.\n"); + return -EFAULT; + } + } + in_args++; + num_args--; + } + + /* copy the payload */ + err = fuse_copy_args(&cs, num_args, args->in_pages, + (struct fuse_arg *)in_args, 0); + if (err) { + pr_info_ratelimited("%s fuse_copy_args failed\n", __func__); + return err; + } + + ent_in_out.payload_sz = cs.ring.copied_sz; + err = copy_to_user(&ent->headers->ring_ent_in_out, &ent_in_out, + sizeof(ent_in_out)); + return err ? -EFAULT : 0; +} + +static int fuse_uring_copy_to_ring(struct fuse_ring_ent *ent, + struct fuse_req *req) +{ + struct fuse_ring_queue *queue = ent->queue; + struct fuse_ring *ring = queue->ring; + int err; + + err = -EIO; + if (WARN_ON(ent->state != FRRS_FUSE_REQ)) { + pr_err("qid=%d ring-req=%p invalid state %d on send\n", + queue->qid, ent, ent->state); + return err; + } + + err = -EINVAL; + if (WARN_ON(req->in.h.unique == 0)) + return err; + + /* copy the request */ + err = fuse_uring_args_to_ring(ring, req, ent); + if (unlikely(err)) { + pr_info_ratelimited("Copy to ring failed: %d\n", err); + return err; + } + + /* copy fuse_in_header */ + err = copy_to_user(&ent->headers->in_out, &req->in.h, + sizeof(req->in.h)); + if (err) { + err = -EFAULT; + return err; + } + + return 0; +} + +static int fuse_uring_prepare_send(struct fuse_ring_ent *ent, + struct fuse_req *req) +{ + int err; + + err = fuse_uring_copy_to_ring(ent, req); + if (!err) + set_bit(FR_SENT, &req->flags); + else + fuse_uring_req_end(ent, req, err); + + return err; +} + +/* + * Write data to the ring buffer and send the request to userspace, + * userspace will read it + * This is comparable with classical read(/dev/fuse) + */ +static int fuse_uring_send_next_to_ring(struct fuse_ring_ent *ent, + struct fuse_req *req, + unsigned int issue_flags) +{ + struct fuse_ring_queue *queue = ent->queue; + int err; + struct io_uring_cmd *cmd; + + err = fuse_uring_prepare_send(ent, req); + if (err) + return err; + + spin_lock(&queue->lock); + cmd = ent->cmd; + ent->cmd = NULL; + ent->state = FRRS_USERSPACE; + list_move_tail(&ent->list, &queue->ent_in_userspace); + spin_unlock(&queue->lock); + + io_uring_cmd_done(cmd, 0, 0, issue_flags); + return 0; +} + +/* + * Make a ring entry available for fuse_req assignment + */ +static void fuse_uring_ent_avail(struct fuse_ring_ent *ent, + struct fuse_ring_queue *queue) +{ + WARN_ON_ONCE(!ent->cmd); + list_move(&ent->list, &queue->ent_avail_queue); + ent->state = FRRS_AVAILABLE; +} + +/* Used to find the request on SQE commit */ +static void fuse_uring_add_to_pq(struct fuse_ring_ent *ent, + struct fuse_req *req) +{ + struct fuse_ring_queue *queue = ent->queue; + struct fuse_pqueue *fpq = &queue->fpq; + unsigned int hash; + + req->ring_entry = ent; + hash = fuse_req_hash(req->in.h.unique); + list_move_tail(&req->list, &fpq->processing[hash]); +} + +/* + * Assign a fuse queue entry to the given entry + */ +static void fuse_uring_add_req_to_ring_ent(struct fuse_ring_ent *ent, + struct fuse_req *req) +{ + struct fuse_ring_queue *queue = ent->queue; + + lockdep_assert_held(&queue->lock); + + if (WARN_ON_ONCE(ent->state != FRRS_AVAILABLE && + ent->state != FRRS_COMMIT)) { + pr_warn("%s qid=%d state=%d\n", __func__, ent->queue->qid, + ent->state); + } + + clear_bit(FR_PENDING, &req->flags); + ent->fuse_req = req; + ent->state = FRRS_FUSE_REQ; + list_move_tail(&ent->list, &queue->ent_w_req_queue); + fuse_uring_add_to_pq(ent, req); +} + +/* Fetch the next fuse request if available */ +static struct fuse_req *fuse_uring_ent_assign_req(struct fuse_ring_ent *ent) + __must_hold(&queue->lock) +{ + struct fuse_req *req; + struct fuse_ring_queue *queue = ent->queue; + struct list_head *req_queue = &queue->fuse_req_queue; + + lockdep_assert_held(&queue->lock); + + /* get and assign the next entry while it is still holding the lock */ + req = list_first_entry_or_null(req_queue, struct fuse_req, list); + if (req) + fuse_uring_add_req_to_ring_ent(ent, req); + + return req; +} + +/* + * Read data from the ring buffer, which user space has written to + * This is comparible with handling of classical write(/dev/fuse). + * Also make the ring request available again for new fuse requests. + */ +static void fuse_uring_commit(struct fuse_ring_ent *ent, struct fuse_req *req, + unsigned int issue_flags) +{ + struct fuse_ring *ring = ent->queue->ring; + struct fuse_conn *fc = ring->fc; + ssize_t err = 0; + + err = copy_from_user(&req->out.h, &ent->headers->in_out, + sizeof(req->out.h)); + if (err) { + req->out.h.error = -EFAULT; + goto out; + } + + err = fuse_uring_out_header_has_err(&req->out.h, req, fc); + if (err) { + /* req->out.h.error already set */ + goto out; + } + + err = fuse_uring_copy_from_ring(ring, req, ent); +out: + fuse_uring_req_end(ent, req, err); +} + +/* + * Get the next fuse req and send it + */ +static void fuse_uring_next_fuse_req(struct fuse_ring_ent *ent, + struct fuse_ring_queue *queue, + unsigned int issue_flags) +{ + int err; + struct fuse_req *req; + +retry: + spin_lock(&queue->lock); + fuse_uring_ent_avail(ent, queue); + req = fuse_uring_ent_assign_req(ent); + spin_unlock(&queue->lock); + + if (req) { + err = fuse_uring_send_next_to_ring(ent, req, issue_flags); + if (err) + goto retry; + } +} + +static int fuse_ring_ent_set_commit(struct fuse_ring_ent *ent) +{ + struct fuse_ring_queue *queue = ent->queue; + + lockdep_assert_held(&queue->lock); + + if (WARN_ON_ONCE(ent->state != FRRS_USERSPACE)) + return -EIO; + + ent->state = FRRS_COMMIT; + list_move(&ent->list, &queue->ent_commit_queue); + + return 0; +} + +/* FUSE_URING_CMD_COMMIT_AND_FETCH handler */ +static int fuse_uring_commit_fetch(struct io_uring_cmd *cmd, int issue_flags, + struct fuse_conn *fc) +{ + const struct fuse_uring_cmd_req *cmd_req = io_uring_sqe_cmd(cmd->sqe); + struct fuse_ring_ent *ent; + int err; + struct fuse_ring *ring = fc->ring; + struct fuse_ring_queue *queue; + uint64_t commit_id = READ_ONCE(cmd_req->commit_id); + unsigned int qid = READ_ONCE(cmd_req->qid); + struct fuse_pqueue *fpq; + struct fuse_req *req; + + err = -ENOTCONN; + if (!ring) + return err; + + if (qid >= ring->nr_queues) + return -EINVAL; + + queue = ring->queues[qid]; + if (!queue) + return err; + fpq = &queue->fpq; + + if (!READ_ONCE(fc->connected) || READ_ONCE(queue->stopped)) + return err; + + spin_lock(&queue->lock); + /* Find a request based on the unique ID of the fuse request + * This should get revised, as it needs a hash calculation and list + * search. And full struct fuse_pqueue is needed (memory overhead). + * As well as the link from req to ring_ent. + */ + req = fuse_request_find(fpq, commit_id); + err = -ENOENT; + if (!req) { + pr_info("qid=%d commit_id %llu not found\n", queue->qid, + commit_id); + spin_unlock(&queue->lock); + return err; + } + list_del_init(&req->list); + ent = req->ring_entry; + req->ring_entry = NULL; + + err = fuse_ring_ent_set_commit(ent); + if (err != 0) { + pr_info_ratelimited("qid=%d commit_id %llu state %d", + queue->qid, commit_id, ent->state); + spin_unlock(&queue->lock); + req->out.h.error = err; + clear_bit(FR_SENT, &req->flags); + fuse_request_end(req); + return err; + } + + ent->cmd = cmd; + spin_unlock(&queue->lock); + + /* without the queue lock, as other locks are taken */ + fuse_uring_prepare_cancel(cmd, issue_flags, ent); + fuse_uring_commit(ent, req, issue_flags); + + /* + * Fetching the next request is absolutely required as queued + * fuse requests would otherwise not get processed - committing + * and fetching is done in one step vs legacy fuse, which has separated + * read (fetch request) and write (commit result). + */ + fuse_uring_next_fuse_req(ent, queue, issue_flags); + return 0; +} + +static bool is_ring_ready(struct fuse_ring *ring, int current_qid) +{ + int qid; + struct fuse_ring_queue *queue; + bool ready = true; + + for (qid = 0; qid < ring->nr_queues && ready; qid++) { + if (current_qid == qid) + continue; + + queue = ring->queues[qid]; + if (!queue) { + ready = false; + break; + } + + spin_lock(&queue->lock); + if (list_empty(&queue->ent_avail_queue)) + ready = false; + spin_unlock(&queue->lock); + } + + return ready; +} + +/* + * fuse_uring_req_fetch command handling + */ +static void fuse_uring_do_register(struct fuse_ring_ent *ent, + struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + struct fuse_ring_queue *queue = ent->queue; + struct fuse_ring *ring = queue->ring; + struct fuse_conn *fc = ring->fc; + struct fuse_iqueue *fiq = &fc->iq; + + fuse_uring_prepare_cancel(cmd, issue_flags, ent); + + spin_lock(&queue->lock); + ent->cmd = cmd; + fuse_uring_ent_avail(ent, queue); + spin_unlock(&queue->lock); + + if (!ring->ready) { + bool ready = is_ring_ready(ring, queue->qid); + + if (ready) { + WRITE_ONCE(fiq->ops, &fuse_io_uring_ops); + WRITE_ONCE(ring->ready, true); + wake_up_all(&fc->blocked_waitq); + } + } +} + +/* + * sqe->addr is a ptr to an iovec array, iov[0] has the headers, iov[1] + * the payload + */ +static int fuse_uring_get_iovec_from_sqe(const struct io_uring_sqe *sqe, + struct iovec iov[FUSE_URING_IOV_SEGS]) +{ + struct iovec __user *uiov = u64_to_user_ptr(READ_ONCE(sqe->addr)); + struct iov_iter iter; + ssize_t ret; + + if (sqe->len != FUSE_URING_IOV_SEGS) + return -EINVAL; + + /* + * Direction for buffer access will actually be READ and WRITE, + * using write for the import should include READ access as well. + */ + ret = import_iovec(WRITE, uiov, FUSE_URING_IOV_SEGS, + FUSE_URING_IOV_SEGS, &iov, &iter); + if (ret < 0) + return ret; + + return 0; +} + +static struct fuse_ring_ent * +fuse_uring_create_ring_ent(struct io_uring_cmd *cmd, + struct fuse_ring_queue *queue) +{ + struct fuse_ring *ring = queue->ring; + struct fuse_ring_ent *ent; + size_t payload_size; + struct iovec iov[FUSE_URING_IOV_SEGS]; + int err; + + err = fuse_uring_get_iovec_from_sqe(cmd->sqe, iov); + if (err) { + pr_info_ratelimited("Failed to get iovec from sqe, err=%d\n", + err); + return ERR_PTR(err); + } + + err = -EINVAL; + if (iov[0].iov_len < sizeof(struct fuse_uring_req_header)) { + pr_info_ratelimited("Invalid header len %zu\n", iov[0].iov_len); + return ERR_PTR(err); + } + + payload_size = iov[1].iov_len; + if (payload_size < ring->max_payload_sz) { + pr_info_ratelimited("Invalid req payload len %zu\n", + payload_size); + return ERR_PTR(err); + } + + err = -ENOMEM; + ent = kzalloc(sizeof(*ent), GFP_KERNEL_ACCOUNT); + if (!ent) + return ERR_PTR(err); + + INIT_LIST_HEAD(&ent->list); + + ent->queue = queue; + ent->headers = iov[0].iov_base; + ent->payload = iov[1].iov_base; + + atomic_inc(&ring->queue_refs); + return ent; +} + +/* + * Register header and payload buffer with the kernel and puts the + * entry as "ready to get fuse requests" on the queue + */ +static int fuse_uring_register(struct io_uring_cmd *cmd, + unsigned int issue_flags, struct fuse_conn *fc) +{ + const struct fuse_uring_cmd_req *cmd_req = io_uring_sqe_cmd(cmd->sqe); + struct fuse_ring *ring = smp_load_acquire(&fc->ring); + struct fuse_ring_queue *queue; + struct fuse_ring_ent *ent; + int err; + unsigned int qid = READ_ONCE(cmd_req->qid); + + err = -ENOMEM; + if (!ring) { + ring = fuse_uring_create(fc); + if (!ring) + return err; + } + + if (qid >= ring->nr_queues) { + pr_info_ratelimited("fuse: Invalid ring qid %u\n", qid); + return -EINVAL; + } + + queue = ring->queues[qid]; + if (!queue) { + queue = fuse_uring_create_queue(ring, qid); + if (!queue) + return err; + } + + /* + * The created queue above does not need to be destructed in + * case of entry errors below, will be done at ring destruction time. + */ + + ent = fuse_uring_create_ring_ent(cmd, queue); + if (IS_ERR(ent)) + return PTR_ERR(ent); + + fuse_uring_do_register(ent, cmd, issue_flags); + + return 0; +} + +/* + * Entry function from io_uring to handle the given passthrough command + * (op code IORING_OP_URING_CMD) + */ +int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ + struct fuse_dev *fud; + struct fuse_conn *fc; + u32 cmd_op = cmd->cmd_op; + int err; + + if ((unlikely(issue_flags & IO_URING_F_CANCEL))) { + fuse_uring_cancel(cmd, issue_flags); + return 0; + } + + /* This extra SQE size holds struct fuse_uring_cmd_req */ + if (!(issue_flags & IO_URING_F_SQE128)) + return -EINVAL; + + fud = fuse_get_dev(cmd->file); + if (!fud) { + pr_info_ratelimited("No fuse device found\n"); + return -ENOTCONN; + } + fc = fud->fc; + + /* Once a connection has io-uring enabled on it, it can't be disabled */ + if (!enable_uring && !fc->io_uring) { + pr_info_ratelimited("fuse-io-uring is disabled\n"); + return -EOPNOTSUPP; + } + + if (fc->aborted) + return -ECONNABORTED; + if (!fc->connected) + return -ENOTCONN; + + /* + * fuse_uring_register() needs the ring to be initialized, + * we need to know the max payload size + */ + if (!fc->initialized) + return -EAGAIN; + + switch (cmd_op) { + case FUSE_IO_URING_CMD_REGISTER: + err = fuse_uring_register(cmd, issue_flags, fc); + if (err) { + pr_info_once("FUSE_IO_URING_CMD_REGISTER failed err=%d\n", + err); + fc->io_uring = 0; + wake_up_all(&fc->blocked_waitq); + return err; + } + break; + case FUSE_IO_URING_CMD_COMMIT_AND_FETCH: + err = fuse_uring_commit_fetch(cmd, issue_flags, fc); + if (err) { + pr_info_once("FUSE_IO_URING_COMMIT_AND_FETCH failed err=%d\n", + err); + return err; + } + break; + default: + return -EINVAL; + } + + return -EIOCBQUEUED; +} + +static void fuse_uring_send(struct fuse_ring_ent *ent, struct io_uring_cmd *cmd, + ssize_t ret, unsigned int issue_flags) +{ + struct fuse_ring_queue *queue = ent->queue; + + spin_lock(&queue->lock); + ent->state = FRRS_USERSPACE; + list_move_tail(&ent->list, &queue->ent_in_userspace); + ent->cmd = NULL; + spin_unlock(&queue->lock); + + io_uring_cmd_done(cmd, ret, 0, issue_flags); +} + +/* + * This prepares and sends the ring request in fuse-uring task context. + * User buffers are not mapped yet - the application does not have permission + * to write to it - this has to be executed in ring task context. + */ +static void fuse_uring_send_in_task(struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + struct fuse_ring_ent *ent = uring_cmd_to_ring_ent(cmd); + struct fuse_ring_queue *queue = ent->queue; + int err; + + if (!(issue_flags & IO_URING_F_TASK_DEAD)) { + err = fuse_uring_prepare_send(ent, ent->fuse_req); + if (err) { + fuse_uring_next_fuse_req(ent, queue, issue_flags); + return; + } + } else { + err = -ECANCELED; + } + + fuse_uring_send(ent, cmd, err, issue_flags); +} + +static struct fuse_ring_queue *fuse_uring_task_to_queue(struct fuse_ring *ring) +{ + unsigned int qid; + struct fuse_ring_queue *queue; + + qid = task_cpu(current); + + if (WARN_ONCE(qid >= ring->nr_queues, + "Core number (%u) exceeds nr queues (%zu)\n", qid, + ring->nr_queues)) + qid = 0; + + queue = ring->queues[qid]; + WARN_ONCE(!queue, "Missing queue for qid %d\n", qid); + + return queue; +} + +static void fuse_uring_dispatch_ent(struct fuse_ring_ent *ent) +{ + struct io_uring_cmd *cmd = ent->cmd; + + uring_cmd_set_ring_ent(cmd, ent); + io_uring_cmd_complete_in_task(cmd, fuse_uring_send_in_task); +} + +/* queue a fuse request and send it if a ring entry is available */ +void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req) +{ + struct fuse_conn *fc = req->fm->fc; + struct fuse_ring *ring = fc->ring; + struct fuse_ring_queue *queue; + struct fuse_ring_ent *ent = NULL; + int err; + + err = -EINVAL; + queue = fuse_uring_task_to_queue(ring); + if (!queue) + goto err; + + if (req->in.h.opcode != FUSE_NOTIFY_REPLY) + req->in.h.unique = fuse_get_unique(fiq); + + spin_lock(&queue->lock); + err = -ENOTCONN; + if (unlikely(queue->stopped)) + goto err_unlock; + + set_bit(FR_URING, &req->flags); + req->ring_queue = queue; + ent = list_first_entry_or_null(&queue->ent_avail_queue, + struct fuse_ring_ent, list); + if (ent) + fuse_uring_add_req_to_ring_ent(ent, req); + else + list_add_tail(&req->list, &queue->fuse_req_queue); + spin_unlock(&queue->lock); + + if (ent) + fuse_uring_dispatch_ent(ent); + + return; + +err_unlock: + spin_unlock(&queue->lock); +err: + req->out.h.error = err; + clear_bit(FR_PENDING, &req->flags); + fuse_request_end(req); +} + +bool fuse_uring_queue_bq_req(struct fuse_req *req) +{ + struct fuse_conn *fc = req->fm->fc; + struct fuse_ring *ring = fc->ring; + struct fuse_ring_queue *queue; + struct fuse_ring_ent *ent = NULL; + + queue = fuse_uring_task_to_queue(ring); + if (!queue) + return false; + + spin_lock(&queue->lock); + if (unlikely(queue->stopped)) { + spin_unlock(&queue->lock); + return false; + } + + set_bit(FR_URING, &req->flags); + req->ring_queue = queue; + list_add_tail(&req->list, &queue->fuse_req_bg_queue); + + ent = list_first_entry_or_null(&queue->ent_avail_queue, + struct fuse_ring_ent, list); + spin_lock(&fc->bg_lock); + fc->num_background++; + if (fc->num_background == fc->max_background) + fc->blocked = 1; + fuse_uring_flush_bg(queue); + spin_unlock(&fc->bg_lock); + + /* + * Due to bg_queue flush limits there might be other bg requests + * in the queue that need to be handled first. Or no further req + * might be available. + */ + req = list_first_entry_or_null(&queue->fuse_req_queue, struct fuse_req, + list); + if (ent && req) { + fuse_uring_add_req_to_ring_ent(ent, req); + spin_unlock(&queue->lock); + + fuse_uring_dispatch_ent(ent); + } else { + spin_unlock(&queue->lock); + } + + return true; +} + +bool fuse_uring_remove_pending_req(struct fuse_req *req) +{ + struct fuse_ring_queue *queue = req->ring_queue; + + return fuse_remove_pending_req(req, &queue->lock); +} + +static const struct fuse_iqueue_ops fuse_io_uring_ops = { + /* should be send over io-uring as enhancement */ + .send_forget = fuse_dev_queue_forget, + + /* + * could be send over io-uring, but interrupts should be rare, + * no need to make the code complex + */ + .send_interrupt = fuse_dev_queue_interrupt, + .send_req = fuse_uring_queue_fuse_req, +}; diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h new file mode 100644 index 000000000000..51a563922ce1 --- /dev/null +++ b/fs/fuse/dev_uring_i.h @@ -0,0 +1,211 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * FUSE: Filesystem in Userspace + * Copyright (c) 2023-2024 DataDirect Networks. + */ + +#ifndef _FS_FUSE_DEV_URING_I_H +#define _FS_FUSE_DEV_URING_I_H + +#include "fuse_i.h" + +#ifdef CONFIG_FUSE_IO_URING + +#define FUSE_URING_TEARDOWN_TIMEOUT (5 * HZ) +#define FUSE_URING_TEARDOWN_INTERVAL (HZ/20) + +enum fuse_ring_req_state { + FRRS_INVALID = 0, + + /* The ring entry received from userspace and it is being processed */ + FRRS_COMMIT, + + /* The ring entry is waiting for new fuse requests */ + FRRS_AVAILABLE, + + /* The ring entry got assigned a fuse req */ + FRRS_FUSE_REQ, + + /* The ring entry is in or on the way to user space */ + FRRS_USERSPACE, + + /* The ring entry is in teardown */ + FRRS_TEARDOWN, + + /* The ring entry is released, but not freed yet */ + FRRS_RELEASED, +}; + +/** A fuse ring entry, part of the ring queue */ +struct fuse_ring_ent { + /* userspace buffer */ + struct fuse_uring_req_header __user *headers; + void __user *payload; + + /* the ring queue that owns the request */ + struct fuse_ring_queue *queue; + + /* fields below are protected by queue->lock */ + + struct io_uring_cmd *cmd; + + struct list_head list; + + enum fuse_ring_req_state state; + + struct fuse_req *fuse_req; +}; + +struct fuse_ring_queue { + /* + * back pointer to the main fuse uring structure that holds this + * queue + */ + struct fuse_ring *ring; + + /* queue id, corresponds to the cpu core */ + unsigned int qid; + + /* + * queue lock, taken when any value in the queue changes _and_ also + * a ring entry state changes. + */ + spinlock_t lock; + + /* available ring entries (struct fuse_ring_ent) */ + struct list_head ent_avail_queue; + + /* + * entries in the process of being committed or in the process + * to be sent to userspace + */ + struct list_head ent_w_req_queue; + struct list_head ent_commit_queue; + + /* entries in userspace */ + struct list_head ent_in_userspace; + + /* entries that are released */ + struct list_head ent_released; + + /* fuse requests waiting for an entry slot */ + struct list_head fuse_req_queue; + + /* background fuse requests */ + struct list_head fuse_req_bg_queue; + + struct fuse_pqueue fpq; + + unsigned int active_background; + + bool stopped; +}; + +/** + * Describes if uring is for communication and holds alls the data needed + * for uring communication + */ +struct fuse_ring { + /* back pointer */ + struct fuse_conn *fc; + + /* number of ring queues */ + size_t nr_queues; + + /* maximum payload/arg size */ + size_t max_payload_sz; + + struct fuse_ring_queue **queues; + + /* + * Log ring entry states on stop when entries cannot be released + */ + unsigned int stop_debug_log : 1; + + wait_queue_head_t stop_waitq; + + /* async tear down */ + struct delayed_work async_teardown_work; + + /* log */ + unsigned long teardown_time; + + atomic_t queue_refs; + + bool ready; +}; + +bool fuse_uring_enabled(void); +void fuse_uring_destruct(struct fuse_conn *fc); +void fuse_uring_stop_queues(struct fuse_ring *ring); +void fuse_uring_abort_end_requests(struct fuse_ring *ring); +int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags); +void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req); +bool fuse_uring_queue_bq_req(struct fuse_req *req); +bool fuse_uring_remove_pending_req(struct fuse_req *req); +bool fuse_uring_request_expired(struct fuse_conn *fc); + +static inline void fuse_uring_abort(struct fuse_conn *fc) +{ + struct fuse_ring *ring = fc->ring; + + if (ring == NULL) + return; + + if (atomic_read(&ring->queue_refs) > 0) { + fuse_uring_abort_end_requests(ring); + fuse_uring_stop_queues(ring); + } +} + +static inline void fuse_uring_wait_stopped_queues(struct fuse_conn *fc) +{ + struct fuse_ring *ring = fc->ring; + + if (ring) + wait_event(ring->stop_waitq, + atomic_read(&ring->queue_refs) == 0); +} + +static inline bool fuse_uring_ready(struct fuse_conn *fc) +{ + return fc->ring && fc->ring->ready; +} + +#else /* CONFIG_FUSE_IO_URING */ + +static inline void fuse_uring_destruct(struct fuse_conn *fc) +{ +} + +static inline bool fuse_uring_enabled(void) +{ + return false; +} + +static inline void fuse_uring_abort(struct fuse_conn *fc) +{ +} + +static inline void fuse_uring_wait_stopped_queues(struct fuse_conn *fc) +{ +} + +static inline bool fuse_uring_ready(struct fuse_conn *fc) +{ + return false; +} + +static inline bool fuse_uring_remove_pending_req(struct fuse_req *req) +{ + return false; +} + +static inline bool fuse_uring_request_expired(struct fuse_conn *fc) +{ + return false; +} + +#endif /* CONFIG_FUSE_IO_URING */ + +#endif /* _FS_FUSE_DEV_URING_I_H */ diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 494ac372ace0..2d817d7cab26 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -175,9 +175,12 @@ static void fuse_lookup_init(struct fuse_conn *fc, struct fuse_args *args, memset(outarg, 0, sizeof(struct fuse_entry_out)); args->opcode = FUSE_LOOKUP; args->nodeid = nodeid; - args->in_numargs = 1; - args->in_args[0].size = name->len + 1; - args->in_args[0].value = name->name; + args->in_numargs = 3; + fuse_set_zero_arg0(args); + args->in_args[1].size = name->len; + args->in_args[1].value = name->name; + args->in_args[2].size = 1; + args->in_args[2].value = ""; args->out_numargs = 1; args->out_args[0].size = sizeof(struct fuse_entry_out); args->out_args[0].value = outarg; @@ -192,14 +195,19 @@ static void fuse_lookup_init(struct fuse_conn *fc, struct fuse_args *args, * the lookup once more. If the lookup results in the same inode, * then refresh the attributes, timeouts and mark the dentry valid. */ -static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) +static int fuse_dentry_revalidate(struct inode *dir, const struct qstr *name, + struct dentry *entry, unsigned int flags) { struct inode *inode; - struct dentry *parent; struct fuse_mount *fm; + struct fuse_conn *fc; struct fuse_inode *fi; int ret; + fc = get_fuse_conn_super(dir->i_sb); + if (entry->d_time < atomic_read(&fc->epoch)) + goto invalid; + inode = d_inode_rcu(entry); if (inode && fuse_is_bad(inode)) goto invalid; @@ -227,11 +235,9 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) attr_version = fuse_get_attr_version(fm->fc); - parent = dget_parent(entry); - fuse_lookup_init(fm->fc, &args, get_node_id(d_inode(parent)), - &entry->d_name, &outarg); + fuse_lookup_init(fm->fc, &args, get_node_id(dir), + name, &outarg); ret = fuse_simple_request(fm, &args); - dput(parent); /* Zero nodeid is same as -ENOENT */ if (!ret && !outarg.nodeid) ret = -ENOENT; @@ -265,9 +271,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) if (test_bit(FUSE_I_INIT_RDPLUS, &fi->state)) return -ECHILD; } else if (test_and_clear_bit(FUSE_I_INIT_RDPLUS, &fi->state)) { - parent = dget_parent(entry); - fuse_advise_use_readdirplus(d_inode(parent)); - dput(parent); + fuse_advise_use_readdirplus(dir); } } ret = 1; @@ -320,9 +324,6 @@ static struct vfsmount *fuse_dentry_automount(struct path *path) /* Create the submount */ mnt = fc_mount(fsc); - if (!IS_ERR(mnt)) - mntget(mnt); - put_fs_context(fsc); return mnt; } @@ -337,13 +338,6 @@ const struct dentry_operations fuse_dentry_operations = { .d_automount = fuse_dentry_automount, }; -const struct dentry_operations fuse_root_dentry_operations = { -#if BITS_PER_LONG < 64 - .d_init = fuse_dentry_init, - .d_release = fuse_dentry_release, -#endif -}; - int fuse_valid_type(int m) { return S_ISREG(m) || S_ISDIR(m) || S_ISLNK(m) || S_ISCHR(m) || @@ -371,7 +365,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name *inode = NULL; err = -ENAMETOOLONG; - if (name->len > FUSE_NAME_MAX) + if (name->len > fm->fc->name_max) goto out; @@ -416,16 +410,20 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, unsigned int flags) { - int err; struct fuse_entry_out outarg; + struct fuse_conn *fc; struct inode *inode; struct dentry *newent; + int err, epoch; bool outarg_valid = true; bool locked; if (fuse_is_bad(dir)) return ERR_PTR(-EIO); + fc = get_fuse_conn_super(dir->i_sb); + epoch = atomic_read(&fc->epoch); + locked = fuse_lock_inode(dir); err = fuse_lookup_name(dir->i_sb, get_node_id(dir), &entry->d_name, &outarg, &inode); @@ -447,6 +445,7 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, goto out_err; entry = newent ? newent : entry; + entry->d_time = epoch; if (outarg_valid) fuse_change_entry_timeout(entry, &outarg); else @@ -467,29 +466,29 @@ static int get_security_context(struct dentry *entry, umode_t mode, { struct fuse_secctx *fctx; struct fuse_secctx_header *header; - void *ctx = NULL, *ptr; - u32 ctxlen, total_len = sizeof(*header); + struct lsm_context lsmctx = { }; + void *ptr; + u32 total_len = sizeof(*header); int err, nr_ctx = 0; - const char *name; + const char *name = NULL; size_t namelen; err = security_dentry_init_security(entry, mode, &entry->d_name, - &name, &ctx, &ctxlen); - if (err) { - if (err != -EOPNOTSUPP) - goto out_err; - /* No LSM is supporting this security hook. Ignore error */ - ctxlen = 0; - ctx = NULL; - } + &name, &lsmctx); - if (ctxlen) { + /* If no LSM is supporting this security hook ignore error */ + if (err && err != -EOPNOTSUPP) + goto out_err; + + if (lsmctx.len) { nr_ctx = 1; namelen = strlen(name) + 1; err = -EIO; - if (WARN_ON(namelen > XATTR_NAME_MAX + 1 || ctxlen > S32_MAX)) + if (WARN_ON(namelen > XATTR_NAME_MAX + 1 || + lsmctx.len > S32_MAX)) goto out_err; - total_len += FUSE_REC_ALIGN(sizeof(*fctx) + namelen + ctxlen); + total_len += FUSE_REC_ALIGN(sizeof(*fctx) + namelen + + lsmctx.len); } err = -ENOMEM; @@ -502,19 +501,20 @@ static int get_security_context(struct dentry *entry, umode_t mode, ptr += sizeof(*header); if (nr_ctx) { fctx = ptr; - fctx->size = ctxlen; + fctx->size = lsmctx.len; ptr += sizeof(*fctx); strcpy(ptr, name); ptr += namelen; - memcpy(ptr, ctx, ctxlen); + memcpy(ptr, lsmctx.context, lsmctx.len); } ext->size = total_len; ext->value = header; err = 0; out_err: - kfree(ctx); + if (nr_ctx) + security_release_secctx(&lsmctx); return err; } @@ -619,7 +619,6 @@ static int fuse_create_open(struct mnt_idmap *idmap, struct inode *dir, struct dentry *entry, struct file *file, unsigned int flags, umode_t mode, u32 opcode) { - int err; struct inode *inode; struct fuse_mount *fm = get_fuse_mount(dir); FUSE_ARGS(args); @@ -629,11 +628,13 @@ static int fuse_create_open(struct mnt_idmap *idmap, struct inode *dir, struct fuse_entry_out outentry; struct fuse_inode *fi; struct fuse_file *ff; + int epoch, err; bool trunc = flags & O_TRUNC; /* Userspace expects S_IFREG in create mode */ BUG_ON((mode & S_IFMT) != S_IFREG); + epoch = atomic_read(&fm->fc->epoch); forget = fuse_alloc_forget(); err = -ENOMEM; if (!forget) @@ -702,6 +703,7 @@ static int fuse_create_open(struct mnt_idmap *idmap, struct inode *dir, } kfree(forget); d_instantiate(entry, inode); + entry->d_time = epoch; fuse_change_entry_timeout(entry, &outentry); fuse_dir_changed(dir); err = generic_file_open(inode, file); @@ -781,22 +783,24 @@ no_open: /* * Code shared between mknod, mkdir, symlink and link */ -static int create_new_entry(struct mnt_idmap *idmap, struct fuse_mount *fm, - struct fuse_args *args, struct inode *dir, - struct dentry *entry, umode_t mode) +static struct dentry *create_new_entry(struct mnt_idmap *idmap, struct fuse_mount *fm, + struct fuse_args *args, struct inode *dir, + struct dentry *entry, umode_t mode) { struct fuse_entry_out outarg; struct inode *inode; struct dentry *d; - int err; struct fuse_forget_link *forget; + int epoch, err; if (fuse_is_bad(dir)) - return -EIO; + return ERR_PTR(-EIO); + + epoch = atomic_read(&fm->fc->epoch); forget = fuse_alloc_forget(); if (!forget) - return -ENOMEM; + return ERR_PTR(-ENOMEM); memset(&outarg, 0, sizeof(outarg)); args->nodeid = get_node_id(dir); @@ -826,29 +830,46 @@ static int create_new_entry(struct mnt_idmap *idmap, struct fuse_mount *fm, &outarg.attr, ATTR_TIMEOUT(&outarg), 0, 0); if (!inode) { fuse_queue_forget(fm->fc, forget, outarg.nodeid, 1); - return -ENOMEM; + return ERR_PTR(-ENOMEM); } kfree(forget); d_drop(entry); d = d_splice_alias(inode, entry); if (IS_ERR(d)) - return PTR_ERR(d); + return d; if (d) { + d->d_time = epoch; fuse_change_entry_timeout(d, &outarg); - dput(d); } else { + entry->d_time = epoch; fuse_change_entry_timeout(entry, &outarg); } fuse_dir_changed(dir); - return 0; + return d; out_put_forget_req: if (err == -EEXIST) fuse_invalidate_entry(entry); kfree(forget); - return err; + return ERR_PTR(err); +} + +static int create_new_nondir(struct mnt_idmap *idmap, struct fuse_mount *fm, + struct fuse_args *args, struct inode *dir, + struct dentry *entry, umode_t mode) +{ + /* + * Note that when creating anything other than a directory we + * can be sure create_new_entry() will NOT return an alternate + * dentry as d_splice_alias() only returns an alternate dentry + * for directories. So we don't need to check for that case + * when passing back the result. + */ + WARN_ON_ONCE(S_ISDIR(mode)); + + return PTR_ERR(create_new_entry(idmap, fm, args, dir, entry, mode)); } static int fuse_mknod(struct mnt_idmap *idmap, struct inode *dir, @@ -871,7 +892,7 @@ static int fuse_mknod(struct mnt_idmap *idmap, struct inode *dir, args.in_args[0].value = &inarg; args.in_args[1].size = entry->d_name.len + 1; args.in_args[1].value = entry->d_name.name; - return create_new_entry(idmap, fm, &args, dir, entry, mode); + return create_new_nondir(idmap, fm, &args, dir, entry, mode); } static int fuse_create(struct mnt_idmap *idmap, struct inode *dir, @@ -898,8 +919,8 @@ static int fuse_tmpfile(struct mnt_idmap *idmap, struct inode *dir, return err; } -static int fuse_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *entry, umode_t mode) +static struct dentry *fuse_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *entry, umode_t mode) { struct fuse_mkdir_in inarg; struct fuse_mount *fm = get_fuse_mount(dir); @@ -928,12 +949,13 @@ static int fuse_symlink(struct mnt_idmap *idmap, struct inode *dir, FUSE_ARGS(args); args.opcode = FUSE_SYMLINK; - args.in_numargs = 2; - args.in_args[0].size = entry->d_name.len + 1; - args.in_args[0].value = entry->d_name.name; - args.in_args[1].size = len; - args.in_args[1].value = link; - return create_new_entry(idmap, fm, &args, dir, entry, S_IFLNK); + args.in_numargs = 3; + fuse_set_zero_arg0(&args); + args.in_args[1].size = entry->d_name.len + 1; + args.in_args[1].value = entry->d_name.name; + args.in_args[2].size = len; + args.in_args[2].value = link; + return create_new_nondir(idmap, fm, &args, dir, entry, S_IFLNK); } void fuse_flush_time_update(struct inode *inode) @@ -992,9 +1014,10 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) args.opcode = FUSE_UNLINK; args.nodeid = get_node_id(dir); - args.in_numargs = 1; - args.in_args[0].size = entry->d_name.len + 1; - args.in_args[0].value = entry->d_name.name; + args.in_numargs = 2; + fuse_set_zero_arg0(&args); + args.in_args[1].size = entry->d_name.len + 1; + args.in_args[1].value = entry->d_name.name; err = fuse_simple_request(fm, &args); if (!err) { fuse_dir_changed(dir); @@ -1015,9 +1038,10 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry) args.opcode = FUSE_RMDIR; args.nodeid = get_node_id(dir); - args.in_numargs = 1; - args.in_args[0].size = entry->d_name.len + 1; - args.in_args[0].value = entry->d_name.name; + args.in_numargs = 2; + fuse_set_zero_arg0(&args); + args.in_args[1].size = entry->d_name.len + 1; + args.in_args[1].value = entry->d_name.name; err = fuse_simple_request(fm, &args); if (!err) { fuse_dir_changed(dir); @@ -1120,6 +1144,9 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); + if (fm->fc->no_link) + goto out; + memset(&inarg, 0, sizeof(inarg)); inarg.oldnodeid = get_node_id(inode); args.opcode = FUSE_LINK; @@ -1128,12 +1155,18 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, args.in_args[0].value = &inarg; args.in_args[1].size = newent->d_name.len + 1; args.in_args[1].value = newent->d_name.name; - err = create_new_entry(&invalid_mnt_idmap, fm, &args, newdir, newent, inode->i_mode); + err = create_new_nondir(&invalid_mnt_idmap, fm, &args, newdir, newent, inode->i_mode); if (!err) fuse_update_ctime_in_cache(inode); else if (err == -EINTR) fuse_invalidate_attr(inode); + if (err == -ENOSYS) + fm->fc->no_link = 1; +out: + if (fm->fc->no_link) + return -EPERM; + return err; } @@ -1586,10 +1619,10 @@ static int fuse_permission(struct mnt_idmap *idmap, return err; } -static int fuse_readlink_page(struct inode *inode, struct folio *folio) +static int fuse_readlink_folio(struct inode *inode, struct folio *folio) { struct fuse_mount *fm = get_fuse_mount(inode); - struct fuse_folio_desc desc = { .length = PAGE_SIZE - 1 }; + struct fuse_folio_desc desc = { .length = folio_size(folio) - 1 }; struct fuse_args_pages ap = { .num_folios = 1, .folios = &folio, @@ -1633,7 +1666,7 @@ static const char *fuse_get_link(struct dentry *dentry, struct inode *inode, goto out_err; if (fc->cache_symlinks) - return page_get_link(dentry, inode, callback); + return page_get_link_raw(dentry, inode, callback); err = -ECHILD; if (!dentry) @@ -1644,13 +1677,13 @@ static const char *fuse_get_link(struct dentry *dentry, struct inode *inode, if (!folio) goto out_err; - err = fuse_readlink_page(inode, folio); + err = fuse_readlink_folio(inode, folio); if (err) { folio_put(folio); goto out_err; } - set_delayed_call(callback, page_put_link, &folio->page); + set_delayed_call(callback, page_put_link, folio); return folio_address(folio); @@ -1681,6 +1714,8 @@ static int fuse_dir_open(struct inode *inode, struct file *file) */ if (ff->open_flags & (FOPEN_STREAM | FOPEN_NONSEEKABLE)) nonseekable_open(inode, file); + if (!(ff->open_flags & FOPEN_KEEP_CACHE)) + invalidate_inode_pages2(inode->i_mapping); } return err; @@ -1918,6 +1953,7 @@ int fuse_do_setattr(struct mnt_idmap *idmap, struct dentry *dentry, int err; bool trust_local_cmtime = is_wb; bool fault_blocked = false; + u64 attr_version; if (!fc->default_permissions) attr->ia_valid |= ATTR_FORCE; @@ -1935,7 +1971,7 @@ int fuse_do_setattr(struct mnt_idmap *idmap, struct dentry *dentry, if (FUSE_IS_DAX(inode) && is_truncate) { filemap_invalidate_lock(mapping); fault_blocked = true; - err = fuse_dax_break_layouts(inode, 0, 0); + err = fuse_dax_break_layouts(inode, 0, -1); if (err) { filemap_invalidate_unlock(mapping); return err; @@ -2002,6 +2038,8 @@ int fuse_do_setattr(struct mnt_idmap *idmap, struct dentry *dentry, if (fc->handle_killpriv_v2 && !capable(CAP_FSETID)) inarg.valid |= FATTR_KILL_SUIDGID; } + + attr_version = fuse_get_attr_version(fm->fc); fuse_setattr_fill(fc, &args, inode, &inarg, &outarg); err = fuse_simple_request(fm, &args); if (err) { @@ -2027,6 +2065,14 @@ int fuse_do_setattr(struct mnt_idmap *idmap, struct dentry *dentry, /* FIXME: clear I_DIRTY_SYNC? */ } + if (fi->attr_version > attr_version) { + /* + * Apply attributes, for example for fsnotify_change(), but set + * attribute timeout to zero. + */ + outarg.attr_valid = outarg.attr_valid_nsec = 0; + } + fuse_change_attributes_common(inode, &outarg.attr, NULL, ATTR_TIMEOUT(&outarg), fuse_get_cache_mask(inode), 0); @@ -2232,7 +2278,7 @@ void fuse_init_dir(struct inode *inode) static int fuse_symlink_read_folio(struct file *null, struct folio *folio) { - int err = fuse_readlink_page(folio->mapping->host, folio); + int err = fuse_readlink_folio(folio->mapping->host, folio); if (!err) folio_mark_uptodate(folio); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 88d0946b5bc9..5525a4520b0f 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -21,6 +21,7 @@ #include <linux/filelock.h> #include <linux/splice.h> #include <linux/task_io_accounting_ops.h> +#include <linux/iomap.h> static int fuse_send_open(struct fuse_mount *fm, u64 nodeid, unsigned int open_flags, int opcode, @@ -253,7 +254,7 @@ static int fuse_open(struct inode *inode, struct file *file) if (dax_truncate) { filemap_invalidate_lock(inode->i_mapping); - err = fuse_dax_break_layouts(inode, 0, 0); + err = fuse_dax_break_layouts(inode, 0, -1); if (err) goto out_inode_unlock; } @@ -415,89 +416,11 @@ u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id) struct fuse_writepage_args { struct fuse_io_args ia; - struct rb_node writepages_entry; struct list_head queue_entry; - struct fuse_writepage_args *next; struct inode *inode; struct fuse_sync_bucket *bucket; }; -static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi, - pgoff_t idx_from, pgoff_t idx_to) -{ - struct rb_node *n; - - n = fi->writepages.rb_node; - - while (n) { - struct fuse_writepage_args *wpa; - pgoff_t curr_index; - - wpa = rb_entry(n, struct fuse_writepage_args, writepages_entry); - WARN_ON(get_fuse_inode(wpa->inode) != fi); - curr_index = wpa->ia.write.in.offset >> PAGE_SHIFT; - if (idx_from >= curr_index + wpa->ia.ap.num_folios) - n = n->rb_right; - else if (idx_to < curr_index) - n = n->rb_left; - else - return wpa; - } - return NULL; -} - -/* - * Check if any page in a range is under writeback - */ -static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from, - pgoff_t idx_to) -{ - struct fuse_inode *fi = get_fuse_inode(inode); - bool found; - - if (RB_EMPTY_ROOT(&fi->writepages)) - return false; - - spin_lock(&fi->lock); - found = fuse_find_writeback(fi, idx_from, idx_to); - spin_unlock(&fi->lock); - - return found; -} - -static inline bool fuse_page_is_writeback(struct inode *inode, pgoff_t index) -{ - return fuse_range_is_writeback(inode, index, index); -} - -/* - * Wait for page writeback to be completed. - * - * Since fuse doesn't rely on the VM writeback tracking, this has to - * use some other means. - */ -static void fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index) -{ - struct fuse_inode *fi = get_fuse_inode(inode); - - wait_event(fi->page_waitq, !fuse_page_is_writeback(inode, index)); -} - -static inline bool fuse_folio_is_writeback(struct inode *inode, - struct folio *folio) -{ - pgoff_t last = folio_next_index(folio) - 1; - return fuse_range_is_writeback(inode, folio_index(folio), last); -} - -static void fuse_wait_on_folio_writeback(struct inode *inode, - struct folio *folio) -{ - struct fuse_inode *fi = get_fuse_inode(inode); - - wait_event(fi->page_waitq, !fuse_folio_is_writeback(inode, folio)); -} - /* * Wait for all pending writepages on the inode to finish. * @@ -532,10 +455,6 @@ static int fuse_flush(struct file *file, fl_owner_t id) if (err) return err; - inode_lock(inode); - fuse_sync_writes(inode); - inode_unlock(inode); - err = filemap_check_errors(file->f_mapping); if (err) return err; @@ -870,12 +789,16 @@ static void fuse_short_read(struct inode *inode, u64 attr_ver, size_t num_read, } } -static int fuse_do_readfolio(struct file *file, struct folio *folio) +static int fuse_do_readfolio(struct file *file, struct folio *folio, + size_t off, size_t len) { struct inode *inode = folio->mapping->host; struct fuse_mount *fm = get_fuse_mount(inode); - loff_t pos = folio_pos(folio); - struct fuse_folio_desc desc = { .length = PAGE_SIZE }; + loff_t pos = folio_pos(folio) + off; + struct fuse_folio_desc desc = { + .offset = off, + .length = len, + }; struct fuse_io_args ia = { .ap.args.page_zeroing = true, .ap.args.out_pages = true, @@ -886,13 +809,6 @@ static int fuse_do_readfolio(struct file *file, struct folio *folio) ssize_t res; u64 attr_ver; - /* - * With the temporary pages that are used to complete writeback, we can - * have writeback that extends beyond the lifetime of the folio. So - * make sure we read a properly synced folio. - */ - fuse_wait_on_folio_writeback(inode, folio); - attr_ver = fuse_get_attr_version(fm->fc); /* Don't overflow end offset */ @@ -909,8 +825,6 @@ static int fuse_do_readfolio(struct file *file, struct folio *folio) if (res < desc.length) fuse_short_read(inode, attr_ver, res, &ia.ap); - folio_mark_uptodate(folio); - return 0; } @@ -923,13 +837,26 @@ static int fuse_read_folio(struct file *file, struct folio *folio) if (fuse_is_bad(inode)) goto out; - err = fuse_do_readfolio(file, folio); + err = fuse_do_readfolio(file, folio, 0, folio_size(folio)); + if (!err) + folio_mark_uptodate(folio); + fuse_invalidate_atime(inode); out: folio_unlock(folio); return err; } +static int fuse_iomap_read_folio_range(const struct iomap_iter *iter, + struct folio *folio, loff_t pos, + size_t len) +{ + struct file *file = iter->private; + size_t off = offset_in_folio(folio, pos); + + return fuse_do_readfolio(file, folio, off, len); +} + static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args, int err) { @@ -955,22 +882,23 @@ static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args, fuse_invalidate_atime(inode); } - for (i = 0; i < ap->num_folios; i++) + for (i = 0; i < ap->num_folios; i++) { folio_end_read(ap->folios[i], !err); + folio_put(ap->folios[i]); + } if (ia->ff) fuse_file_put(ia->ff, false); fuse_io_free(ia); } -static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file) +static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file, + unsigned int count) { struct fuse_file *ff = file->private_data; struct fuse_mount *fm = ff->fm; struct fuse_args_pages *ap = &ia->ap; loff_t pos = folio_pos(ap->folios[0]); - /* Currently, all folios in FUSE are one page */ - size_t count = ap->num_folios << PAGE_SHIFT; ssize_t res; int err; @@ -1003,17 +931,13 @@ static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file) static void fuse_readahead(struct readahead_control *rac) { struct inode *inode = rac->mapping->host; - struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_conn *fc = get_fuse_conn(inode); unsigned int max_pages, nr_pages; - pgoff_t first = readahead_index(rac); - pgoff_t last = first + readahead_count(rac) - 1; + struct folio *folio = NULL; if (fuse_is_bad(inode)) return; - wait_event(fi->page_waitq, !fuse_range_is_writeback(inode, first, last)); - max_pages = min_t(unsigned int, fc->max_pages, fc->max_read / PAGE_SIZE); @@ -1031,8 +955,8 @@ static void fuse_readahead(struct readahead_control *rac) while (nr_pages) { struct fuse_io_args *ia; struct fuse_args_pages *ap; - struct folio *folio; unsigned cur_pages = min(max_pages, nr_pages); + unsigned int pages = 0; if (fc->num_background >= fc->congestion_threshold && rac->ra->async_size >= readahead_count(rac)) @@ -1044,17 +968,44 @@ static void fuse_readahead(struct readahead_control *rac) ia = fuse_io_alloc(NULL, cur_pages); if (!ia) - return; + break; ap = &ia->ap; - while (ap->num_folios < cur_pages) { - folio = readahead_folio(rac); + while (pages < cur_pages) { + unsigned int folio_pages; + + /* + * This returns a folio with a ref held on it. + * The ref needs to be held until the request is + * completed, since the splice case (see + * fuse_try_move_page()) drops the ref after it's + * replaced in the page cache. + */ + if (!folio) + folio = __readahead_folio(rac); + + folio_pages = folio_nr_pages(folio); + if (folio_pages > cur_pages - pages) { + /* + * Large folios belonging to fuse will never + * have more pages than max_pages. + */ + WARN_ON(!pages); + break; + } + ap->folios[ap->num_folios] = folio; ap->descs[ap->num_folios].length = folio_size(folio); ap->num_folios++; + pages += folio_pages; + folio = NULL; } - fuse_send_readpages(ia, rac->file); - nr_pages -= cur_pages; + fuse_send_readpages(ia, rac->file, pages << PAGE_SHIFT); + nr_pages -= pages; + } + if (folio) { + folio_end_read(folio, false); + folio_put(folio); } } @@ -1172,7 +1123,7 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia, int err; for (i = 0; i < ap->num_folios; i++) - fuse_wait_on_folio_writeback(inode, ap->folios[i]); + folio_wait_writeback(ap->folios[i]); fuse_write_args_fill(ia, ff, pos, count); ia->write.in.flags = fuse_write_flags(iocb); @@ -1212,32 +1163,28 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia, static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia, struct address_space *mapping, struct iov_iter *ii, loff_t pos, - unsigned int max_pages) + unsigned int max_folios) { struct fuse_args_pages *ap = &ia->ap; struct fuse_conn *fc = get_fuse_conn(mapping->host); unsigned offset = pos & (PAGE_SIZE - 1); - unsigned int nr_pages = 0; size_t count = 0; - int err; + unsigned int num; + int err = 0; + + num = min(iov_iter_count(ii), fc->max_write); ap->args.in_pages = true; ap->descs[0].offset = offset; - do { + while (num && ap->num_folios < max_folios) { size_t tmp; struct folio *folio; pgoff_t index = pos >> PAGE_SHIFT; - size_t bytes = min_t(size_t, PAGE_SIZE - offset, - iov_iter_count(ii)); - - bytes = min_t(size_t, bytes, fc->max_write - count); + unsigned int bytes; + unsigned int folio_offset; again: - err = -EFAULT; - if (fault_in_iov_iter_readable(ii, bytes)) - break; - folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, mapping_gfp_mask(mapping)); if (IS_ERR(folio)) { @@ -1248,29 +1195,42 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia, if (mapping_writably_mapped(mapping)) flush_dcache_folio(folio); - tmp = copy_folio_from_iter_atomic(folio, offset, bytes, ii); + folio_offset = ((index - folio->index) << PAGE_SHIFT) + offset; + bytes = min(folio_size(folio) - folio_offset, num); + + tmp = copy_folio_from_iter_atomic(folio, folio_offset, bytes, ii); flush_dcache_folio(folio); if (!tmp) { folio_unlock(folio); folio_put(folio); + + /* + * Ensure forward progress by faulting in + * while not holding the folio lock: + */ + if (fault_in_iov_iter_readable(ii, bytes)) { + err = -EFAULT; + break; + } + goto again; } - err = 0; ap->folios[ap->num_folios] = folio; + ap->descs[ap->num_folios].offset = folio_offset; ap->descs[ap->num_folios].length = tmp; ap->num_folios++; - nr_pages++; count += tmp; pos += tmp; + num -= tmp; offset += tmp; - if (offset == PAGE_SIZE) + if (offset == folio_size(folio)) offset = 0; - /* If we copied full page, mark it uptodate */ - if (tmp == PAGE_SIZE) + /* If we copied full folio, mark it uptodate */ + if (tmp == folio_size(folio)) folio_mark_uptodate(folio); if (folio_test_uptodate(folio)) { @@ -1279,10 +1239,9 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia, ia->write.folio_locked = true; break; } - if (!fc->big_writes) + if (!fc->big_writes || offset != 0) break; - } while (iov_iter_count(ii) && count < fc->max_write && - nr_pages < max_pages && offset == 0); + } return count > 0 ? count : err; } @@ -1431,6 +1390,24 @@ static void fuse_dio_unlock(struct kiocb *iocb, bool exclusive) } } +static const struct iomap_write_ops fuse_iomap_write_ops = { + .read_folio_range = fuse_iomap_read_folio_range, +}; + +static int fuse_iomap_begin(struct inode *inode, loff_t offset, loff_t length, + unsigned int flags, struct iomap *iomap, + struct iomap *srcmap) +{ + iomap->type = IOMAP_MAPPED; + iomap->length = length; + iomap->offset = offset; + return 0; +} + +static const struct iomap_ops fuse_iomap_ops = { + .iomap_begin = fuse_iomap_begin, +}; + static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; @@ -1440,6 +1417,7 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) struct inode *inode = mapping->host; ssize_t err, count; struct fuse_conn *fc = get_fuse_conn(inode); + bool writeback = false; if (fc->writeback_cache) { /* Update size (EOF optimization) and mode (SUID clearing) */ @@ -1448,16 +1426,11 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) if (err) return err; - if (fc->handle_killpriv_v2 && - setattr_should_drop_suidgid(idmap, - file_inode(file))) { - goto writethrough; - } - - return generic_file_write_iter(iocb, from); + if (!fc->handle_killpriv_v2 || + !setattr_should_drop_suidgid(idmap, file_inode(file))) + writeback = true; } -writethrough: inode_lock(inode); err = count = generic_write_checks(iocb, from); @@ -1476,6 +1449,15 @@ writethrough: goto out; written = direct_write_fallback(iocb, from, written, fuse_perform_write(iocb, from)); + } else if (writeback) { + /* + * Use iomap so that we can do granular uptodate reads + * and granular dirty tracking for large folios. + */ + written = iomap_file_buffered_write(iocb, from, + &fuse_iomap_ops, + &fuse_iomap_write_ops, + file); } else { written = fuse_perform_write(iocb, from); } @@ -1541,8 +1523,10 @@ static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii, */ struct page **pages = kzalloc(max_pages * sizeof(struct page *), GFP_KERNEL); - if (!pages) - return -ENOMEM; + if (!pages) { + ret = -ENOMEM; + goto out; + } while (nbytes < *nbytesp && nr_pages < max_pages) { unsigned nfolios, i; @@ -1557,18 +1541,22 @@ static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii, nbytes += ret; - ret += start; - /* Currently, all folios in FUSE are one page */ - nfolios = DIV_ROUND_UP(ret, PAGE_SIZE); + nfolios = DIV_ROUND_UP(ret + start, PAGE_SIZE); - ap->descs[ap->num_folios].offset = start; - fuse_folio_descs_length_init(ap->descs, ap->num_folios, nfolios); - for (i = 0; i < nfolios; i++) - ap->folios[i + ap->num_folios] = page_folio(pages[i]); + for (i = 0; i < nfolios; i++) { + struct folio *folio = page_folio(pages[i]); + unsigned int offset = start + + (folio_page_idx(folio, pages[i]) << PAGE_SHIFT); + unsigned int len = min_t(unsigned int, ret, PAGE_SIZE - start); + + ap->descs[ap->num_folios].offset = offset; + ap->descs[ap->num_folios].length = len; + ap->folios[ap->num_folios] = folio; + start = 0; + ret -= len; + ap->num_folios++; + } - ap->num_folios += nfolios; - ap->descs[ap->num_folios - 1].length -= - (PAGE_SIZE - ret) & (PAGE_SIZE - 1); nr_pages += nfolios; } kfree(pages); @@ -1584,6 +1572,7 @@ static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii, else ap->args.out_pages = true; +out: *nbytesp = nbytes; return ret < 0 ? ret : 0; @@ -1622,7 +1611,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, return res; } } - if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) { + if (!cuse && filemap_range_has_writeback(mapping, pos, (pos + count - 1))) { if (!write) inode_lock(inode); fuse_sync_writes(inode); @@ -1819,38 +1808,34 @@ static ssize_t fuse_splice_write(struct pipe_inode_info *pipe, struct file *out, static void fuse_writepage_free(struct fuse_writepage_args *wpa) { struct fuse_args_pages *ap = &wpa->ia.ap; - int i; if (wpa->bucket) fuse_sync_bucket_dec(wpa->bucket); - for (i = 0; i < ap->num_folios; i++) - folio_put(ap->folios[i]); - fuse_file_put(wpa->ia.ff, false); kfree(ap->folios); kfree(wpa); } -static void fuse_writepage_finish_stat(struct inode *inode, struct folio *folio) -{ - struct backing_dev_info *bdi = inode_to_bdi(inode); - - dec_wb_stat(&bdi->wb, WB_WRITEBACK); - node_stat_sub_folio(folio, NR_WRITEBACK_TEMP); - wb_writeout_inc(&bdi->wb); -} - static void fuse_writepage_finish(struct fuse_writepage_args *wpa) { struct fuse_args_pages *ap = &wpa->ia.ap; struct inode *inode = wpa->inode; struct fuse_inode *fi = get_fuse_inode(inode); + struct backing_dev_info *bdi = inode_to_bdi(inode); int i; - for (i = 0; i < ap->num_folios; i++) - fuse_writepage_finish_stat(inode, ap->folios[i]); + for (i = 0; i < ap->num_folios; i++) { + /* + * Benchmarks showed that ending writeback within the + * scope of the fi->lock alleviates xarray lock + * contention and noticeably improves performance. + */ + iomap_finish_folio_write(inode, ap->folios[i], 1); + dec_wb_stat(&bdi->wb, WB_WRITEBACK); + wb_writeout_inc(&bdi->wb); + } wake_up(&fi->page_waitq); } @@ -1861,13 +1846,15 @@ static void fuse_send_writepage(struct fuse_mount *fm, __releases(fi->lock) __acquires(fi->lock) { - struct fuse_writepage_args *aux, *next; struct fuse_inode *fi = get_fuse_inode(wpa->inode); + struct fuse_args_pages *ap = &wpa->ia.ap; struct fuse_write_in *inarg = &wpa->ia.write.in; - struct fuse_args *args = &wpa->ia.ap.args; - /* Currently, all folios in FUSE are one page */ - __u64 data_size = wpa->ia.ap.num_folios * PAGE_SIZE; - int err; + struct fuse_args *args = &ap->args; + __u64 data_size = 0; + int err, i; + + for (i = 0; i < ap->num_folios; i++) + data_size += ap->descs[i].length; fi->writectr++; if (inarg->offset + data_size <= size) { @@ -1898,19 +1885,8 @@ __acquires(fi->lock) out_free: fi->writectr--; - rb_erase(&wpa->writepages_entry, &fi->writepages); fuse_writepage_finish(wpa); spin_unlock(&fi->lock); - - /* After rb_erase() aux request list is private */ - for (aux = wpa->next; aux; aux = next) { - next = aux->next; - aux->next = NULL; - fuse_writepage_finish_stat(aux->inode, - aux->ia.ap.folios[0]); - fuse_writepage_free(aux); - } - fuse_writepage_free(wpa); spin_lock(&fi->lock); } @@ -1938,43 +1914,6 @@ __acquires(fi->lock) } } -static struct fuse_writepage_args *fuse_insert_writeback(struct rb_root *root, - struct fuse_writepage_args *wpa) -{ - pgoff_t idx_from = wpa->ia.write.in.offset >> PAGE_SHIFT; - pgoff_t idx_to = idx_from + wpa->ia.ap.num_folios - 1; - struct rb_node **p = &root->rb_node; - struct rb_node *parent = NULL; - - WARN_ON(!wpa->ia.ap.num_folios); - while (*p) { - struct fuse_writepage_args *curr; - pgoff_t curr_index; - - parent = *p; - curr = rb_entry(parent, struct fuse_writepage_args, - writepages_entry); - WARN_ON(curr->inode != wpa->inode); - curr_index = curr->ia.write.in.offset >> PAGE_SHIFT; - - if (idx_from >= curr_index + curr->ia.ap.num_folios) - p = &(*p)->rb_right; - else if (idx_to < curr_index) - p = &(*p)->rb_left; - else - return curr; - } - - rb_link_node(&wpa->writepages_entry, parent, p); - rb_insert_color(&wpa->writepages_entry, root); - return NULL; -} - -static void tree_insert(struct rb_root *root, struct fuse_writepage_args *wpa) -{ - WARN_ON(fuse_insert_writeback(root, wpa)); -} - static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args, int error) { @@ -1994,41 +1933,6 @@ static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args, if (!fc->writeback_cache) fuse_invalidate_attr_mask(inode, FUSE_STATX_MODIFY); spin_lock(&fi->lock); - rb_erase(&wpa->writepages_entry, &fi->writepages); - while (wpa->next) { - struct fuse_mount *fm = get_fuse_mount(inode); - struct fuse_write_in *inarg = &wpa->ia.write.in; - struct fuse_writepage_args *next = wpa->next; - - wpa->next = next->next; - next->next = NULL; - tree_insert(&fi->writepages, next); - - /* - * Skip fuse_flush_writepages() to make it easy to crop requests - * based on primary request size. - * - * 1st case (trivial): there are no concurrent activities using - * fuse_set/release_nowrite. Then we're on safe side because - * fuse_flush_writepages() would call fuse_send_writepage() - * anyway. - * - * 2nd case: someone called fuse_set_nowrite and it is waiting - * now for completion of all in-flight requests. This happens - * rarely and no more than once per page, so this should be - * okay. - * - * 3rd case: someone (e.g. fuse_do_setattr()) is in the middle - * of fuse_set_nowrite..fuse_release_nowrite section. The fact - * that fuse_set_nowrite returned implies that all in-flight - * requests were completed along with all of their secondary - * requests. Further primary requests are blocked by negative - * writectr. Hence there cannot be any in-flight requests and - * no invocations of fuse_writepage_end() while we're in - * fuse_set_nowrite..fuse_release_nowrite section. - */ - fuse_send_writepage(fm, next, inarg->offset + inarg->size); - } fi->writectr--; fuse_writepage_finish(wpa); spin_unlock(&fi->lock); @@ -2062,17 +1966,6 @@ int fuse_write_inode(struct inode *inode, struct writeback_control *wbc) struct fuse_file *ff; int err; - /* - * Inode is always written before the last reference is dropped and - * hence this should not be reached from reclaim. - * - * Writing back the inode from reclaim can deadlock if the request - * processing itself needs an allocation. Allocations triggering - * reclaim while serving a request can't be prevented, because it can - * involve any number of unrelated userspace processes. - */ - WARN_ON(wbc->for_reclaim); - ff = __fuse_write_file_get(fi); err = fuse_flush_times(inode, ff); if (ff) @@ -2115,22 +2008,20 @@ static void fuse_writepage_add_to_bucket(struct fuse_conn *fc, } static void fuse_writepage_args_page_fill(struct fuse_writepage_args *wpa, struct folio *folio, - struct folio *tmp_folio, uint32_t folio_index) + uint32_t folio_index, loff_t offset, unsigned len) { struct inode *inode = folio->mapping->host; struct fuse_args_pages *ap = &wpa->ia.ap; - folio_copy(tmp_folio, folio); - - ap->folios[folio_index] = tmp_folio; - ap->descs[folio_index].offset = 0; - ap->descs[folio_index].length = PAGE_SIZE; + ap->folios[folio_index] = folio; + ap->descs[folio_index].offset = offset; + ap->descs[folio_index].length = len; inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); - node_stat_add_folio(tmp_folio, NR_WRITEBACK_TEMP); } static struct fuse_writepage_args *fuse_writepage_args_setup(struct folio *folio, + size_t offset, struct fuse_file *ff) { struct inode *inode = folio->mapping->host; @@ -2143,7 +2034,7 @@ static struct fuse_writepage_args *fuse_writepage_args_setup(struct folio *folio return NULL; fuse_writepage_add_to_bucket(fc, wpa); - fuse_write_args_fill(&wpa->ia, ff, folio_pos(folio), 0); + fuse_write_args_fill(&wpa->ia, ff, folio_pos(folio) + offset, 0); wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE; wpa->inode = inode; wpa->ia.ff = ff; @@ -2155,74 +2046,28 @@ static struct fuse_writepage_args *fuse_writepage_args_setup(struct folio *folio return wpa; } -static int fuse_writepage_locked(struct folio *folio) -{ - struct address_space *mapping = folio->mapping; - struct inode *inode = mapping->host; - struct fuse_inode *fi = get_fuse_inode(inode); - struct fuse_writepage_args *wpa; - struct fuse_args_pages *ap; - struct folio *tmp_folio; - struct fuse_file *ff; - int error = -ENOMEM; - - tmp_folio = folio_alloc(GFP_NOFS | __GFP_HIGHMEM, 0); - if (!tmp_folio) - goto err; - - error = -EIO; - ff = fuse_write_file_get(fi); - if (!ff) - goto err_nofile; - - wpa = fuse_writepage_args_setup(folio, ff); - error = -ENOMEM; - if (!wpa) - goto err_writepage_args; - - ap = &wpa->ia.ap; - ap->num_folios = 1; - - folio_start_writeback(folio); - fuse_writepage_args_page_fill(wpa, folio, tmp_folio, 0); - - spin_lock(&fi->lock); - tree_insert(&fi->writepages, wpa); - list_add_tail(&wpa->queue_entry, &fi->queued_writes); - fuse_flush_writepages(inode); - spin_unlock(&fi->lock); - - folio_end_writeback(folio); - - return 0; - -err_writepage_args: - fuse_file_put(ff, false); -err_nofile: - folio_put(tmp_folio); -err: - mapping_set_error(folio->mapping, error); - return error; -} - struct fuse_fill_wb_data { struct fuse_writepage_args *wpa; struct fuse_file *ff; - struct inode *inode; - struct folio **orig_folios; unsigned int max_folios; + /* + * nr_bytes won't overflow since fuse_writepage_need_send() caps + * wb requests to never exceed fc->max_pages (which has an upper bound + * of U16_MAX). + */ + unsigned int nr_bytes; }; -static bool fuse_pages_realloc(struct fuse_fill_wb_data *data) +static bool fuse_pages_realloc(struct fuse_fill_wb_data *data, + unsigned int max_pages) { struct fuse_args_pages *ap = &data->wpa->ia.ap; - struct fuse_conn *fc = get_fuse_conn(data->inode); struct folio **folios; struct fuse_folio_desc *descs; unsigned int nfolios = min_t(unsigned int, max_t(unsigned int, data->max_folios * 2, FUSE_DEFAULT_MAX_PAGES_PER_REQ), - fc->max_pages); + max_pages); WARN_ON(nfolios <= data->max_folios); folios = fuse_folios_alloc(nfolios, GFP_NOFS, &descs); @@ -2239,319 +2084,162 @@ static bool fuse_pages_realloc(struct fuse_fill_wb_data *data) return true; } -static void fuse_writepages_send(struct fuse_fill_wb_data *data) +static void fuse_writepages_send(struct inode *inode, + struct fuse_fill_wb_data *data) { struct fuse_writepage_args *wpa = data->wpa; - struct inode *inode = data->inode; struct fuse_inode *fi = get_fuse_inode(inode); - int num_folios = wpa->ia.ap.num_folios; - int i; spin_lock(&fi->lock); list_add_tail(&wpa->queue_entry, &fi->queued_writes); fuse_flush_writepages(inode); spin_unlock(&fi->lock); - - for (i = 0; i < num_folios; i++) - folio_end_writeback(data->orig_folios[i]); -} - -/* - * Check under fi->lock if the page is under writeback, and insert it onto the - * rb_tree if not. Otherwise iterate auxiliary write requests, to see if there's - * one already added for a page at this offset. If there's none, then insert - * this new request onto the auxiliary list, otherwise reuse the existing one by - * swapping the new temp page with the old one. - */ -static bool fuse_writepage_add(struct fuse_writepage_args *new_wpa, - struct folio *folio) -{ - struct fuse_inode *fi = get_fuse_inode(new_wpa->inode); - struct fuse_writepage_args *tmp; - struct fuse_writepage_args *old_wpa; - struct fuse_args_pages *new_ap = &new_wpa->ia.ap; - - WARN_ON(new_ap->num_folios != 0); - new_ap->num_folios = 1; - - spin_lock(&fi->lock); - old_wpa = fuse_insert_writeback(&fi->writepages, new_wpa); - if (!old_wpa) { - spin_unlock(&fi->lock); - return true; - } - - for (tmp = old_wpa->next; tmp; tmp = tmp->next) { - pgoff_t curr_index; - - WARN_ON(tmp->inode != new_wpa->inode); - curr_index = tmp->ia.write.in.offset >> PAGE_SHIFT; - if (curr_index == folio->index) { - WARN_ON(tmp->ia.ap.num_folios != 1); - swap(tmp->ia.ap.folios[0], new_ap->folios[0]); - break; - } - } - - if (!tmp) { - new_wpa->next = old_wpa->next; - old_wpa->next = new_wpa; - } - - spin_unlock(&fi->lock); - - if (tmp) { - fuse_writepage_finish_stat(new_wpa->inode, - folio); - fuse_writepage_free(new_wpa); - } - - return false; } -static bool fuse_writepage_need_send(struct fuse_conn *fc, struct folio *folio, - struct fuse_args_pages *ap, +static bool fuse_writepage_need_send(struct fuse_conn *fc, loff_t pos, + unsigned len, struct fuse_args_pages *ap, struct fuse_fill_wb_data *data) { - WARN_ON(!ap->num_folios); + struct folio *prev_folio; + struct fuse_folio_desc prev_desc; + unsigned bytes = data->nr_bytes + len; + loff_t prev_pos; - /* - * Being under writeback is unlikely but possible. For example direct - * read to an mmaped fuse file will set the page dirty twice; once when - * the pages are faulted with get_user_pages(), and then after the read - * completed. - */ - if (fuse_folio_is_writeback(data->inode, folio)) - return true; + WARN_ON(!ap->num_folios); /* Reached max pages */ - if (ap->num_folios == fc->max_pages) + if ((bytes + PAGE_SIZE - 1) >> PAGE_SHIFT > fc->max_pages) return true; /* Reached max write bytes */ - if ((ap->num_folios + 1) * PAGE_SIZE > fc->max_write) + if (bytes > fc->max_write) return true; /* Discontinuity */ - if (data->orig_folios[ap->num_folios - 1]->index + 1 != folio_index(folio)) + prev_folio = ap->folios[ap->num_folios - 1]; + prev_desc = ap->descs[ap->num_folios - 1]; + prev_pos = folio_pos(prev_folio) + prev_desc.offset + prev_desc.length; + if (prev_pos != pos) return true; /* Need to grow the pages array? If so, did the expansion fail? */ - if (ap->num_folios == data->max_folios && !fuse_pages_realloc(data)) + if (ap->num_folios == data->max_folios && + !fuse_pages_realloc(data, fc->max_pages)) return true; return false; } -static int fuse_writepages_fill(struct folio *folio, - struct writeback_control *wbc, void *_data) +static ssize_t fuse_iomap_writeback_range(struct iomap_writepage_ctx *wpc, + struct folio *folio, u64 pos, + unsigned len, u64 end_pos) { - struct fuse_fill_wb_data *data = _data; + struct fuse_fill_wb_data *data = wpc->wb_ctx; struct fuse_writepage_args *wpa = data->wpa; struct fuse_args_pages *ap = &wpa->ia.ap; - struct inode *inode = data->inode; + struct inode *inode = wpc->inode; struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_conn *fc = get_fuse_conn(inode); - struct folio *tmp_folio; - int err; + loff_t offset = offset_in_folio(folio, pos); + + WARN_ON_ONCE(!data); if (!data->ff) { - err = -EIO; data->ff = fuse_write_file_get(fi); if (!data->ff) - goto out_unlock; + return -EIO; } - if (wpa && fuse_writepage_need_send(fc, folio, ap, data)) { - fuse_writepages_send(data); + if (wpa && fuse_writepage_need_send(fc, pos, len, ap, data)) { + fuse_writepages_send(inode, data); data->wpa = NULL; + data->nr_bytes = 0; } - err = -ENOMEM; - tmp_folio = folio_alloc(GFP_NOFS | __GFP_HIGHMEM, 0); - if (!tmp_folio) - goto out_unlock; - - /* - * The page must not be redirtied until the writeout is completed - * (i.e. userspace has sent a reply to the write request). Otherwise - * there could be more than one temporary page instance for each real - * page. - * - * This is ensured by holding the page lock in page_mkwrite() while - * checking fuse_page_is_writeback(). We already hold the page lock - * since clear_page_dirty_for_io() and keep it held until we add the - * request to the fi->writepages list and increment ap->num_folios. - * After this fuse_page_is_writeback() will indicate that the page is - * under writeback, so we can release the page lock. - */ if (data->wpa == NULL) { - err = -ENOMEM; - wpa = fuse_writepage_args_setup(folio, data->ff); - if (!wpa) { - folio_put(tmp_folio); - goto out_unlock; - } + wpa = fuse_writepage_args_setup(folio, offset, data->ff); + if (!wpa) + return -ENOMEM; fuse_file_get(wpa->ia.ff); data->max_folios = 1; ap = &wpa->ia.ap; } - folio_start_writeback(folio); - fuse_writepage_args_page_fill(wpa, folio, tmp_folio, ap->num_folios); - data->orig_folios[ap->num_folios] = folio; + iomap_start_folio_write(inode, folio, 1); + fuse_writepage_args_page_fill(wpa, folio, ap->num_folios, + offset, len); + data->nr_bytes += len; - err = 0; - if (data->wpa) { - /* - * Protected by fi->lock against concurrent access by - * fuse_page_is_writeback(). - */ - spin_lock(&fi->lock); - ap->num_folios++; - spin_unlock(&fi->lock); - } else if (fuse_writepage_add(wpa, folio)) { + ap->num_folios++; + if (!data->wpa) data->wpa = wpa; - } else { - folio_end_writeback(folio); + + return len; +} + +static int fuse_iomap_writeback_submit(struct iomap_writepage_ctx *wpc, + int error) +{ + struct fuse_fill_wb_data *data = wpc->wb_ctx; + + WARN_ON_ONCE(!data); + + if (data->wpa) { + WARN_ON(!data->wpa->ia.ap.num_folios); + fuse_writepages_send(wpc->inode, data); } -out_unlock: - folio_unlock(folio); - return err; + if (data->ff) + fuse_file_put(data->ff, false); + + return error; } +static const struct iomap_writeback_ops fuse_writeback_ops = { + .writeback_range = fuse_iomap_writeback_range, + .writeback_submit = fuse_iomap_writeback_submit, +}; + static int fuse_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct inode *inode = mapping->host; struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_fill_wb_data data; - int err; + struct fuse_fill_wb_data data = {}; + struct iomap_writepage_ctx wpc = { + .inode = inode, + .iomap.type = IOMAP_MAPPED, + .wbc = wbc, + .ops = &fuse_writeback_ops, + .wb_ctx = &data, + }; - err = -EIO; if (fuse_is_bad(inode)) - goto out; + return -EIO; if (wbc->sync_mode == WB_SYNC_NONE && fc->num_background >= fc->congestion_threshold) return 0; - data.inode = inode; - data.wpa = NULL; - data.ff = NULL; - - err = -ENOMEM; - data.orig_folios = kcalloc(fc->max_pages, - sizeof(struct folio *), - GFP_NOFS); - if (!data.orig_folios) - goto out; - - err = write_cache_pages(mapping, wbc, fuse_writepages_fill, &data); - if (data.wpa) { - WARN_ON(!data.wpa->ia.ap.num_folios); - fuse_writepages_send(&data); - } - if (data.ff) - fuse_file_put(data.ff, false); - - kfree(data.orig_folios); -out: - return err; -} - -/* - * It's worthy to make sure that space is reserved on disk for the write, - * but how to implement it without killing performance need more thinking. - */ -static int fuse_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, struct folio **foliop, void **fsdata) -{ - pgoff_t index = pos >> PAGE_SHIFT; - struct fuse_conn *fc = get_fuse_conn(file_inode(file)); - struct folio *folio; - loff_t fsize; - int err = -ENOMEM; - - WARN_ON(!fc->writeback_cache); - - folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, - mapping_gfp_mask(mapping)); - if (IS_ERR(folio)) - goto error; - - fuse_wait_on_page_writeback(mapping->host, folio->index); - - if (folio_test_uptodate(folio) || len >= folio_size(folio)) - goto success; - /* - * Check if the start of this folio comes after the end of file, - * in which case the readpage can be optimized away. - */ - fsize = i_size_read(mapping->host); - if (fsize <= folio_pos(folio)) { - size_t off = offset_in_folio(folio, pos); - if (off) - folio_zero_segment(folio, 0, off); - goto success; - } - err = fuse_do_readfolio(file, folio); - if (err) - goto cleanup; -success: - *foliop = folio; - return 0; - -cleanup: - folio_unlock(folio); - folio_put(folio); -error: - return err; -} - -static int fuse_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct folio *folio, void *fsdata) -{ - struct inode *inode = folio->mapping->host; - - /* Haven't copied anything? Skip zeroing, size extending, dirtying. */ - if (!copied) - goto unlock; - - pos += copied; - if (!folio_test_uptodate(folio)) { - /* Zero any unwritten bytes at the end of the page */ - size_t endoff = pos & ~PAGE_MASK; - if (endoff) - folio_zero_segment(folio, endoff, PAGE_SIZE); - folio_mark_uptodate(folio); - } - - if (pos > inode->i_size) - i_size_write(inode, pos); - - folio_mark_dirty(folio); - -unlock: - folio_unlock(folio); - folio_put(folio); - - return copied; + return iomap_writepages(&wpc); } static int fuse_launder_folio(struct folio *folio) { int err = 0; - if (folio_clear_dirty_for_io(folio)) { - struct inode *inode = folio->mapping->host; + struct fuse_fill_wb_data data = {}; + struct iomap_writepage_ctx wpc = { + .inode = folio->mapping->host, + .iomap.type = IOMAP_MAPPED, + .ops = &fuse_writeback_ops, + .wb_ctx = &data, + }; - /* Serialize with pending writeback for the same page */ - fuse_wait_on_page_writeback(inode, folio->index); - err = fuse_writepage_locked(folio); + if (folio_clear_dirty_for_io(folio)) { + err = iomap_writeback_folio(&wpc, folio); + err = fuse_iomap_writeback_submit(&wpc, err); if (!err) - fuse_wait_on_page_writeback(inode, folio->index); + folio_wait_writeback(folio); } return err; } @@ -2595,7 +2283,7 @@ static vm_fault_t fuse_page_mkwrite(struct vm_fault *vmf) return VM_FAULT_NOPAGE; } - fuse_wait_on_folio_writeback(inode, folio); + folio_wait_writeback(folio); return VM_FAULT_LOCKED; } @@ -3189,7 +2877,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, inode_lock(inode); if (block_faults) { filemap_invalidate_lock(inode->i_mapping); - err = fuse_dax_break_layouts(inode, 0, 0); + err = fuse_dax_break_layouts(inode, 0, -1); if (err) goto out; } @@ -3402,20 +3090,24 @@ static const struct address_space_operations fuse_file_aops = { .readahead = fuse_readahead, .writepages = fuse_writepages, .launder_folio = fuse_launder_folio, - .dirty_folio = filemap_dirty_folio, + .dirty_folio = iomap_dirty_folio, + .release_folio = iomap_release_folio, + .invalidate_folio = iomap_invalidate_folio, + .is_partially_uptodate = iomap_is_partially_uptodate, .migrate_folio = filemap_migrate_folio, .bmap = fuse_bmap, .direct_IO = fuse_direct_IO, - .write_begin = fuse_write_begin, - .write_end = fuse_write_end, }; void fuse_init_file_inode(struct inode *inode, unsigned int flags) { struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_conn *fc = get_fuse_conn(inode); inode->i_fop = &fuse_file_operations; inode->i_data.a_ops = &fuse_file_aops; + if (fc->writeback_cache) + mapping_set_writeback_may_deadlock_on_reclaim(&inode->i_data); INIT_LIST_HEAD(&fi->write_files); INIT_LIST_HEAD(&fi->queued_writes); @@ -3423,7 +3115,6 @@ void fuse_init_file_inode(struct inode *inode, unsigned int flags) fi->iocachectr = 0; init_waitqueue_head(&fi->page_waitq); init_waitqueue_head(&fi->direct_io_waitq); - fi->writepages = RB_ROOT; if (IS_ENABLED(CONFIG_FUSE_DAX)) fuse_dax_inode_init(inode, flags); diff --git a/fs/fuse/fuse_dev_i.h b/fs/fuse/fuse_dev_i.h new file mode 100644 index 000000000000..5a9bd771a319 --- /dev/null +++ b/fs/fuse/fuse_dev_i.h @@ -0,0 +1,69 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * FUSE: Filesystem in Userspace + * Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu> + */ +#ifndef _FS_FUSE_DEV_I_H +#define _FS_FUSE_DEV_I_H + +#include <linux/types.h> + +/* Ordinary requests have even IDs, while interrupts IDs are odd */ +#define FUSE_INT_REQ_BIT (1ULL << 0) +#define FUSE_REQ_ID_STEP (1ULL << 1) + +struct fuse_arg; +struct fuse_args; +struct fuse_pqueue; +struct fuse_req; +struct fuse_iqueue; +struct fuse_forget_link; + +struct fuse_copy_state { + struct fuse_req *req; + struct iov_iter *iter; + struct pipe_buffer *pipebufs; + struct pipe_buffer *currbuf; + struct pipe_inode_info *pipe; + unsigned long nr_segs; + struct page *pg; + unsigned int len; + unsigned int offset; + bool write:1; + bool move_folios:1; + bool is_uring:1; + struct { + unsigned int copied_sz; /* copied size into the user buffer */ + } ring; +}; + +static inline struct fuse_dev *fuse_get_dev(struct file *file) +{ + /* + * Lockless access is OK, because file->private data is set + * once during mount and is valid until the file is released. + */ + return READ_ONCE(file->private_data); +} + +unsigned int fuse_req_hash(u64 unique); +struct fuse_req *fuse_request_find(struct fuse_pqueue *fpq, u64 unique); + +void fuse_dev_end_requests(struct list_head *head); + +void fuse_copy_init(struct fuse_copy_state *cs, bool write, + struct iov_iter *iter); +int fuse_copy_args(struct fuse_copy_state *cs, unsigned int numargs, + unsigned int argpages, struct fuse_arg *args, + int zeroing); +int fuse_copy_out_args(struct fuse_copy_state *cs, struct fuse_args *args, + unsigned int nbytes); +void fuse_dev_queue_forget(struct fuse_iqueue *fiq, + struct fuse_forget_link *forget); +void fuse_dev_queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req); +bool fuse_remove_pending_req(struct fuse_req *req, spinlock_t *lock); + +bool fuse_request_expired(struct fuse_conn *fc, struct list_head *list); + +#endif + diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 74744c6f2860..ec248d13c8bf 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -38,14 +38,34 @@ /** Bias for fi->writectr, meaning new writepages must not be sent */ #define FUSE_NOWRITE INT_MIN -/** It could be as large as PATH_MAX, but would that have any uses? */ -#define FUSE_NAME_MAX 1024 +/** Maximum length of a filename, not including terminating null */ + +/* maximum, small enough for FUSE_MIN_READ_BUFFER*/ +#define FUSE_NAME_LOW_MAX 1024 +/* maximum, but needs a request buffer > FUSE_MIN_READ_BUFFER */ +#define FUSE_NAME_MAX (PATH_MAX - 1) /** Number of dentries for each connection in the control filesystem */ #define FUSE_CTL_NUM_DENTRIES 5 +/* Frequency (in seconds) of request timeout checks, if opted into */ +#define FUSE_TIMEOUT_TIMER_FREQ 15 + +/** Frequency (in jiffies) of request timeout checks, if opted into */ +extern const unsigned long fuse_timeout_timer_freq; + /** Maximum of max_pages received in init_out */ extern unsigned int fuse_max_pages_limit; +/* + * Default timeout (in seconds) for the server to reply to a request + * before the connection is aborted, if no timeout was specified on mount. + */ +extern unsigned int fuse_default_req_timeout; +/* + * Max timeout (in seconds) for the server to reply to a request before + * the connection is aborted. + */ +extern unsigned int fuse_max_req_timeout; /** List of active connections */ extern struct list_head fuse_conn_list; @@ -54,8 +74,8 @@ extern struct list_head fuse_conn_list; extern struct mutex fuse_mutex; /** Module parameters */ -extern unsigned max_user_bgreq; -extern unsigned max_user_congthresh; +extern unsigned int max_user_bgreq; +extern unsigned int max_user_congthresh; /* One forget request */ struct fuse_forget_link { @@ -141,9 +161,6 @@ struct fuse_inode { /* waitq for direct-io completion */ wait_queue_head_t direct_io_waitq; - - /* List of writepage requestst (pending or sent) */ - struct rb_root writepages; }; /* readdir cache (directory only) */ @@ -310,7 +327,7 @@ struct fuse_args { bool is_ext:1; bool is_pinned:1; bool invalidate_vmap:1; - struct fuse_in_arg in_args[3]; + struct fuse_in_arg in_args[4]; struct fuse_arg out_args[2]; void (*end)(struct fuse_mount *fm, struct fuse_args *args, int error); /* Used for kvec iter backed by vmalloc address */ @@ -378,6 +395,7 @@ struct fuse_io_priv { * FR_FINISHED: request is finished * FR_PRIVATE: request is on private list * FR_ASYNC: request is asynchronous + * FR_URING: request is handled through fuse-io-uring */ enum fuse_req_flag { FR_ISREPLY, @@ -392,6 +410,7 @@ enum fuse_req_flag { FR_FINISHED, FR_PRIVATE, FR_ASYNC, + FR_URING, }; /** @@ -438,6 +457,13 @@ struct fuse_req { /** fuse_mount this request belongs to */ struct fuse_mount *fm; + +#ifdef CONFIG_FUSE_IO_URING + void *ring_entry; + void *ring_queue; +#endif + /** When (in jiffies) the request was created */ + unsigned long create_time; }; struct fuse_iqueue; @@ -607,6 +633,9 @@ struct fuse_conn { /** Number of fuse_dev's */ atomic_t dev_count; + /** Current epoch for up-to-date dentries */ + atomic_t epoch; + struct rcu_head rcu; /** The user id for this mount */ @@ -863,6 +892,12 @@ struct fuse_conn { /* Use pages instead of pointer for kernel I/O */ unsigned int use_pages_for_kvec_io:1; + /* Is link not implemented by fs? */ + unsigned int no_link:1; + + /* Use io_uring for communication */ + unsigned int io_uring; + /** Maximum stack depth for passthrough backing files */ int max_stack_depth; @@ -878,12 +913,6 @@ struct fuse_conn { /** Device ID from the root super block */ dev_t dev; - /** Dentries in the control filesystem */ - struct dentry *ctl_dentry[FUSE_CTL_NUM_DENTRIES]; - - /** number of dentries used in the above array */ - int ctl_ndents; - /** Key for lock owner ID scrambling */ u32 scramble_key[4]; @@ -893,6 +922,9 @@ struct fuse_conn { /** Version counter for evict inode */ atomic64_t evict_ctr; + /* maximum file name length */ + u32 name_max; + /** Called on final put */ void (*release)(struct fuse_conn *); @@ -923,6 +955,20 @@ struct fuse_conn { /** IDR for backing files ids */ struct idr backing_files_map; #endif + +#ifdef CONFIG_FUSE_IO_URING + /** uring connection information*/ + struct fuse_ring *ring; +#endif + + /** Only used if the connection opts into request timeouts */ + struct { + /* Worker for checking if any requests have timed out */ + struct delayed_work work; + + /* Request timeout (in jiffies). 0 = no timeout */ + unsigned int req_timeout; + } timeout; }; /* @@ -947,6 +993,19 @@ struct fuse_mount { struct rcu_head rcu; }; +/* + * Empty header for FUSE opcodes without specific header needs. + * Used as a placeholder in args->in_args[0] for consistency + * across all FUSE operations, simplifying request handling. + */ +struct fuse_zero_header {}; + +static inline void fuse_set_zero_arg0(struct fuse_args *args) +{ + args->in_args[0].size = sizeof(struct fuse_zero_header); + args->in_args[0].value = NULL; +} + static inline struct fuse_mount *get_fuse_mount_super(struct super_block *sb) { return sb->s_fs_info; @@ -1044,7 +1103,6 @@ static inline void fuse_sync_bucket_dec(struct fuse_sync_bucket *bucket) extern const struct file_operations fuse_dev_operations; extern const struct dentry_operations fuse_dentry_operations; -extern const struct dentry_operations fuse_root_dentry_operations; /** * Get a filled in inode @@ -1191,6 +1249,9 @@ void fuse_request_end(struct fuse_req *req); void fuse_abort_conn(struct fuse_conn *fc); void fuse_wait_aborted(struct fuse_conn *fc); +/* Check if any requests timed out */ +void fuse_check_timeout(struct work_struct *work); + /** * Invalidate inode attributes */ @@ -1220,6 +1281,11 @@ void fuse_change_entry_timeout(struct dentry *entry, struct fuse_entry_out *o); struct fuse_conn *fuse_conn_get(struct fuse_conn *fc); /** + * Initialize the fuse processing queue + */ +void fuse_pqueue_init(struct fuse_pqueue *fpq); + +/** * Initialize fuse_conn */ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, @@ -1413,9 +1479,9 @@ void fuse_dax_cancel_work(struct fuse_conn *fc); long fuse_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg); long fuse_file_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); -int fuse_fileattr_get(struct dentry *dentry, struct fileattr *fa); +int fuse_fileattr_get(struct dentry *dentry, struct file_kattr *fa); int fuse_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa); + struct dentry *dentry, struct file_kattr *fa); /* iomode.c */ int fuse_file_cached_io_open(struct inode *inode, struct fuse_file *ff); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 3ce4f4e81d09..67c2318bfc42 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -7,7 +7,9 @@ */ #include "fuse_i.h" +#include "dev_uring_i.h" +#include <linux/dax.h> #include <linux/pagemap.h> #include <linux/slab.h> #include <linux/file.h> @@ -36,8 +38,11 @@ DEFINE_MUTEX(fuse_mutex); static int set_global_limit(const char *val, const struct kernel_param *kp); unsigned int fuse_max_pages_limit = 256; +/* default is no timeout */ +unsigned int fuse_default_req_timeout; +unsigned int fuse_max_req_timeout; -unsigned max_user_bgreq; +unsigned int max_user_bgreq; module_param_call(max_user_bgreq, set_global_limit, param_get_uint, &max_user_bgreq, 0644); __MODULE_PARM_TYPE(max_user_bgreq, "uint"); @@ -45,7 +50,7 @@ MODULE_PARM_DESC(max_user_bgreq, "Global limit for the maximum number of backgrounded requests an " "unprivileged user can set"); -unsigned max_user_congthresh; +unsigned int max_user_congthresh; module_param_call(max_user_congthresh, set_global_limit, param_get_uint, &max_user_congthresh, 0644); __MODULE_PARM_TYPE(max_user_congthresh, "uint"); @@ -158,6 +163,9 @@ static void fuse_evict_inode(struct inode *inode) /* Will write inode on close/munmap and in all other dirtiers */ WARN_ON(inode->i_state & I_DIRTY_INODE); + if (FUSE_IS_DAX(inode)) + dax_break_layout_final(inode); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); if (inode->i_sb->s_flags & SB_ACTIVE) { @@ -281,11 +289,6 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, } } - if (attr->blksize != 0) - inode->i_blkbits = ilog2(attr->blksize); - else - inode->i_blkbits = inode->i_sb->s_blocksize_bits; - /* * Don't set the sticky bit in i_mode, unless we want the VFS * to check permissions. This prevents failures due to the @@ -937,7 +940,7 @@ static void fuse_iqueue_init(struct fuse_iqueue *fiq, fiq->priv = priv; } -static void fuse_pqueue_init(struct fuse_pqueue *fpq) +void fuse_pqueue_init(struct fuse_pqueue *fpq) { unsigned int i; @@ -958,6 +961,7 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, init_rwsem(&fc->killsb); refcount_set(&fc->count, 1); atomic_set(&fc->dev_count, 1); + atomic_set(&fc->epoch, 1); init_waitqueue_head(&fc->blocked_waitq); fuse_iqueue_init(&fc->iq, fiq_ops, fiq_priv); INIT_LIST_HEAD(&fc->bg_queue); @@ -978,6 +982,8 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, fc->user_ns = get_user_ns(user_ns); fc->max_pages = FUSE_DEFAULT_MAX_PAGES_PER_REQ; fc->max_pages_limit = fuse_max_pages_limit; + fc->name_max = FUSE_NAME_LOW_MAX; + fc->timeout.req_timeout = 0; if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) fuse_backing_files_init(fc); @@ -992,6 +998,8 @@ static void delayed_release(struct rcu_head *p) { struct fuse_conn *fc = container_of(p, struct fuse_conn, rcu); + fuse_uring_destruct(fc); + put_user_ns(fc->user_ns); fc->release(fc); } @@ -1004,6 +1012,8 @@ void fuse_conn_put(struct fuse_conn *fc) if (IS_ENABLED(CONFIG_FUSE_DAX)) fuse_dax_conn_free(fc); + if (fc->timeout.req_timeout) + cancel_delayed_work_sync(&fc->timeout.work); if (fiq->ops->release) fiq->ops->release(fiq); put_pid_ns(fc->pid_ns); @@ -1026,7 +1036,7 @@ struct fuse_conn *fuse_conn_get(struct fuse_conn *fc) } EXPORT_SYMBOL_GPL(fuse_conn_get); -static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode) +static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned int mode) { struct fuse_attr attr; memset(&attr, 0, sizeof(attr)); @@ -1201,7 +1211,7 @@ static const struct super_operations fuse_super_operations = { .show_options = fuse_show_options, }; -static void sanitize_global_limit(unsigned *limit) +static void sanitize_global_limit(unsigned int *limit) { /* * The default maximum number of async requests is calculated to consume @@ -1222,7 +1232,7 @@ static int set_global_limit(const char *val, const struct kernel_param *kp) if (rv) return rv; - sanitize_global_limit((unsigned *)kp->arg); + sanitize_global_limit((unsigned int *)kp->arg); return 0; } @@ -1254,6 +1264,34 @@ static void process_init_limits(struct fuse_conn *fc, struct fuse_init_out *arg) spin_unlock(&fc->bg_lock); } +static void set_request_timeout(struct fuse_conn *fc, unsigned int timeout) +{ + fc->timeout.req_timeout = secs_to_jiffies(timeout); + INIT_DELAYED_WORK(&fc->timeout.work, fuse_check_timeout); + queue_delayed_work(system_wq, &fc->timeout.work, + fuse_timeout_timer_freq); +} + +static void init_server_timeout(struct fuse_conn *fc, unsigned int timeout) +{ + if (!timeout && !fuse_max_req_timeout && !fuse_default_req_timeout) + return; + + if (!timeout) + timeout = fuse_default_req_timeout; + + if (fuse_max_req_timeout) { + if (timeout) + timeout = min(fuse_max_req_timeout, timeout); + else + timeout = fuse_max_req_timeout; + } + + timeout = max(FUSE_TIMEOUT_TIMER_FREQ, timeout); + + set_request_timeout(fc, timeout); +} + struct fuse_init_args { struct fuse_args args; struct fuse_init_in in; @@ -1272,6 +1310,7 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, ok = false; else { unsigned long ra_pages; + unsigned int timeout = 0; process_init_limits(fc, arg); @@ -1335,6 +1374,13 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, fc->max_pages = min_t(unsigned int, fc->max_pages_limit, max_t(unsigned int, arg->max_pages, 1)); + + /* + * PATH_MAX file names might need two pages for + * ops like rename + */ + if (fc->max_pages > 1) + fc->name_max = FUSE_NAME_MAX; } if (IS_ENABLED(CONFIG_FUSE_DAX)) { if (flags & FUSE_MAP_ALIGNMENT && @@ -1387,12 +1433,19 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, else ok = false; } + if (flags & FUSE_OVER_IO_URING && fuse_uring_enabled()) + fc->io_uring = 1; + + if (flags & FUSE_REQUEST_TIMEOUT) + timeout = arg->request_timeout; } else { ra_pages = fc->max_read / PAGE_SIZE; fc->no_lock = 1; fc->no_flock = 1; } + init_server_timeout(fc, timeout); + fm->sb->s_bdi->ra_pages = min(fm->sb->s_bdi->ra_pages, ra_pages); fc->minor = arg->minor; @@ -1434,7 +1487,8 @@ void fuse_send_init(struct fuse_mount *fm) FUSE_HANDLE_KILLPRIV_V2 | FUSE_SETXATTR_EXT | FUSE_INIT_EXT | FUSE_SECURITY_CTX | FUSE_CREATE_SUPP_GROUP | FUSE_HAS_EXPIRE_ONLY | FUSE_DIRECT_IO_ALLOW_MMAP | - FUSE_NO_EXPORT_SUPPORT | FUSE_HAS_RESEND | FUSE_ALLOW_IDMAP; + FUSE_NO_EXPORT_SUPPORT | FUSE_HAS_RESEND | FUSE_ALLOW_IDMAP | + FUSE_REQUEST_TIMEOUT; #ifdef CONFIG_FUSE_DAX if (fm->fc->dax) flags |= FUSE_MAP_ALIGNMENT; @@ -1446,6 +1500,13 @@ void fuse_send_init(struct fuse_mount *fm) if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) flags |= FUSE_PASSTHROUGH; + /* + * This is just an information flag for fuse server. No need to check + * the reply - server is either sending IORING_OP_URING_CMD or not. + */ + if (fuse_uring_enabled()) + flags |= FUSE_OVER_IO_URING; + ia->in.flags = flags; ia->in.flags2 = flags >> 32; @@ -1653,7 +1714,7 @@ static int fuse_fill_super_submount(struct super_block *sb, fi = get_fuse_inode(root); fi->nlookup--; - sb->s_d_op = &fuse_dentry_operations; + set_default_d_op(sb, &fuse_dentry_operations); sb->s_root = d_make_root(root); if (!sb->s_root) return -ENOMEM; @@ -1788,12 +1849,10 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) err = -ENOMEM; root = fuse_get_root_inode(sb, ctx->rootmode); - sb->s_d_op = &fuse_root_dentry_operations; + set_default_d_op(sb, &fuse_dentry_operations); root_dentry = d_make_root(root); if (!root_dentry) goto err_dev_free; - /* Root dentry doesn't have .d_revalidate */ - sb->s_d_op = &fuse_dentry_operations; mutex_lock(&fuse_mutex); err = -EINVAL; diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c index 2d9abf48828f..57032eadca6c 100644 --- a/fs/fuse/ioctl.c +++ b/fs/fuse/ioctl.c @@ -502,7 +502,7 @@ static void fuse_priv_ioctl_cleanup(struct inode *inode, struct fuse_file *ff) fuse_file_release(inode, ff, O_RDONLY, NULL, S_ISDIR(inode->i_mode)); } -int fuse_fileattr_get(struct dentry *dentry, struct fileattr *fa) +int fuse_fileattr_get(struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); struct fuse_file *ff; @@ -536,11 +536,13 @@ int fuse_fileattr_get(struct dentry *dentry, struct fileattr *fa) cleanup: fuse_priv_ioctl_cleanup(inode, ff); + if (err == -ENOTTY) + err = -EOPNOTSUPP; return err; } int fuse_fileattr_set(struct mnt_idmap *idmap, - struct dentry *dentry, struct fileattr *fa) + struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); struct fuse_file *ff; @@ -572,5 +574,7 @@ int fuse_fileattr_set(struct mnt_idmap *idmap, cleanup: fuse_priv_ioctl_cleanup(inode, ff); + if (err == -ENOTTY) + err = -EOPNOTSUPP; return err; } diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index 17ce9636a2b1..c2aae2eef086 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -120,7 +120,7 @@ static bool fuse_emit(struct file *file, struct dir_context *ctx, fuse_add_dirent_to_cache(file, dirent, ctx->pos); return dir_emit(ctx, dirent->name, dirent->namelen, dirent->ino, - dirent->type); + dirent->type | FILLDIR_FLAG_NOINTR); } static int parse_dirfile(char *buf, size_t nbytes, struct file *file, @@ -161,6 +161,7 @@ static int fuse_direntplus_link(struct file *file, struct fuse_conn *fc; struct inode *inode; DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); + int epoch; if (!o->nodeid) { /* @@ -190,6 +191,7 @@ static int fuse_direntplus_link(struct file *file, return -EIO; fc = get_fuse_conn(dir); + epoch = atomic_read(&fc->epoch); name.hash = full_name_hash(parent, name.name, name.len); dentry = d_lookup(parent, &name); @@ -256,6 +258,7 @@ retry: } if (fc->readdirplus_auto) set_bit(FUSE_I_INIT_RDPLUS, &get_fuse_inode(inode)->state); + dentry->d_time = epoch; fuse_change_entry_timeout(dentry, o); dput(dentry); @@ -332,35 +335,32 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx) { int plus; ssize_t res; - struct folio *folio; struct inode *inode = file_inode(file); struct fuse_mount *fm = get_fuse_mount(inode); + struct fuse_conn *fc = fm->fc; struct fuse_io_args ia = {}; - struct fuse_args_pages *ap = &ia.ap; - struct fuse_folio_desc desc = { .length = PAGE_SIZE }; + struct fuse_args *args = &ia.ap.args; + void *buf; + size_t bufsize = clamp((unsigned int) ctx->count, PAGE_SIZE, fc->max_pages << PAGE_SHIFT); u64 attr_version = 0, evict_ctr = 0; bool locked; - folio = folio_alloc(GFP_KERNEL, 0); - if (!folio) + buf = kvmalloc(bufsize, GFP_KERNEL); + if (!buf) return -ENOMEM; + args->out_args[0].value = buf; + plus = fuse_use_readdirplus(inode, ctx); - ap->args.out_pages = true; - ap->num_folios = 1; - ap->folios = &folio; - ap->descs = &desc; if (plus) { attr_version = fuse_get_attr_version(fm->fc); evict_ctr = fuse_get_evict_ctr(fm->fc); - fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE, - FUSE_READDIRPLUS); + fuse_read_args_fill(&ia, file, ctx->pos, bufsize, FUSE_READDIRPLUS); } else { - fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE, - FUSE_READDIR); + fuse_read_args_fill(&ia, file, ctx->pos, bufsize, FUSE_READDIR); } locked = fuse_lock_inode(inode); - res = fuse_simple_request(fm, &ap->args); + res = fuse_simple_request(fm, args); fuse_unlock_inode(inode, locked); if (res >= 0) { if (!res) { @@ -369,16 +369,14 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx) if (ff->open_flags & FOPEN_CACHE_DIR) fuse_readdir_cache_end(file, ctx->pos); } else if (plus) { - res = parse_dirplusfile(folio_address(folio), res, - file, ctx, attr_version, + res = parse_dirplusfile(buf, res, file, ctx, attr_version, evict_ctr); } else { - res = parse_dirfile(folio_address(folio), res, file, - ctx); + res = parse_dirfile(buf, res, file, ctx); } } - folio_put(folio); + kvfree(buf); fuse_invalidate_atime(inode); return res; } @@ -419,7 +417,7 @@ static enum fuse_parse_result fuse_parse_cache(struct fuse_file *ff, if (ff->readdir.pos == ctx->pos) { res = FOUND_SOME; if (!dir_emit(ctx, dirent->name, dirent->namelen, - dirent->ino, dirent->type)) + dirent->ino, dirent->type | FILLDIR_FLAG_NOINTR)) return FOUND_ALL; ctx->pos = dirent->off; } diff --git a/fs/fuse/sysctl.c b/fs/fuse/sysctl.c index b272bb333005..e2d921abcb88 100644 --- a/fs/fuse/sysctl.c +++ b/fs/fuse/sysctl.c @@ -13,7 +13,13 @@ static struct ctl_table_header *fuse_table_header; /* Bound by fuse_init_out max_pages, which is a u16 */ static unsigned int sysctl_fuse_max_pages_limit = 65535; -static struct ctl_table fuse_sysctl_table[] = { +/* + * fuse_init_out request timeouts are u16. + * This goes up to ~18 hours, which is plenty for a timeout. + */ +static unsigned int sysctl_fuse_req_timeout_limit = 65535; + +static const struct ctl_table fuse_sysctl_table[] = { { .procname = "max_pages_limit", .data = &fuse_max_pages_limit, @@ -23,6 +29,24 @@ static struct ctl_table fuse_sysctl_table[] = { .extra1 = SYSCTL_ONE, .extra2 = &sysctl_fuse_max_pages_limit, }, + { + .procname = "default_request_timeout", + .data = &fuse_default_req_timeout, + .maxlen = sizeof(fuse_default_req_timeout), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &sysctl_fuse_req_timeout_limit, + }, + { + .procname = "max_request_timeout", + .data = &fuse_max_req_timeout, + .maxlen = sizeof(fuse_max_req_timeout), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &sysctl_fuse_req_timeout_limit, + }, }; int fuse_sysctl_register(void) diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 82afe78ec542..c826e7ca49f5 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -9,7 +9,6 @@ #include <linux/pci.h> #include <linux/interrupt.h> #include <linux/group_cpus.h> -#include <linux/pfn_t.h> #include <linux/memremap.h> #include <linux/module.h> #include <linux/virtio.h> @@ -862,7 +861,7 @@ static void virtio_fs_requests_done_work(struct work_struct *work) static void virtio_fs_map_queues(struct virtio_device *vdev, struct virtio_fs *fs) { const struct cpumask *mask, *masks; - unsigned int q, cpu; + unsigned int q, cpu, nr_masks; /* First attempt to map using existing transport layer affinities * e.g. PCIe MSI-X @@ -882,7 +881,7 @@ static void virtio_fs_map_queues(struct virtio_device *vdev, struct virtio_fs *f return; fallback: /* Attempt to map evenly in groups over the CPUs */ - masks = group_cpus_evenly(fs->num_request_queues); + masks = group_cpus_evenly(fs->num_request_queues, &nr_masks); /* If even this fails we default to all CPUs use first request queue */ if (!masks) { for_each_possible_cpu(cpu) @@ -891,7 +890,7 @@ fallback: } for (q = 0; q < fs->num_request_queues; q++) { - for_each_cpu(cpu, &masks[q]) + for_each_cpu(cpu, &masks[q % nr_masks]) fs->mq_map[cpu] = q + VQ_REQUEST; } kfree(masks); @@ -1008,7 +1007,7 @@ static void virtio_fs_cleanup_vqs(struct virtio_device *vdev) */ static long virtio_fs_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages, enum dax_access_mode mode, - void **kaddr, pfn_t *pfn) + void **kaddr, unsigned long *pfn) { struct virtio_fs *fs = dax_get_private(dax_dev); phys_addr_t offset = PFN_PHYS(pgoff); @@ -1017,8 +1016,7 @@ static long virtio_fs_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, if (kaddr) *kaddr = fs->window_kaddr + offset; if (pfn) - *pfn = phys_to_pfn_t(fs->window_phys_addr + offset, - PFN_DEV | PFN_MAP); + *pfn = fs->window_phys_addr + offset; return nr_pages > max_nr_pages ? max_nr_pages : nr_pages; } @@ -1670,6 +1668,9 @@ static int virtio_fs_get_tree(struct fs_context *fsc) unsigned int virtqueue_size; int err = -EIO; + if (!fsc->source) + return invalf(fsc, "No source specified"); + /* This gets a reference on virtio_fs object. This ptr gets installed * in fc->iq->priv. Once fuse_conn is going away, it calls ->put() * to drop the reference to this object. diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c index 9f568d345c51..93dfb06b6cea 100644 --- a/fs/fuse/xattr.c +++ b/fs/fuse/xattr.c @@ -164,9 +164,10 @@ int fuse_removexattr(struct inode *inode, const char *name) args.opcode = FUSE_REMOVEXATTR; args.nodeid = get_node_id(inode); - args.in_numargs = 1; - args.in_args[0].size = strlen(name) + 1; - args.in_args[0].value = name; + args.in_numargs = 2; + fuse_set_zero_arg0(&args); + args.in_args[1].size = strlen(name) + 1; + args.in_args[1].value = name; err = fuse_simple_request(fm, &args); if (err == -ENOSYS) { fm->fc->no_removexattr = 1; |