author     Linus Torvalds <torvalds@linux-foundation.org>    2020-06-10 01:48:24 +0300
committer  Linus Torvalds <torvalds@linux-foundation.org>    2020-06-10 01:48:24 +0300
commit     5b14671be58d0084e7e2d1cc9c2c36a94467f6e0 (patch)
tree       de0d4238d49f55d4ea277b213d52f347a53b2674
parent     52435c86bf0f5c892804912481af7f1a5b95ff2d (diff)
parent     9b46418c40fe910e6537618f9932a8be78a3dd6c (diff)
download   linux-5b14671be58d0084e7e2d1cc9c2c36a94467f6e0.tar.xz
Merge tag 'fuse-update-5.8' of git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/fuse
Pull fuse updates from Miklos Szeredi:
- Fix a rare deadlock in virtiofs
- Fix st_blocks in writeback cache mode
- Fix wrong checks in splice move causing spurious warnings
- Fix a race between a GETATTR request and a FUSE_NOTIFY_INVAL_INODE
notification
- Use rb-tree instead of linear search for pages currently under
  writeout by userspace (a standalone sketch of this lookup follows
  the list)
- Fix copy_file_range() inconsistencies
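
The rb-tree change keys each in-flight write request by its page-index
range and descends the tree, discarding half the remaining nodes at each
step. Below is a minimal, self-contained sketch of that search; it uses a
plain (unbalanced) binary search tree in place of the kernel's rb-tree,
and all names are illustrative rather than taken from the patch.

    /*
     * Standalone sketch of the interval search done by fuse_find_writeback()
     * in the diff below.  A plain binary search tree stands in for the
     * kernel's rb-tree; struct write_req and these helpers are illustrative.
     * Ranges are inclusive page-index intervals and never overlap, so the
     * search can discard a whole subtree at each comparison.
     */
    #include <stdio.h>
    #include <stdlib.h>

    struct write_req {
        unsigned long from, to;             /* page-index range, inclusive */
        struct write_req *left, *right;
    };

    /* Mirrors fuse_find_writeback(): O(log n) instead of a full list walk */
    static struct write_req *find_writeback(struct write_req *n,
                                            unsigned long idx_from,
                                            unsigned long idx_to)
    {
        while (n) {
            if (idx_from > n->to)
                n = n->right;               /* query lies after this range */
            else if (idx_to < n->from)
                n = n->left;                /* query lies before this range */
            else
                return n;                   /* ranges overlap */
        }
        return NULL;
    }

    /* Mirrors tree_insert(): an overlapping range would be a bug (WARN_ON) */
    static void insert(struct write_req **p, struct write_req *req)
    {
        while (*p) {
            if (req->from > (*p)->to)
                p = &(*p)->right;
            else if (req->to < (*p)->from)
                p = &(*p)->left;
            else
                return;                     /* overlap: kernel WARNs here */
        }
        req->left = req->right = NULL;
        *p = req;
    }

    int main(void)
    {
        struct write_req a = { 0, 3 }, b = { 8, 8 }, c = { 16, 31 };
        struct write_req *root = NULL;

        insert(&root, &a);
        insert(&root, &b);
        insert(&root, &c);
        printf("pages 8..8 under writeback:  %s\n",
               find_writeback(root, 8, 8) ? "yes" : "no");
        printf("pages 4..7 under writeback:  %s\n",
               find_writeback(root, 4, 7) ? "yes" : "no");
        return 0;
    }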
* tag 'fuse-update-5.8' of git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/fuse:
fuse: copy_file_range should truncate cache
fuse: fix copy_file_range cache issues
fuse: optimize writepages search
fuse: update attr_version counter on fuse_notify_inval_inode()
fuse: don't check refcount after stealing page
fuse: fix weird page warning
fuse: use dump_page
virtiofs: do not use fuse_fill_super_common() for device installation
fuse: always allow query of st_dev
fuse: always flush dirty data on close(2)
fuse: invalidate inode attr in writeback cache mode
fuse: Update stale comment in queue_interrupt()
fuse: BUG_ON correction in fuse_dev_splice_write()
virtiofs: Add mount option and atime behavior to the doc
virtiofs: schedule blocking async replies in separate worker
-rw-r--r--   Documentation/filesystems/virtiofs.rst |  14
-rw-r--r--   fs/fuse/dev.c                          |  14
-rw-r--r--   fs/fuse/dir.c                          |  12
-rw-r--r--   fs/fuse/file.c                         | 120
-rw-r--r--   fs/fuse/fuse_i.h                       |   3
-rw-r--r--   fs/fuse/inode.c                        |  26
-rw-r--r--   fs/fuse/virtio_fs.c                    | 115
7 files changed, 219 insertions, 85 deletions
diff --git a/Documentation/filesystems/virtiofs.rst b/Documentation/filesystems/virtiofs.rst
index e06e4951cb39..fd4d2484e949 100644
--- a/Documentation/filesystems/virtiofs.rst
+++ b/Documentation/filesystems/virtiofs.rst
@@ -39,6 +39,20 @@ Mount file system with tag ``myfs`` on ``/mnt``:
 Please see https://virtio-fs.gitlab.io/ for details on how to configure QEMU
 and the virtiofsd daemon.
 
+Mount options
+-------------
+
+virtiofs supports general VFS mount options, for example, remount,
+ro, rw, context, etc. It also supports FUSE mount options.
+
+atime behavior
+^^^^^^^^^^^^^^
+
+The atime-related mount options, for example, noatime, strictatime,
+are ignored. The atime behavior for virtiofs is the same as the
+underlying filesystem of the directory that has been exported
+on the host.
+
 Internals
 =========
 Since the virtio-fs device uses the FUSE protocol for file system requests, the
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 8ccc97356cb5..02b3c36b3676 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -342,7 +342,7 @@ static int queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req)
         list_add_tail(&req->intr_entry, &fiq->interrupts);
         /*
          * Pairs with smp_mb() implied by test_and_set_bit()
-         * from request_end().
+         * from fuse_request_end().
          */
         smp_mb();
         if (test_bit(FR_FINISHED, &req->flags)) {
@@ -764,16 +764,15 @@ static int fuse_check_page(struct page *page)
 {
     if (page_mapcount(page) ||
         page->mapping != NULL ||
-        page_count(page) != 1 ||
         (page->flags & PAGE_FLAGS_CHECK_AT_PREP &
          ~(1 << PG_locked |
            1 << PG_referenced |
            1 << PG_uptodate |
            1 << PG_lru |
            1 << PG_active |
-           1 << PG_reclaim))) {
-        pr_warn("trying to steal weird page\n");
-        pr_warn("  page=%p index=%li flags=%08lx, count=%i, mapcount=%i, mapping=%p\n", page, page->index, page->flags, page_count(page), page_mapcount(page), page->mapping);
+           1 << PG_reclaim |
+           1 << PG_waiters))) {
+        dump_page(page, "fuse: trying to steal weird page");
         return 1;
     }
     return 0;
@@ -1977,8 +1976,9 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
         struct pipe_buffer *ibuf;
         struct pipe_buffer *obuf;
 
-        BUG_ON(nbuf >= pipe->ring_size);
-        BUG_ON(tail == head);
+        if (WARN_ON(nbuf >= count || tail == head))
+            goto out_free;
+
         ibuf = &pipe->bufs[tail & mask];
         obuf = &bufs[nbuf];
 
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index de1e2fde60bd..26f028bc760b 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1689,8 +1689,18 @@ static int fuse_getattr(const struct path *path, struct kstat *stat,
     struct inode *inode = d_inode(path->dentry);
     struct fuse_conn *fc = get_fuse_conn(inode);
 
-    if (!fuse_allow_current_process(fc))
+    if (!fuse_allow_current_process(fc)) {
+        if (!request_mask) {
+            /*
+             * If user explicitly requested *nothing* then don't
+             * error out, but return st_dev only.
+             */
+            stat->result_mask = 0;
+            stat->dev = inode->i_sb->s_dev;
+            return 0;
+        }
         return -EACCES;
+    }
 
     return fuse_update_get_attr(inode, NULL, stat, request_mask, flags);
 }
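The fuse_getattr() hunk above is the "fuse: always allow query of st_dev"
change. A hedged userspace sketch of the behavior it enables follows; it
assumes the glibc statx(2) wrapper (glibc 2.28+) and is not part of the
patch. With a zero request mask, a process that would otherwise get
-EACCES from a FUSE mount now receives just the device number.

    /*
     * Sketch only: request_mask == 0 means "ask for nothing", and the
     * kernel fills in stx_dev_* unconditionally, so even a process denied
     * by fuse_allow_current_process() can learn which device it is on.
     */
    #define _GNU_SOURCE
    #include <stdio.h>
    #include <fcntl.h>
    #include <sys/stat.h>

    int main(int argc, char *argv[])
    {
        struct statx stx;
        const char *path = argc > 1 ? argv[1] : ".";

        /* mask 0: no STATX_* bits requested; st_dev comes back anyway */
        if (statx(AT_FDCWD, path, 0, 0, &stx) != 0) {
            perror("statx");
            return 1;
        }
        printf("%s: dev %u:%u, stx_mask %#x\n", path,
               stx.stx_dev_major, stx.stx_dev_minor, stx.stx_mask);
        return 0;
    }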
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index bac51c32d660..e573b0cd2737 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -357,7 +357,7 @@ u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id)
 
 struct fuse_writepage_args {
     struct fuse_io_args ia;
-    struct list_head writepages_entry;
+    struct rb_node writepages_entry;
     struct list_head queue_entry;
     struct fuse_writepage_args *next;
     struct inode *inode;
@@ -366,17 +366,23 @@ struct fuse_writepage_args {
 static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi,
                         pgoff_t idx_from, pgoff_t idx_to)
 {
-    struct fuse_writepage_args *wpa;
+    struct rb_node *n;
+
+    n = fi->writepages.rb_node;
 
-    list_for_each_entry(wpa, &fi->writepages, writepages_entry) {
+    while (n) {
+        struct fuse_writepage_args *wpa;
         pgoff_t curr_index;
 
+        wpa = rb_entry(n, struct fuse_writepage_args, writepages_entry);
         WARN_ON(get_fuse_inode(wpa->inode) != fi);
         curr_index = wpa->ia.write.in.offset >> PAGE_SHIFT;
-        if (idx_from < curr_index + wpa->ia.ap.num_pages &&
-            curr_index <= idx_to) {
+        if (idx_from >= curr_index + wpa->ia.ap.num_pages)
+            n = n->rb_right;
+        else if (idx_to < curr_index)
+            n = n->rb_left;
+        else
             return wpa;
-        }
     }
 
     return NULL;
@@ -445,9 +451,6 @@ static int fuse_flush(struct file *file, fl_owner_t id)
     if (is_bad_inode(inode))
         return -EIO;
 
-    if (fc->no_flush)
-        return 0;
-
     err = write_inode_now(inode, 1);
     if (err)
         return err;
@@ -460,6 +463,10 @@ static int fuse_flush(struct file *file, fl_owner_t id)
     if (err)
         return err;
 
+    err = 0;
+    if (fc->no_flush)
+        goto inval_attr_out;
+
     memset(&inarg, 0, sizeof(inarg));
     inarg.fh = ff->fh;
     inarg.lock_owner = fuse_lock_owner_id(fc, id);
@@ -475,6 +482,14 @@ static int fuse_flush(struct file *file, fl_owner_t id)
         fc->no_flush = 1;
         err = 0;
     }
+
+inval_attr_out:
+    /*
+     * In memory i_blocks is not maintained by fuse, if writeback cache is
+     * enabled, i_blocks from cached attr may not be accurate.
+     */
+    if (!err && fc->writeback_cache)
+        fuse_invalidate_attr(inode);
     return err;
 }
 
@@ -712,6 +727,7 @@ static ssize_t fuse_async_req_send(struct fuse_conn *fc,
     spin_unlock(&io->lock);
 
     ia->ap.args.end = fuse_aio_complete_req;
+    ia->ap.args.may_block = io->should_dirty;
     err = fuse_simple_background(fc, &ia->ap.args, GFP_KERNEL);
     if (err)
         fuse_aio_complete_req(fc, &ia->ap.args, err);
@@ -1570,7 +1586,7 @@ static void fuse_writepage_finish(struct fuse_conn *fc,
     struct backing_dev_info *bdi = inode_to_bdi(inode);
     int i;
 
-    list_del(&wpa->writepages_entry);
+    rb_erase(&wpa->writepages_entry, &fi->writepages);
     for (i = 0; i < ap->num_pages; i++) {
         dec_wb_stat(&bdi->wb, WB_WRITEBACK);
         dec_node_page_state(ap->pages[i], NR_WRITEBACK_TEMP);
@@ -1658,6 +1674,36 @@ __acquires(fi->lock)
     }
 }
 
+static void tree_insert(struct rb_root *root, struct fuse_writepage_args *wpa)
+{
+    pgoff_t idx_from = wpa->ia.write.in.offset >> PAGE_SHIFT;
+    pgoff_t idx_to = idx_from + wpa->ia.ap.num_pages - 1;
+    struct rb_node **p = &root->rb_node;
+    struct rb_node *parent = NULL;
+
+    WARN_ON(!wpa->ia.ap.num_pages);
+    while (*p) {
+        struct fuse_writepage_args *curr;
+        pgoff_t curr_index;
+
+        parent = *p;
+        curr = rb_entry(parent, struct fuse_writepage_args,
+                writepages_entry);
+        WARN_ON(curr->inode != wpa->inode);
+        curr_index = curr->ia.write.in.offset >> PAGE_SHIFT;
+
+        if (idx_from >= curr_index + curr->ia.ap.num_pages)
+            p = &(*p)->rb_right;
+        else if (idx_to < curr_index)
+            p = &(*p)->rb_left;
+        else
+            return (void) WARN_ON(true);
+    }
+
+    rb_link_node(&wpa->writepages_entry, parent, p);
+    rb_insert_color(&wpa->writepages_entry, root);
+}
+
 static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_args *args,
                    int error)
 {
@@ -1676,7 +1722,7 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_args *args,
         wpa->next = next->next;
         next->next = NULL;
         next->ia.ff = fuse_file_get(wpa->ia.ff);
-        list_add(&next->writepages_entry, &fi->writepages);
+        tree_insert(&fi->writepages, next);
 
         /*
          * Skip fuse_flush_writepages() to make it easy to crop requests
@@ -1811,7 +1857,7 @@ static int fuse_writepage_locked(struct page *page)
     inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP);
 
     spin_lock(&fi->lock);
-    list_add(&wpa->writepages_entry, &fi->writepages);
+    tree_insert(&fi->writepages, wpa);
     list_add_tail(&wpa->queue_entry, &fi->queued_writes);
     fuse_flush_writepages(inode);
     spin_unlock(&fi->lock);
@@ -1923,10 +1969,10 @@ static bool fuse_writepage_in_flight(struct fuse_writepage_args *new_wpa,
     WARN_ON(new_ap->num_pages != 0);
 
     spin_lock(&fi->lock);
-    list_del(&new_wpa->writepages_entry);
+    rb_erase(&new_wpa->writepages_entry, &fi->writepages);
     old_wpa = fuse_find_writeback(fi, page->index, page->index);
     if (!old_wpa) {
-        list_add(&new_wpa->writepages_entry, &fi->writepages);
+        tree_insert(&fi->writepages, new_wpa);
         spin_unlock(&fi->lock);
         return false;
     }
@@ -2041,7 +2087,7 @@ static int fuse_writepages_fill(struct page *page,
         wpa->inode = inode;
 
         spin_lock(&fi->lock);
-        list_add(&wpa->writepages_entry, &fi->writepages);
+        tree_insert(&fi->writepages, wpa);
         spin_unlock(&fi->lock);
 
         data->wpa = wpa;
@@ -3235,13 +3281,11 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
     if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
         return -EXDEV;
 
-    if (fc->writeback_cache) {
-        inode_lock(inode_in);
-        err = fuse_writeback_range(inode_in, pos_in, pos_in + len);
-        inode_unlock(inode_in);
-        if (err)
-            return err;
-    }
+    inode_lock(inode_in);
+    err = fuse_writeback_range(inode_in, pos_in, pos_in + len - 1);
+    inode_unlock(inode_in);
+    if (err)
+        return err;
 
     inode_lock(inode_out);
 
@@ -3249,11 +3293,27 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
     if (err)
         goto out;
 
-    if (fc->writeback_cache) {
-        err = fuse_writeback_range(inode_out, pos_out, pos_out + len);
-        if (err)
-            goto out;
-    }
+    /*
+     * Write out dirty pages in the destination file before sending the COPY
+     * request to userspace. After the request is completed, truncate off
+     * pages (including partial ones) from the cache that have been copied,
+     * since these contain stale data at that point.
+     *
+     * This should be mostly correct, but if the COPY writes to partial
+     * pages (at the start or end) and the parts not covered by the COPY are
+     * written through a memory map after calling fuse_writeback_range(),
+     * then these partial page modifications will be lost on truncation.
+     *
+     * It is unlikely that someone would rely on such mixed style
+     * modifications. Yet this does give less guarantees than if the
+     * copying was performed with write(2).
+     *
+     * To fix this a i_mmap_sem style lock could be used to prevent new
+     * faults while the copy is ongoing.
+     */
+    err = fuse_writeback_range(inode_out, pos_out, pos_out + len - 1);
+    if (err)
+        goto out;
 
     if (is_unstable)
         set_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state);
@@ -3274,6 +3334,10 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
     if (err)
         goto out;
 
+    truncate_inode_pages_range(inode_out->i_mapping,
+                   ALIGN_DOWN(pos_out, PAGE_SIZE),
+                   ALIGN(pos_out + outarg.size, PAGE_SIZE) - 1);
+
     if (fc->writeback_cache) {
         fuse_write_update_size(inode_out, pos_out + outarg.size);
         file_update_time(file_out);
@@ -3351,5 +3415,5 @@ void fuse_init_file_inode(struct inode *inode)
     INIT_LIST_HEAD(&fi->queued_writes);
     fi->writectr = 0;
     init_waitqueue_head(&fi->page_waitq);
-    INIT_LIST_HEAD(&fi->writepages);
+    fi->writepages = RB_ROOT;
 }
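The truncate_inode_pages_range() call added above rounds the copied byte
range out to whole pages, which is exactly the partial-page caveat the
new comment spells out. The sketch below open-codes the kernel's
ALIGN_DOWN()/ALIGN() macros on illustrative numbers to show which cached
bytes get dropped; it is not part of the patch.

    /*
     * Sketch of the page rounding done before truncate_inode_pages_range().
     * ALIGN_DOWN()/ALIGN() are kernel macros, open-coded here; the 4 KiB
     * page size and example offsets are illustrative.
     */
    #include <stdio.h>

    #define PAGE_SIZE 4096UL

    int main(void)
    {
        unsigned long pos_out = 6000;   /* copy destination offset */
        unsigned long copied  = 10000;  /* bytes the server copied  */

        /* ALIGN_DOWN(pos_out, PAGE_SIZE) */
        unsigned long start = pos_out & ~(PAGE_SIZE - 1);
        /* ALIGN(pos_out + copied, PAGE_SIZE) - 1 */
        unsigned long end = ((pos_out + copied + PAGE_SIZE - 1)
                             & ~(PAGE_SIZE - 1)) - 1;

        /* Pages 1..3 are dropped here even though bytes 4096..5999 and
         * 16000..16383 were not written by the copy itself -- the
         * partial-page caveat from the comment above. */
        printf("truncate cached bytes [%lu, %lu] (pages %lu..%lu)\n",
               start, end, start / PAGE_SIZE, end / PAGE_SIZE);
        return 0;
    }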
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index ca344bf71404..740a8a7d7ae6 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -111,7 +111,7 @@ struct fuse_inode {
         wait_queue_head_t page_waitq;
 
         /* List of writepage requestst (pending or sent) */
-        struct list_head writepages;
+        struct rb_root writepages;
     };
 
     /* readdir cache (directory only) */
@@ -249,6 +249,7 @@ struct fuse_args {
     bool out_argvar:1;
     bool page_zeroing:1;
     bool page_replace:1;
+    bool may_block:1;
     struct fuse_in_arg in_args[3];
     struct fuse_arg out_args[2];
     void (*end)(struct fuse_conn *fc, struct fuse_args *args, int error);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 95d712d44ca1..5b4aebf5821f 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -321,6 +321,8 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
 int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid,
                  loff_t offset, loff_t len)
 {
+    struct fuse_conn *fc = get_fuse_conn_super(sb);
+    struct fuse_inode *fi;
     struct inode *inode;
     pgoff_t pg_start;
     pgoff_t pg_end;
@@ -329,6 +331,11 @@ int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid,
     if (!inode)
         return -ENOENT;
 
+    fi = get_fuse_inode(inode);
+    spin_lock(&fi->lock);
+    fi->attr_version = atomic64_inc_return(&fc->attr_version);
+    spin_unlock(&fi->lock);
+
     fuse_invalidate_attr(inode);
     forget_all_cached_acls(inode);
     if (offset >= 0) {
@@ -1113,7 +1120,7 @@ EXPORT_SYMBOL_GPL(fuse_dev_free);
 
 int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
 {
-    struct fuse_dev *fud;
+    struct fuse_dev *fud = NULL;
     struct fuse_conn *fc = get_fuse_conn_super(sb);
     struct inode *root;
     struct dentry *root_dentry;
@@ -1155,9 +1162,12 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
     if (sb->s_user_ns != &init_user_ns)
         sb->s_xattr = fuse_no_acl_xattr_handlers;
 
-    fud = fuse_dev_alloc_install(fc);
-    if (!fud)
-        goto err;
+    if (ctx->fudptr) {
+        err = -ENOMEM;
+        fud = fuse_dev_alloc_install(fc);
+        if (!fud)
+            goto err;
+    }
 
     fc->dev = sb->s_dev;
     fc->sb = sb;
@@ -1191,7 +1201,7 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
 
     mutex_lock(&fuse_mutex);
     err = -EINVAL;
-    if (*ctx->fudptr)
+    if (ctx->fudptr && *ctx->fudptr)
         goto err_unlock;
 
     err = fuse_ctl_add_conn(fc);
@@ -1200,7 +1210,8 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
 
     list_add_tail(&fc->entry, &fuse_conn_list);
     sb->s_root = root_dentry;
-    *ctx->fudptr = fud;
+    if (ctx->fudptr)
+        *ctx->fudptr = fud;
     mutex_unlock(&fuse_mutex);
     return 0;
 
@@ -1208,7 +1219,8 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
     mutex_unlock(&fuse_mutex);
     dput(root_dentry);
  err_dev_free:
-    fuse_dev_free(fud);
+    if (fud)
+        fuse_dev_free(fud);
  err:
     return err;
 }
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index bade74768903..4c4ef5d69298 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -60,6 +60,12 @@ struct virtio_fs_forget {
     struct virtio_fs_forget_req req;
 };
 
+struct virtio_fs_req_work {
+    struct fuse_req *req;
+    struct virtio_fs_vq *fsvq;
+    struct work_struct done_work;
+};
+
 static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq,
                  struct fuse_req *req, bool in_flight);
 
@@ -485,19 +491,67 @@ static void copy_args_from_argbuf(struct fuse_args *args, struct fuse_req *req)
 }
 
 /* Work function for request completion */
+static void virtio_fs_request_complete(struct fuse_req *req,
+                       struct virtio_fs_vq *fsvq)
+{
+    struct fuse_pqueue *fpq = &fsvq->fud->pq;
+    struct fuse_conn *fc = fsvq->fud->fc;
+    struct fuse_args *args;
+    struct fuse_args_pages *ap;
+    unsigned int len, i, thislen;
+    struct page *page;
+
+    /*
+     * TODO verify that server properly follows FUSE protocol
+     * (oh.uniq, oh.len)
+     */
+    args = req->args;
+    copy_args_from_argbuf(args, req);
+
+    if (args->out_pages && args->page_zeroing) {
+        len = args->out_args[args->out_numargs - 1].size;
+        ap = container_of(args, typeof(*ap), args);
+        for (i = 0; i < ap->num_pages; i++) {
+            thislen = ap->descs[i].length;
+            if (len < thislen) {
+                WARN_ON(ap->descs[i].offset);
+                page = ap->pages[i];
+                zero_user_segment(page, len, thislen);
+                len = 0;
+            } else {
+                len -= thislen;
+            }
+        }
+    }
+
+    spin_lock(&fpq->lock);
+    clear_bit(FR_SENT, &req->flags);
+    spin_unlock(&fpq->lock);
+
+    fuse_request_end(fc, req);
+    spin_lock(&fsvq->lock);
+    dec_in_flight_req(fsvq);
+    spin_unlock(&fsvq->lock);
+}
+
+static void virtio_fs_complete_req_work(struct work_struct *work)
+{
+    struct virtio_fs_req_work *w =
+        container_of(work, typeof(*w), done_work);
+
+    virtio_fs_request_complete(w->req, w->fsvq);
+    kfree(w);
+}
+
 static void virtio_fs_requests_done_work(struct work_struct *work)
 {
     struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq,
                          done_work);
     struct fuse_pqueue *fpq = &fsvq->fud->pq;
-    struct fuse_conn *fc = fsvq->fud->fc;
     struct virtqueue *vq = fsvq->vq;
     struct fuse_req *req;
-    struct fuse_args_pages *ap;
     struct fuse_req *next;
-    struct fuse_args *args;
-    unsigned int len, i, thislen;
-    struct page *page;
+    unsigned int len;
     LIST_HEAD(reqs);
 
     /* Collect completed requests off the virtqueue */
@@ -515,38 +569,20 @@ static void virtio_fs_requests_done_work(struct work_struct *work)
 
     /* End requests */
     list_for_each_entry_safe(req, next, &reqs, list) {
-        /*
-         * TODO verify that server properly follows FUSE protocol
-         * (oh.uniq, oh.len)
-         */
-        args = req->args;
-        copy_args_from_argbuf(args, req);
-
-        if (args->out_pages && args->page_zeroing) {
-            len = args->out_args[args->out_numargs - 1].size;
-            ap = container_of(args, typeof(*ap), args);
-            for (i = 0; i < ap->num_pages; i++) {
-                thislen = ap->descs[i].length;
-                if (len < thislen) {
-                    WARN_ON(ap->descs[i].offset);
-                    page = ap->pages[i];
-                    zero_user_segment(page, len, thislen);
-                    len = 0;
-                } else {
-                    len -= thislen;
-                }
-            }
-        }
-
-        spin_lock(&fpq->lock);
-        clear_bit(FR_SENT, &req->flags);
         list_del_init(&req->list);
-        spin_unlock(&fpq->lock);
-        fuse_request_end(fc, req);
-        spin_lock(&fsvq->lock);
-        dec_in_flight_req(fsvq);
-        spin_unlock(&fsvq->lock);
+
+        /* blocking async request completes in a worker context */
+        if (req->args->may_block) {
+            struct virtio_fs_req_work *w;
+
+            w = kzalloc(sizeof(*w), GFP_NOFS | __GFP_NOFAIL);
+            INIT_WORK(&w->done_work, virtio_fs_complete_req_work);
+            w->fsvq = fsvq;
+            w->req = req;
+            schedule_work(&w->done_work);
+        } else {
+            virtio_fs_request_complete(req, fsvq);
+        }
     }
 }
 
@@ -1067,7 +1103,7 @@ static int virtio_fs_fill_super(struct super_block *sb)
 
     err = -ENOMEM;
     /* Allocate fuse_dev for hiprio and notification queues */
-    for (i = 0; i < VQ_REQUEST; i++) {
+    for (i = 0; i < fs->nvqs; i++) {
         struct virtio_fs_vq *fsvq = &fs->vqs[i];
 
         fsvq->fud = fuse_dev_alloc();
@@ -1075,18 +1111,15 @@ static int virtio_fs_fill_super(struct super_block *sb)
             goto err_free_fuse_devs;
     }
 
-    ctx.fudptr = (void **)&fs->vqs[VQ_REQUEST].fud;
+    /* virtiofs allocates and installs its own fuse devices */
+    ctx.fudptr = NULL;
     err = fuse_fill_super_common(sb, &ctx);
     if (err < 0)
         goto err_free_fuse_devs;
 
-    fc = fs->vqs[VQ_REQUEST].fud->fc;
-
     for (i = 0; i < fs->nvqs; i++) {
         struct virtio_fs_vq *fsvq = &fs->vqs[i];
 
-        if (i == VQ_REQUEST)
-            continue; /* already initialized */
         fuse_dev_install(fsvq->fud, fc);
     }
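
Taken together, the two copy_file_range() patches in this pull aim at the
user-visible behavior sketched below: after the copy, reads of the
destination range must observe the copied bytes rather than stale page
cache. This is an illustrative demo rather than part of the patch; it
assumes the glibc copy_file_range(2) wrapper (glibc 2.27+) and keeps
error handling minimal.

    /*
     * Behavior sketch: on a FUSE/virtiofs mount, read dst to populate the
     * page cache, copy src over it, then read again.  With the cache
     * truncation added in this series, the second read re-fetches from the
     * server and sees the copied data.
     */
    #define _GNU_SOURCE
    #include <stdio.h>
    #include <fcntl.h>
    #include <unistd.h>

    int main(int argc, char *argv[])
    {
        char buf[16] = "";
        int in, out;

        if (argc != 3) {
            fprintf(stderr, "usage: %s <src> <dst>\n", argv[0]);
            return 1;
        }
        in = open(argv[1], O_RDONLY);
        out = open(argv[2], O_RDWR);
        if (in < 0 || out < 0)
            return 1;

        /* Populate the page cache for dst, then copy over it */
        if (pread(out, buf, sizeof(buf), 0) < 0)
            return 1;
        if (copy_file_range(in, NULL, out, NULL, 65536, 0) < 0)
            return 1;

        /* Must show src's bytes, not the previously cached dst bytes */
        if (pread(out, buf, sizeof(buf), 0) < 0)
            return 1;
        printf("first bytes after copy: %.8s\n", buf);

        close(in);
        close(out);
        return 0;
    }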