From 035ff33cf4db101250fb980a3941bf078f37a544 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 20 Apr 2022 16:05:41 +0200 Subject: fuse: write inode in fuse_release() A race between write(2) and close(2) allows pages to be dirtied after fuse_flush -> write_inode_now(). If these pages are not flushed from fuse_release(), then there might not be a writable open file later. So any remaining dirty pages must be written back before the file is released. This is a partial revert of the blamed commit. Reported-by: syzbot+6e1efbd8efaaa6860e91@syzkaller.appspotmail.com Fixes: 36ea23374d1f ("fuse: write inode in fuse_vma_close() instead of fuse_release()") Cc: # v5.16 Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 05caa2b9272e..60885ff9157c 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -338,6 +338,15 @@ static int fuse_open(struct inode *inode, struct file *file) static int fuse_release(struct inode *inode, struct file *file) { + struct fuse_conn *fc = get_fuse_conn(inode); + + /* + * Dirty pages might remain despite write_inode_now() call from + * fuse_flush() due to writes racing with the close. + */ + if (fc->writeback_cache) + write_inode_now(inode, 1); + fuse_release_common(file, false); /* return value is ignored by VFS */ -- cgit v1.2.3 From 2fdbb8dd01556e1501132b5ad3826e8f71e24a8b Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 22 Apr 2022 15:48:53 +0200 Subject: fuse: fix deadlock between atomic O_TRUNC and page invalidation fuse_finish_open() will be called with FUSE_NOWRITE set in case of atomic O_TRUNC open(), so commit 76224355db75 ("fuse: truncate pagecache on atomic_o_trunc") replaced invalidate_inode_pages2() by truncate_pagecache() in such a case to avoid the A-A deadlock. However, we found another A-B-B-A deadlock related to the case above, which will cause the xfstests generic/464 testcase hung in our virtio-fs test environment. For example, consider two processes concurrently open one same file, one with O_TRUNC and another without O_TRUNC. The deadlock case is described below, if open(O_TRUNC) is already set_nowrite(acquired A), and is trying to lock a page (acquiring B), open() could have held the page lock (acquired B), and waiting on the page writeback (acquiring A). This would lead to deadlocks. open(O_TRUNC) ---------------------------------------------------------------- fuse_open_common inode_lock [C acquire] fuse_set_nowrite [A acquire] fuse_finish_open truncate_pagecache lock_page [B acquire] truncate_inode_page unlock_page [B release] fuse_release_nowrite [A release] inode_unlock [C release] ---------------------------------------------------------------- open() ---------------------------------------------------------------- fuse_open_common fuse_finish_open invalidate_inode_pages2 lock_page [B acquire] fuse_launder_page fuse_wait_on_page_writeback [A acquire & release] unlock_page [B release] ---------------------------------------------------------------- Besides this case, all calls of invalidate_inode_pages2() and invalidate_inode_pages2_range() in fuse code also can deadlock with open(O_TRUNC). Fix by moving the truncate_pagecache() call outside the nowrite protected region. The nowrite protection is only for delayed writeback (writeback_cache) case, where inode lock does not protect against truncation racing with writes on the server. Write syscalls racing with page cache truncation still get the inode lock protection. This patch also changes the order of filemap_invalidate_lock() vs. fuse_set_nowrite() in fuse_open_common(). This new order matches the order found in fuse_file_fallocate() and fuse_do_setattr(). Reported-by: Jiachen Zhang Tested-by: Jiachen Zhang Fixes: e4648309b85a ("fuse: truncate pending writes on O_TRUNC") Cc: Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 7 ++++++- fs/fuse/file.c | 30 +++++++++++++++++------------- 2 files changed, 23 insertions(+), 14 deletions(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 74303d6e987b..a93d675a726a 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -537,6 +537,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, struct fuse_file *ff; void *security_ctx = NULL; u32 security_ctxlen; + bool trunc = flags & O_TRUNC; /* Userspace expects S_IFREG in create mode */ BUG_ON((mode & S_IFMT) != S_IFREG); @@ -561,7 +562,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, inarg.mode = mode; inarg.umask = current_umask(); - if (fm->fc->handle_killpriv_v2 && (flags & O_TRUNC) && + if (fm->fc->handle_killpriv_v2 && trunc && !(flags & O_EXCL) && !capable(CAP_FSETID)) { inarg.open_flags |= FUSE_OPEN_KILL_SUIDGID; } @@ -623,6 +624,10 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, } else { file->private_data = ff; fuse_finish_open(inode, file); + if (fm->fc->atomic_o_trunc && trunc) + truncate_pagecache(inode, 0); + else if (!(ff->open_flags & FOPEN_KEEP_CACHE)) + invalidate_inode_pages2(inode->i_mapping); } return err; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 60885ff9157c..dfee142bca5c 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -210,13 +210,9 @@ void fuse_finish_open(struct inode *inode, struct file *file) fi->attr_version = atomic64_inc_return(&fc->attr_version); i_size_write(inode, 0); spin_unlock(&fi->lock); - truncate_pagecache(inode, 0); file_update_time(file); fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE); - } else if (!(ff->open_flags & FOPEN_KEEP_CACHE)) { - invalidate_inode_pages2(inode->i_mapping); } - if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache) fuse_link_write_file(file); } @@ -239,30 +235,38 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir) if (err) return err; - if (is_wb_truncate || dax_truncate) { + if (is_wb_truncate || dax_truncate) inode_lock(inode); - fuse_set_nowrite(inode); - } if (dax_truncate) { filemap_invalidate_lock(inode->i_mapping); err = fuse_dax_break_layouts(inode, 0, 0); if (err) - goto out; + goto out_inode_unlock; } + if (is_wb_truncate || dax_truncate) + fuse_set_nowrite(inode); + err = fuse_do_open(fm, get_node_id(inode), file, isdir); if (!err) fuse_finish_open(inode, file); -out: + if (is_wb_truncate || dax_truncate) + fuse_release_nowrite(inode); + if (!err) { + struct fuse_file *ff = file->private_data; + + if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) + truncate_pagecache(inode, 0); + else if (!(ff->open_flags & FOPEN_KEEP_CACHE)) + invalidate_inode_pages2(inode->i_mapping); + } if (dax_truncate) filemap_invalidate_unlock(inode->i_mapping); - - if (is_wb_truncate | dax_truncate) { - fuse_release_nowrite(inode); +out_inode_unlock: + if (is_wb_truncate || dax_truncate) inode_unlock(inode); - } return err; } -- cgit v1.2.3 From 47e301491c4f52ca744c7660cf57492b902261f6 Mon Sep 17 00:00:00 2001 From: Jeffle Xu Date: Sat, 2 Apr 2022 18:32:50 +0800 Subject: fuse: avoid unnecessary spinlock bump Move dmap free worker kicker inside the critical region, so that extra spinlock lock/unlock could be avoided. Suggested-by: Liu Jiang Signed-off-by: Jeffle Xu Reviewed-by: Stefan Hajnoczi Reviewed-by: Vivek Goyal Signed-off-by: Miklos Szeredi --- fs/fuse/dax.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c index 10eb50cbf398..e23e802a8013 100644 --- a/fs/fuse/dax.c +++ b/fs/fuse/dax.c @@ -138,9 +138,9 @@ static struct fuse_dax_mapping *alloc_dax_mapping(struct fuse_conn_dax *fcd) WARN_ON(fcd->nr_free_ranges <= 0); fcd->nr_free_ranges--; } + __kick_dmap_free_worker(fcd, 0); spin_unlock(&fcd->lock); - kick_dmap_free_worker(fcd, 0); return dmap; } -- cgit v1.2.3 From 47912eaa061a6a81e4aa790591a1874c650733c0 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 21 Jul 2022 16:06:18 +0200 Subject: fuse: limit nsec Limit nanoseconds to 0..999999999. Fixes: d8a5ba45457e ("[PATCH] FUSE - core") Cc: Signed-off-by: Miklos Szeredi --- fs/fuse/inode.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 8c0665c5dff8..7c290089e693 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -180,6 +180,12 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, inode->i_uid = make_kuid(fc->user_ns, attr->uid); inode->i_gid = make_kgid(fc->user_ns, attr->gid); inode->i_blocks = attr->blocks; + + /* Sanitize nsecs */ + attr->atimensec = min_t(u32, attr->atimensec, NSEC_PER_SEC - 1); + attr->mtimensec = min_t(u32, attr->mtimensec, NSEC_PER_SEC - 1); + attr->ctimensec = min_t(u32, attr->ctimensec, NSEC_PER_SEC - 1); + inode->i_atime.tv_sec = attr->atime; inode->i_atime.tv_nsec = attr->atimensec; /* mtime from server may be stale due to local buffered write */ -- cgit v1.2.3 From 02c0cab8e7345b06f1c0838df444e2902e4138d3 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 21 Jul 2022 16:06:18 +0200 Subject: fuse: ioctl: translate ENOSYS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Overlayfs may fail to complete updates when a filesystem lacks fileattr/xattr syscall support and responds with an ENOSYS error code, resulting in an unexpected "Function not implemented" error. This bug may occur with FUSE filesystems, such as davfs2. Steps to reproduce: # install davfs2, e.g., apk add davfs2 mkdir /test mkdir /test/lower /test/upper /test/work /test/mnt yes '' | mount -t davfs -o ro http://some-web-dav-server/path \ /test/lower mount -t overlay -o upperdir=/test/upper,lowerdir=/test/lower \ -o workdir=/test/work overlay /test/mnt # when "some-file" exists in the lowerdir, this fails with "Function # not implemented", with dmesg showing "overlayfs: failed to retrieve # lower fileattr (/some-file, err=-38)" touch /test/mnt/some-file The underlying cause of this regresion is actually in FUSE, which fails to translate the ENOSYS error code returned by userspace filesystem (which means that the ioctl operation is not supported) to ENOTTY. Reported-by: Christian Kohlschütter Fixes: 72db82115d2b ("ovl: copy up sync/noatime fileattr flags") Fixes: 59efec7b9039 ("fuse: implement ioctl support") Cc: Signed-off-by: Miklos Szeredi --- fs/fuse/ioctl.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c index 33cde4bbccdc..61d8afcb10a3 100644 --- a/fs/fuse/ioctl.c +++ b/fs/fuse/ioctl.c @@ -9,6 +9,17 @@ #include #include +static ssize_t fuse_send_ioctl(struct fuse_mount *fm, struct fuse_args *args) +{ + ssize_t ret = fuse_simple_request(fm, args); + + /* Translate ENOSYS, which shouldn't be returned from fs */ + if (ret == -ENOSYS) + ret = -ENOTTY; + + return ret; +} + /* * CUSE servers compiled on 32bit broke on 64bit kernels because the * ABI was defined to be 'struct iovec' which is different on 32bit @@ -259,7 +270,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, ap.args.out_pages = true; ap.args.out_argvar = true; - transferred = fuse_simple_request(fm, &ap.args); + transferred = fuse_send_ioctl(fm, &ap.args); err = transferred; if (transferred < 0) goto out; @@ -393,7 +404,7 @@ static int fuse_priv_ioctl(struct inode *inode, struct fuse_file *ff, args.out_args[1].size = inarg.out_size; args.out_args[1].value = ptr; - err = fuse_simple_request(fm, &args); + err = fuse_send_ioctl(fm, &args); if (!err) { if (outarg.result < 0) err = outarg.result; -- cgit v1.2.3 From c64797809a64c73497082aa05e401a062ec1af34 Mon Sep 17 00:00:00 2001 From: Xie Yongji Date: Mon, 18 Jul 2022 16:50:12 +0800 Subject: fuse: Remove the control interface for virtio-fs The commit 15c8e72e88e0 ("fuse: allow skipping control interface and forced unmount") tries to remove the control interface for virtio-fs since it does not support aborting requests which are being processed. But it doesn't work now. This patch fixes it by skipping creating the control interface if fuse_conn->no_control is set. Fixes: 15c8e72e88e0 ("fuse: allow skipping control interface and forced unmount") Signed-off-by: Xie Yongji Signed-off-by: Miklos Szeredi --- fs/fuse/control.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/fuse/control.c b/fs/fuse/control.c index 7cede9a3bc96..247ef4f76761 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -258,7 +258,7 @@ int fuse_ctl_add_conn(struct fuse_conn *fc) struct dentry *parent; char name[32]; - if (!fuse_control_sb) + if (!fuse_control_sb || fc->no_control) return 0; parent = fuse_control_sb->s_root; @@ -296,7 +296,7 @@ void fuse_ctl_remove_conn(struct fuse_conn *fc) { int i; - if (!fuse_control_sb) + if (!fuse_control_sb || fc->no_control) return; for (i = fc->ctl_ndents - 1; i >= 0; i--) { -- cgit v1.2.3 From 9ccf47b26b73ecf5b7278a4cb8d487d8ebb4c095 Mon Sep 17 00:00:00 2001 From: Dave Marchevsky Date: Mon, 11 Jul 2022 10:48:08 -0700 Subject: fuse: Add module param for CAP_SYS_ADMIN access bypassing allow_other Since commit 73f03c2b4b52 ("fuse: Restrict allow_other to the superblock's namespace or a descendant"), access to allow_other FUSE filesystems has been limited to users in the mounting user namespace or descendants. This prevents a process that is privileged in its userns - but not its parent namespaces - from mounting a FUSE fs w/ allow_other that is accessible to processes in parent namespaces. While this restriction makes sense overall it breaks a legitimate usecase: I have a tracing daemon which needs to peek into process' open files in order to symbolicate - similar to 'perf'. The daemon is a privileged process in the root userns, but is unable to peek into FUSE filesystems mounted by processes in child namespaces. This patch adds a module param, allow_sys_admin_access, to act as an escape hatch for this descendant userns logic and for the allow_other mount option in general. Setting allow_sys_admin_access allows processes with CAP_SYS_ADMIN in the initial userns to access FUSE filesystems irrespective of the mounting userns or whether allow_other was set. A sysadmin setting this param must trust FUSEs on the host to not DoS processes as described in 73f03c2b4b52. Signed-off-by: Dave Marchevsky Reviewed-by: Christian Brauner (Microsoft) Signed-off-by: Miklos Szeredi --- Documentation/filesystems/fuse.rst | 29 ++++++++++++++++++++++++----- fs/fuse/dir.c | 9 +++++++++ 2 files changed, 33 insertions(+), 5 deletions(-) diff --git a/Documentation/filesystems/fuse.rst b/Documentation/filesystems/fuse.rst index 8120c3c0cb4e..1e31e87aee68 100644 --- a/Documentation/filesystems/fuse.rst +++ b/Documentation/filesystems/fuse.rst @@ -279,7 +279,7 @@ How are requirements fulfilled? the filesystem or not. Note that the *ptrace* check is not strictly necessary to - prevent B/2/i, it is enough to check if mount owner has enough + prevent C/2/i, it is enough to check if mount owner has enough privilege to send signal to the process accessing the filesystem, since *SIGSTOP* can be used to get a similar effect. @@ -288,10 +288,29 @@ I think these limitations are unacceptable? If a sysadmin trusts the users enough, or can ensure through other measures, that system processes will never enter non-privileged -mounts, it can relax the last limitation with a 'user_allow_other' -config option. If this config option is set, the mounting user can -add the 'allow_other' mount option which disables the check for other -users' processes. +mounts, it can relax the last limitation in several ways: + + - With the 'user_allow_other' config option. If this config option is + set, the mounting user can add the 'allow_other' mount option which + disables the check for other users' processes. + + User namespaces have an unintuitive interaction with 'allow_other': + an unprivileged user - normally restricted from mounting with + 'allow_other' - could do so in a user namespace where they're + privileged. If any process could access such an 'allow_other' mount + this would give the mounting user the ability to manipulate + processes in user namespaces where they're unprivileged. For this + reason 'allow_other' restricts access to users in the same userns + or a descendant. + + - With the 'allow_sys_admin_access' module option. If this option is + set, super user's processes have unrestricted access to mounts + irrespective of allow_other setting or user namespace of the + mounting user. + +Note that both of these relaxations expose the system to potential +information leak or *DoS* as described in points B and C/2/i-ii in the +preceding section. Kernel - userspace interface ============================ diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index a93d675a726a..b585b04e815e 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -21,6 +22,11 @@ #include #include +static bool __read_mostly allow_sys_admin_access; +module_param(allow_sys_admin_access, bool, 0644); +MODULE_PARM_DESC(allow_sys_admin_access, + "Allow users with CAP_SYS_ADMIN in initial userns to bypass allow_other access check"); + static void fuse_advise_use_readdirplus(struct inode *dir) { struct fuse_inode *fi = get_fuse_inode(dir); @@ -1229,6 +1235,9 @@ int fuse_allow_current_process(struct fuse_conn *fc) { const struct cred *cred; + if (allow_sys_admin_access && capable(CAP_SYS_ADMIN)) + return 1; + if (fc->allow_other) return current_in_userns(fc->user_ns); -- cgit v1.2.3 From 1e5b9e048cda4d284827d22546a4cb0904689c5d Mon Sep 17 00:00:00 2001 From: Deming Wang Date: Thu, 9 Jun 2022 22:08:38 -0400 Subject: virtiofs: delete unused parameter for virtio_fs_cleanup_vqs fs parameter not used. So, it needs to be deleted. Signed-off-by: Deming Wang Reviewed-by: Stefan Hajnoczi Reviewed-by: Vivek Goyal Signed-off-by: Miklos Szeredi --- fs/fuse/virtio_fs.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 8db53fa67359..0991199d19c1 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -741,8 +741,7 @@ out: } /* Free virtqueues (device must already be reset) */ -static void virtio_fs_cleanup_vqs(struct virtio_device *vdev, - struct virtio_fs *fs) +static void virtio_fs_cleanup_vqs(struct virtio_device *vdev) { vdev->config->del_vqs(vdev); } @@ -895,7 +894,7 @@ static int virtio_fs_probe(struct virtio_device *vdev) out_vqs: virtio_reset_device(vdev); - virtio_fs_cleanup_vqs(vdev, fs); + virtio_fs_cleanup_vqs(vdev); kfree(fs->vqs); out: @@ -927,7 +926,7 @@ static void virtio_fs_remove(struct virtio_device *vdev) virtio_fs_stop_all_queues(fs); virtio_fs_drain_all_queues_locked(fs); virtio_reset_device(vdev); - virtio_fs_cleanup_vqs(vdev, fs); + virtio_fs_cleanup_vqs(vdev); vdev->priv = NULL; /* Put device reference on virtio_fs object */ -- cgit v1.2.3 From 73fb2c8b61783e2e8a87f91d141bf72a12404566 Mon Sep 17 00:00:00 2001 From: Deming Wang Date: Wed, 22 Jun 2022 17:17:58 -0400 Subject: virtio_fs: Modify format for virtio_fs_direct_access We should isolate operators with spaces. Signed-off-by: Deming Wang Signed-off-by: Miklos Szeredi --- fs/fuse/virtio_fs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 0991199d19c1..4d8d4f16c727 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -756,7 +756,7 @@ static long virtio_fs_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, { struct virtio_fs *fs = dax_get_private(dax_dev); phys_addr_t offset = PFN_PHYS(pgoff); - size_t max_nr_pages = fs->window_len/PAGE_SIZE - pgoff; + size_t max_nr_pages = fs->window_len / PAGE_SIZE - pgoff; if (kaddr) *kaddr = fs->window_kaddr + offset; -- cgit v1.2.3 From 04b9407197789c81fffac52921e703cb47967d6a Mon Sep 17 00:00:00 2001 From: Daniil Lunev Date: Wed, 27 Jul 2022 16:44:24 +1000 Subject: vfs: function to prevent re-use of block-device-based superblocks The function is to be called from filesystem-specific code to mark a superblock to be ignored by superblock test and thus never re-used. The function also unregisters bdi if the bdi is per-superblock to avoid collision if a new superblock is created to represent the filesystem. generic_shutdown_super() skips unregistering bdi for a retired superlock as it assumes retire function has already done it. This patch adds the functionality only for the block-device-based supers, since the primary use case of the feature is to gracefully handle force unmount of external devices, mounted with FUSE. This can be further extended to cover all superblocks, if the need arises. Signed-off-by: Daniil Lunev Reviewed-by: Christoph Hellwig Signed-off-by: Miklos Szeredi --- fs/super.c | 33 +++++++++++++++++++++++++++++++-- include/linux/fs.h | 2 ++ 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/fs/super.c b/fs/super.c index 60f57c7bc0a6..258d9257b0a7 100644 --- a/fs/super.c +++ b/fs/super.c @@ -422,6 +422,35 @@ bool trylock_super(struct super_block *sb) return false; } +/** + * retire_super - prevents superblock from being reused + * @sb: superblock to retire + * + * The function marks superblock to be ignored in superblock test, which + * prevents it from being reused for any new mounts. If the superblock has + * a private bdi, it also unregisters it, but doesn't reduce the refcount + * of the superblock to prevent potential races. The refcount is reduced + * by generic_shutdown_super(). The function can not be called + * concurrently with generic_shutdown_super(). It is safe to call the + * function multiple times, subsequent calls have no effect. + * + * The marker will affect the re-use only for block-device-based + * superblocks. Other superblocks will still get marked if this function + * is used, but that will not affect their reusability. + */ +void retire_super(struct super_block *sb) +{ + WARN_ON(!sb->s_bdev); + down_write(&sb->s_umount); + if (sb->s_iflags & SB_I_PERSB_BDI) { + bdi_unregister(sb->s_bdi); + sb->s_iflags &= ~SB_I_PERSB_BDI; + } + sb->s_iflags |= SB_I_RETIRED; + up_write(&sb->s_umount); +} +EXPORT_SYMBOL(retire_super); + /** * generic_shutdown_super - common helper for ->kill_sb() * @sb: superblock to kill @@ -1216,7 +1245,7 @@ static int set_bdev_super_fc(struct super_block *s, struct fs_context *fc) static int test_bdev_super_fc(struct super_block *s, struct fs_context *fc) { - return s->s_bdev == fc->sget_key; + return !(s->s_iflags & SB_I_RETIRED) && s->s_bdev == fc->sget_key; } /** @@ -1307,7 +1336,7 @@ EXPORT_SYMBOL(get_tree_bdev); static int test_bdev_super(struct super_block *s, void *data) { - return (void *)s->s_bdev == data; + return !(s->s_iflags & SB_I_RETIRED) && (void *)s->s_bdev == data; } struct dentry *mount_bdev(struct file_system_type *fs_type, diff --git a/include/linux/fs.h b/include/linux/fs.h index 9ad5e3520fae..246120ee0573 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1412,6 +1412,7 @@ extern int send_sigurg(struct fown_struct *fown); #define SB_I_SKIP_SYNC 0x00000100 /* Skip superblock at global sync */ #define SB_I_PERSB_BDI 0x00000200 /* has a per-sb bdi */ #define SB_I_TS_EXPIRY_WARNED 0x00000400 /* warned about timestamp range expiry */ +#define SB_I_RETIRED 0x00000800 /* superblock shouldn't be reused */ /* Possible states of 'frozen' field */ enum { @@ -2432,6 +2433,7 @@ extern struct dentry *mount_nodev(struct file_system_type *fs_type, int flags, void *data, int (*fill_super)(struct super_block *, void *, int)); extern struct dentry *mount_subtree(struct vfsmount *mnt, const char *path); +void retire_super(struct super_block *sb); void generic_shutdown_super(struct super_block *sb); void kill_block_super(struct super_block *sb); void kill_anon_super(struct super_block *sb); -- cgit v1.2.3 From 247861c325c2e4f5ad3c2f9a77ab9d85d15cbcfc Mon Sep 17 00:00:00 2001 From: Daniil Lunev Date: Wed, 27 Jul 2022 16:44:25 +1000 Subject: fuse: retire block-device-based superblock on force unmount Force unmount of FUSE severes the connection with the user space, even if there are still open files. Subsequent remount tries to re-use the superblock held by the open files, which is meaningless in the FUSE case after disconnect - reused super block doesn't have userspace counterpart attached to it and is incapable of doing any IO. This patch adds the functionality only for the block-device-based supers, since the primary use case of the feature is to gracefully handle force unmount of external devices, mounted with FUSE. This can be further extended to cover all superblocks, if the need arises. Signed-off-by: Daniil Lunev Signed-off-by: Miklos Szeredi --- fs/fuse/inode.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 7c290089e693..6b3beda16c1b 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -482,8 +482,14 @@ static void fuse_umount_begin(struct super_block *sb) { struct fuse_conn *fc = get_fuse_conn_super(sb); - if (!fc->no_force_umount) - fuse_abort_conn(fc); + if (fc->no_force_umount) + return; + + fuse_abort_conn(fc); + + // Only retire block-device-based superblocks. + if (sb->s_bdev != NULL) + retire_super(sb); } static void fuse_send_destroy(struct fuse_mount *fm) -- cgit v1.2.3