From 32b1924b210a70dcacdf65abd687c5ef86a67541 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 9 Apr 2020 11:29:47 +0300 Subject: ovl: skip overlayfs superblocks at global sync Stacked filesystems like overlayfs has no own writeback, but they have to forward syncfs() requests to backend for keeping data integrity. During global sync() each overlayfs instance calls method ->sync_fs() for backend although it itself is in global list of superblocks too. As a result one syscall sync() could write one superblock several times and send multiple disk barriers. This patch adds flag SB_I_SKIP_SYNC into sb->sb_iflags to avoid that. Reported-by: Dmitry Monakhov Signed-off-by: Konstantin Khlebnikov Reviewed-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- include/linux/fs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 4f6f59b4f22a..f186a966a36c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1409,6 +1409,8 @@ extern int send_sigurg(struct fown_struct *fown); #define SB_I_IMA_UNVERIFIABLE_SIGNATURE 0x00000020 #define SB_I_UNTRUSTED_MOUNTER 0x00000040 +#define SB_I_SKIP_SYNC 0x00000100 /* Skip superblock at global sync */ + /* Possible states of 'frozen' field */ enum { SB_UNFROZEN = 0, /* FS is unfrozen */ -- cgit v1.2.3 From df820f8de4e481222b17f9bcee7b909ae8167529 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 4 Jun 2020 10:48:19 +0200 Subject: ovl: make private mounts longterm Overlayfs is using clone_private_mount() to create internal mounts for underlying layers. These are used for operations requiring a path, such as dentry_open(). Since these private mounts are not in any namespace they are treated as short term, "detached" mounts and mntput() involves taking the global mount_lock, which can result in serious cacheline pingpong. Make these private mounts longterm instead, which trade the penalty on mntput() for a slightly longer shutdown time due to an added RCU grace period when putting these mounts. Introduce a new helper kern_unmount_many() that can take care of multiple longterm mounts with a single RCU grace period. Cc: Al Viro Signed-off-by: Miklos Szeredi --- Documentation/filesystems/porting.rst | 7 +++++++ fs/namespace.c | 16 ++++++++++++++++ fs/overlayfs/super.c | 7 ++++++- include/linux/mount.h | 2 ++ 4 files changed, 31 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst index 26c093969573..867036aa90b8 100644 --- a/Documentation/filesystems/porting.rst +++ b/Documentation/filesystems/porting.rst @@ -858,3 +858,10 @@ be misspelled d_alloc_anon(). [should've been added in 2016] stale comment in finish_open() nonwithstanding, failure exits in ->atomic_open() instances should *NOT* fput() the file, no matter what. Everything is handled by the caller. + +--- + +**mandatory** + +clone_private_mount() returns a longterm mount now, so the proper destructor of +its result is kern_unmount() or kern_unmount_array(). diff --git a/fs/namespace.c b/fs/namespace.c index a28e4db075ed..d53517f1d741 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1879,6 +1879,9 @@ struct vfsmount *clone_private_mount(const struct path *path) if (IS_ERR(new_mnt)) return ERR_CAST(new_mnt); + /* Longterm mount to be removed by kern_unmount*() */ + new_mnt->mnt_ns = MNT_NS_INTERNAL; + return &new_mnt->mnt; } EXPORT_SYMBOL_GPL(clone_private_mount); @@ -3804,6 +3807,19 @@ void kern_unmount(struct vfsmount *mnt) } EXPORT_SYMBOL(kern_unmount); +void kern_unmount_array(struct vfsmount *mnt[], unsigned int num) +{ + unsigned int i; + + for (i = 0; i < num; i++) + if (mnt[i]) + real_mount(mnt[i])->mnt_ns = NULL; + synchronize_rcu_expedited(); + for (i = 0; i < num; i++) + mntput(mnt[i]); +} +EXPORT_SYMBOL(kern_unmount_array); + bool our_mnt(struct vfsmount *mnt) { return check_mnt(real_mount(mnt)); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index eb81d8760a6a..8d8cd46e1482 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -211,6 +211,7 @@ static void ovl_destroy_inode(struct inode *inode) static void ovl_free_fs(struct ovl_fs *ofs) { + struct vfsmount **mounts; unsigned i; iput(ofs->workbasedir_trap); @@ -224,10 +225,14 @@ static void ovl_free_fs(struct ovl_fs *ofs) dput(ofs->workbasedir); if (ofs->upperdir_locked) ovl_inuse_unlock(ovl_upper_mnt(ofs)->mnt_root); + + /* Hack! Reuse ofs->layers as a vfsmount array before freeing it */ + mounts = (struct vfsmount **) ofs->layers; for (i = 0; i < ofs->numlayer; i++) { iput(ofs->layers[i].trap); - mntput(ofs->layers[i].mnt); + mounts[i] = ofs->layers[i].mnt; } + kern_unmount_array(mounts, ofs->numlayer); kfree(ofs->layers); for (i = 0; i < ofs->numfs; i++) free_anon_bdev(ofs->fs[i].pseudo_dev); diff --git a/include/linux/mount.h b/include/linux/mount.h index bf8cc4108b8f..8de95a0bec8d 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -109,4 +109,6 @@ extern unsigned int sysctl_mount_max; extern bool path_is_mountpoint(const struct path *path); +extern void kern_unmount_array(struct vfsmount *mnt[], unsigned int num); + #endif /* _LINUX_MOUNT_H */ -- cgit v1.2.3