From 9f6c61f96f2d97cbb5f7fa85607bc398f843ff0f Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 14 May 2020 16:44:24 +0200 Subject: proc/mounts: add cursor If mounts are deleted after a read(2) call on /proc/self/mounts (or its kin), the subsequent read(2) could miss a mount that comes after the deleted one in the list. This is because the file position is interpreted as the number of mount entries from the start of the list. E.g. first read gets entries #0 to #9; the seq file index will be 10. Then entry #5 is deleted, resulting in #10 becoming #9 and #11 becoming #10, etc... The next read will continue from entry #10, and #9 is missed. Solve this by adding a cursor entry for each open instance. Taking the global namespace_sem for write seems excessive, since we are only dealing with a per-namespace list. Instead add a per-namespace spinlock and use that together with namespace_sem taken for read to protect against concurrent modification of the mount list. This may reduce parallelism of is_local_mountpoint(), but it's hardly a big contention point. We could also use RCU freeing of cursors to make traversal not need additional locks, if that turns out to be necessary. Only move the cursor once for each read (cursor is not added on open) to minimize cacheline invalidation. When EOF is reached, the cursor is taken off the list, in order to prevent an excessive number of cursors due to inactive open file descriptors. 
Reported-by: Karel Zak Signed-off-by: Miklos Szeredi --- include/linux/mount.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux/mount.h') diff --git a/include/linux/mount.h b/include/linux/mount.h index bf8cc4108b8f..7edac8c7a9c1 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -50,7 +50,8 @@ struct fs_context; #define MNT_ATIME_MASK (MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME ) #define MNT_INTERNAL_FLAGS (MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | \ - MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED) + MNT_DOOMED | MNT_SYNC_UMOUNT | MNT_MARKED | \ + MNT_CURSOR) #define MNT_INTERNAL 0x4000 @@ -64,6 +65,7 @@ struct fs_context; #define MNT_SYNC_UMOUNT 0x2000000 #define MNT_MARKED 0x4000000 #define MNT_UMOUNT 0x8000000 +#define MNT_CURSOR 0x10000000 struct vfsmount { struct dentry *mnt_root; /* root of the mounted tree */ -- cgit v1.2.3 From df820f8de4e481222b17f9bcee7b909ae8167529 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 4 Jun 2020 10:48:19 +0200 Subject: ovl: make private mounts longterm Overlayfs is using clone_private_mount() to create internal mounts for underlying layers. These are used for operations requiring a path, such as dentry_open(). Since these private mounts are not in any namespace they are treated as short term, "detached" mounts and mntput() involves taking the global mount_lock, which can result in serious cacheline pingpong. Make these private mounts longterm instead, which trades the penalty on mntput() for a slightly longer shutdown time due to an added RCU grace period when putting these mounts. Introduce a new helper kern_unmount_array() that can take care of multiple longterm mounts with a single RCU grace period. 
Cc: Al Viro Signed-off-by: Miklos Szeredi --- Documentation/filesystems/porting.rst | 7 +++++++ fs/namespace.c | 16 ++++++++++++++++ fs/overlayfs/super.c | 7 ++++++- include/linux/mount.h | 2 ++ 4 files changed, 31 insertions(+), 1 deletion(-) (limited to 'include/linux/mount.h') diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst index 26c093969573..867036aa90b8 100644 --- a/Documentation/filesystems/porting.rst +++ b/Documentation/filesystems/porting.rst @@ -858,3 +858,10 @@ be misspelled d_alloc_anon(). [should've been added in 2016] stale comment in finish_open() nonwithstanding, failure exits in ->atomic_open() instances should *NOT* fput() the file, no matter what. Everything is handled by the caller. + +--- + +**mandatory** + +clone_private_mount() returns a longterm mount now, so the proper destructor of +its result is kern_unmount() or kern_unmount_array(). diff --git a/fs/namespace.c b/fs/namespace.c index a28e4db075ed..d53517f1d741 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1879,6 +1879,9 @@ struct vfsmount *clone_private_mount(const struct path *path) if (IS_ERR(new_mnt)) return ERR_CAST(new_mnt); + /* Longterm mount to be removed by kern_unmount*() */ + new_mnt->mnt_ns = MNT_NS_INTERNAL; + return &new_mnt->mnt; } EXPORT_SYMBOL_GPL(clone_private_mount); @@ -3804,6 +3807,19 @@ void kern_unmount(struct vfsmount *mnt) } EXPORT_SYMBOL(kern_unmount); +void kern_unmount_array(struct vfsmount *mnt[], unsigned int num) +{ + unsigned int i; + + for (i = 0; i < num; i++) + if (mnt[i]) + real_mount(mnt[i])->mnt_ns = NULL; + synchronize_rcu_expedited(); + for (i = 0; i < num; i++) + mntput(mnt[i]); +} +EXPORT_SYMBOL(kern_unmount_array); + bool our_mnt(struct vfsmount *mnt) { return check_mnt(real_mount(mnt)); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index eb81d8760a6a..8d8cd46e1482 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -211,6 +211,7 @@ static void ovl_destroy_inode(struct 
inode *inode) static void ovl_free_fs(struct ovl_fs *ofs) { + struct vfsmount **mounts; unsigned i; iput(ofs->workbasedir_trap); @@ -224,10 +225,14 @@ static void ovl_free_fs(struct ovl_fs *ofs) dput(ofs->workbasedir); if (ofs->upperdir_locked) ovl_inuse_unlock(ovl_upper_mnt(ofs)->mnt_root); + + /* Hack! Reuse ofs->layers as a vfsmount array before freeing it */ + mounts = (struct vfsmount **) ofs->layers; for (i = 0; i < ofs->numlayer; i++) { iput(ofs->layers[i].trap); - mntput(ofs->layers[i].mnt); + mounts[i] = ofs->layers[i].mnt; } + kern_unmount_array(mounts, ofs->numlayer); kfree(ofs->layers); for (i = 0; i < ofs->numfs; i++) free_anon_bdev(ofs->fs[i].pseudo_dev); diff --git a/include/linux/mount.h b/include/linux/mount.h index bf8cc4108b8f..8de95a0bec8d 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -109,4 +109,6 @@ extern unsigned int sysctl_mount_max; extern bool path_is_mountpoint(const struct path *path); +extern void kern_unmount_array(struct vfsmount *mnt[], unsigned int num); + #endif /* _LINUX_MOUNT_H */ -- cgit v1.2.3