diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2022-12-13 06:30:18 +0300 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2022-12-13 06:30:18 +0300 |
commit | 9b93f5069fd95cea7915aab321fd74d2548ba75c (patch) | |
tree | f6812b3641a8b395f4cf28b369213c090cee4dad /fs/namespace.c | |
parent | e1212e9b6f06016c62b1ee6fe7772293b90e695a (diff) | |
parent | 5a6f52d20ce3cd6d30103a27f18edff337da191b (diff) | |
download | linux-9b93f5069fd95cea7915aab321fd74d2548ba75c.tar.xz |
Merge tag 'fs.idmapped.mnt_idmap.v6.2' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/idmapping
Pull idmapping updates from Christian Brauner:
"Last cycle we've already made the interaction with idmapped mounts
more robust and type safe by introducing the vfs{g,u}id_t type. This
cycle we concluded the conversion and removed the legacy helpers.
Currently we still pass around the plain namespace that was attached
to a mount. This is in general pretty convenient but it makes it easy
to conflate namespaces that are relevant on the filesystem - with
namespaces that are relevent on the mount level. Especially for
filesystem developers without detailed knowledge in this area this can
be a potential source for bugs.
Instead of passing the plain namespace we introduce a dedicated type
struct mnt_idmap and replace the pointer with a pointer to a struct
mnt_idmap. There are no semantic or size changes for the mount struct
caused by this.
We then start converting all places aware of idmapped mounts to rely
on struct mnt_idmap. Once the conversion is done all helpers down to
the really low-level make_vfs{g,u}id() and from_vfs{g,u}id() will take
a struct mnt_idmap argument instead of two namespace arguments. This
way it becomes impossible to conflate the two removing and thus
eliminating the possibility of any bugs. Fwiw, I fixed some issues in
that area a while ago in ntfs3 and ksmbd in the past. Afterwards only
low-level code can ultimately use the associated namespace for any
permission checks. Even most of the vfs can be completely obivious
about this ultimately and filesystems will never interact with it in
any form in the future.
A struct mnt_idmap currently encompasses a simple refcount and pointer
to the relevant namespace the mount is idmapped to. If a mount isn't
idmapped then it will point to a static nop_mnt_idmap and if it
doesn't that it is idmapped. As usual there are no allocations or
anything happening for non-idmapped mounts. Everthing is carefully
written to be a nop for non-idmapped mounts as has always been the
case.
If an idmapped mount is created a struct mnt_idmap is allocated and a
reference taken on the relevant namespace. Each mount that gets
idmapped or inherits the idmap simply bumps the reference count on
struct mnt_idmap. Just a reminder that we only allow a mount to change
it's idmapping a single time and only if it hasn't already been
attached to the filesystems and has no active writers.
The actual changes are fairly straightforward but this will have huge
benefits for maintenance and security in the long run even if it
causes some churn.
Note that this also makes it possible to extend struct mount_idmap in
the future. For example, it would be possible to place the namespace
pointer in an anonymous union together with an idmapping struct. This
would allow us to expose an api to userspace that would let it specify
idmappings directly instead of having to go through the detour of
setting up namespaces at all"
* tag 'fs.idmapped.mnt_idmap.v6.2' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/idmapping:
acl: conver higher-level helpers to rely on mnt_idmap
fs: introduce dedicated idmap type for mounts
Diffstat (limited to 'fs/namespace.c')
-rw-r--r-- | fs/namespace.c | 176 |
1 files changed, 142 insertions, 34 deletions
diff --git a/fs/namespace.c b/fs/namespace.c index c80f422084eb..ab467ee58341 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -75,6 +75,22 @@ static DECLARE_RWSEM(namespace_sem); static HLIST_HEAD(unmounted); /* protected by namespace_sem */ static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */ +struct mnt_idmap { + struct user_namespace *owner; + refcount_t count; +}; + +/* + * Carries the initial idmapping of 0:0:4294967295 which is an identity + * mapping. This means that {g,u}id 0 is mapped to {g,u}id 0, {g,u}id 1 is + * mapped to {g,u}id 1, [...], {g,u}id 1000 to {g,u}id 1000, [...]. + */ +struct mnt_idmap nop_mnt_idmap = { + .owner = &init_user_ns, + .count = REFCOUNT_INIT(1), +}; +EXPORT_SYMBOL_GPL(nop_mnt_idmap); + struct mount_kattr { unsigned int attr_set; unsigned int attr_clr; @@ -82,6 +98,7 @@ struct mount_kattr { unsigned int lookup_flags; bool recurse; struct user_namespace *mnt_userns; + struct mnt_idmap *mnt_idmap; }; /* /sys/fs */ @@ -193,6 +210,104 @@ int mnt_get_count(struct mount *mnt) #endif } +/** + * mnt_idmap_owner - retrieve owner of the mount's idmapping + * @idmap: mount idmapping + * + * This helper will go away once the conversion to use struct mnt_idmap + * everywhere has finished at which point the helper will be unexported. + * + * Only code that needs to perform permission checks based on the owner of the + * idmapping will get access to it. All other code will solely rely on + * idmappings. This will get us type safety so it's impossible to conflate + * filesystems idmappings with mount idmappings. + * + * Return: The owner of the idmapping. + */ +struct user_namespace *mnt_idmap_owner(const struct mnt_idmap *idmap) +{ + return idmap->owner; +} +EXPORT_SYMBOL_GPL(mnt_idmap_owner); + +/** + * mnt_user_ns - retrieve owner of an idmapped mount + * @mnt: the relevant vfsmount + * + * This helper will go away once the conversion to use struct mnt_idmap + * everywhere has finished at which point the helper will be unexported. + * + * Only code that needs to perform permission checks based on the owner of the + * idmapping will get access to it. All other code will solely rely on + * idmappings. This will get us type safety so it's impossible to conflate + * filesystems idmappings with mount idmappings. + * + * Return: The owner of the idmapped. + */ +struct user_namespace *mnt_user_ns(const struct vfsmount *mnt) +{ + struct mnt_idmap *idmap = mnt_idmap(mnt); + + /* Return the actual owner of the filesystem instead of the nop. */ + if (idmap == &nop_mnt_idmap && + !initial_idmapping(mnt->mnt_sb->s_user_ns)) + return mnt->mnt_sb->s_user_ns; + return mnt_idmap_owner(idmap); +} +EXPORT_SYMBOL_GPL(mnt_user_ns); + +/** + * alloc_mnt_idmap - allocate a new idmapping for the mount + * @mnt_userns: owning userns of the idmapping + * + * Allocate a new struct mnt_idmap which carries the idmapping of the mount. + * + * Return: On success a new idmap, on error an error pointer is returned. + */ +static struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns) +{ + struct mnt_idmap *idmap; + + idmap = kzalloc(sizeof(struct mnt_idmap), GFP_KERNEL_ACCOUNT); + if (!idmap) + return ERR_PTR(-ENOMEM); + + idmap->owner = get_user_ns(mnt_userns); + refcount_set(&idmap->count, 1); + return idmap; +} + +/** + * mnt_idmap_get - get a reference to an idmapping + * @idmap: the idmap to bump the reference on + * + * If @idmap is not the @nop_mnt_idmap bump the reference count. + * + * Return: @idmap with reference count bumped if @not_mnt_idmap isn't passed. + */ +static inline struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap) +{ + if (idmap != &nop_mnt_idmap) + refcount_inc(&idmap->count); + + return idmap; +} + +/** + * mnt_idmap_put - put a reference to an idmapping + * @idmap: the idmap to put the reference on + * + * If this is a non-initial idmapping, put the reference count when a mount is + * released and free it if we're the last user. + */ +static inline void mnt_idmap_put(struct mnt_idmap *idmap) +{ + if (idmap != &nop_mnt_idmap && refcount_dec_and_test(&idmap->count)) { + put_user_ns(idmap->owner); + kfree(idmap); + } +} + static struct mount *alloc_vfsmnt(const char *name) { struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL); @@ -232,7 +347,7 @@ static struct mount *alloc_vfsmnt(const char *name) INIT_HLIST_NODE(&mnt->mnt_mp_list); INIT_LIST_HEAD(&mnt->mnt_umounting); INIT_HLIST_HEAD(&mnt->mnt_stuck_children); - mnt->mnt.mnt_userns = &init_user_ns; + mnt->mnt.mnt_idmap = &nop_mnt_idmap; } return mnt; @@ -602,11 +717,7 @@ int sb_prepare_remount_readonly(struct super_block *sb) static void free_vfsmnt(struct mount *mnt) { - struct user_namespace *mnt_userns; - - mnt_userns = mnt_user_ns(&mnt->mnt); - if (!initial_idmapping(mnt_userns)) - put_user_ns(mnt_userns); + mnt_idmap_put(mnt_idmap(&mnt->mnt)); kfree_const(mnt->mnt_devname); #ifdef CONFIG_SMP free_percpu(mnt->mnt_pcp); @@ -1009,7 +1120,6 @@ static struct mount *skip_mnt_tree(struct mount *p) struct vfsmount *vfs_create_mount(struct fs_context *fc) { struct mount *mnt; - struct user_namespace *fs_userns; if (!fc->root) return ERR_PTR(-EINVAL); @@ -1027,10 +1137,6 @@ struct vfsmount *vfs_create_mount(struct fs_context *fc) mnt->mnt_mountpoint = mnt->mnt.mnt_root; mnt->mnt_parent = mnt; - fs_userns = mnt->mnt.mnt_sb->s_user_ns; - if (!initial_idmapping(fs_userns)) - mnt->mnt.mnt_userns = get_user_ns(fs_userns); - lock_mount_hash(); list_add_tail(&mnt->mnt_instance, &mnt->mnt.mnt_sb->s_mounts); unlock_mount_hash(); @@ -1120,9 +1226,8 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL); atomic_inc(&sb->s_active); - mnt->mnt.mnt_userns = mnt_user_ns(&old->mnt); - if (!initial_idmapping(mnt->mnt.mnt_userns)) - mnt->mnt.mnt_userns = get_user_ns(mnt->mnt.mnt_userns); + mnt->mnt.mnt_idmap = mnt_idmap_get(mnt_idmap(&old->mnt)); + mnt->mnt.mnt_sb = sb; mnt->mnt.mnt_root = dget(root); mnt->mnt_mountpoint = mnt->mnt.mnt_root; @@ -3982,14 +4087,14 @@ static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt) struct vfsmount *m = &mnt->mnt; struct user_namespace *fs_userns = m->mnt_sb->s_user_ns; - if (!kattr->mnt_userns) + if (!kattr->mnt_idmap) return 0; /* * Creating an idmapped mount with the filesystem wide idmapping * doesn't make sense so block that. We don't allow mushy semantics. */ - if (kattr->mnt_userns == fs_userns) + if (mnt_idmap_owner(kattr->mnt_idmap) == fs_userns) return -EINVAL; /* @@ -4029,7 +4134,7 @@ static inline bool mnt_allow_writers(const struct mount_kattr *kattr, { return (!(kattr->attr_set & MNT_READONLY) || (mnt->mnt.mnt_flags & MNT_READONLY)) && - !kattr->mnt_userns; + !kattr->mnt_idmap; } static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt) @@ -4083,27 +4188,18 @@ static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt) static void do_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt) { - struct user_namespace *mnt_userns, *old_mnt_userns; - - if (!kattr->mnt_userns) + if (!kattr->mnt_idmap) return; /* - * We're the only ones able to change the mount's idmapping. So - * mnt->mnt.mnt_userns is stable and we can retrieve it directly. - */ - old_mnt_userns = mnt->mnt.mnt_userns; - - mnt_userns = get_user_ns(kattr->mnt_userns); - /* Pairs with smp_load_acquire() in mnt_user_ns(). */ - smp_store_release(&mnt->mnt.mnt_userns, mnt_userns); - - /* - * If this is an idmapped filesystem drop the reference we've taken - * in vfs_create_mount() before. + * Pairs with smp_load_acquire() in mnt_idmap(). + * + * Since we only allow a mount to change the idmapping once and + * verified this in can_idmap_mount() we know that the mount has + * @nop_mnt_idmap attached to it. So there's no need to drop any + * references. */ - if (!initial_idmapping(old_mnt_userns)) - put_user_ns(old_mnt_userns); + smp_store_release(&mnt->mnt.mnt_idmap, mnt_idmap_get(kattr->mnt_idmap)); } static void mount_setattr_commit(struct mount_kattr *kattr, struct mount *mnt) @@ -4137,6 +4233,15 @@ static int do_mount_setattr(struct path *path, struct mount_kattr *kattr) if (path->dentry != mnt->mnt.mnt_root) return -EINVAL; + if (kattr->mnt_userns) { + struct mnt_idmap *mnt_idmap; + + mnt_idmap = alloc_mnt_idmap(kattr->mnt_userns); + if (IS_ERR(mnt_idmap)) + return PTR_ERR(mnt_idmap); + kattr->mnt_idmap = mnt_idmap; + } + if (kattr->propagation) { /* * Only take namespace_lock() if we're actually changing @@ -4324,6 +4429,9 @@ static void finish_mount_kattr(struct mount_kattr *kattr) { put_user_ns(kattr->mnt_userns); kattr->mnt_userns = NULL; + + if (kattr->mnt_idmap) + mnt_idmap_put(kattr->mnt_idmap); } SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path, |