Diffstat (limited to 'fs/namespace.c')
-rw-r--r-- | fs/namespace.c | 1998
1 file changed, 1238 insertions(+), 760 deletions(-)
diff --git a/fs/namespace.c b/fs/namespace.c
index 23e81c2a1e3f..ddfd4457d338 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -32,7 +32,7 @@
 #include <linux/fs_context.h>
 #include <linux/shmem_fs.h>
 #include <linux/mnt_idmapping.h>
-#include <linux/nospec.h>
+#include <linux/pidfs.h>
 
 #include "pnode.h"
 #include "internal.h"
@@ -66,12 +66,12 @@ static int __init set_mphash_entries(char *str)
 __setup("mphash_entries=", set_mphash_entries);
 
 static u64 event;
-static DEFINE_IDA(mnt_id_ida);
+static DEFINE_XARRAY_FLAGS(mnt_id_xa, XA_FLAGS_ALLOC);
 static DEFINE_IDA(mnt_group_ida);
 
 /* Don't allow confusion with old 32bit mount ID */
 #define MNT_UNIQUE_ID_OFFSET (1ULL << 31)
-static atomic64_t mnt_id_ctr = ATOMIC64_INIT(MNT_UNIQUE_ID_OFFSET);
+static u64 mnt_id_ctr = MNT_UNIQUE_ID_OFFSET;
 
 static struct hlist_head *mount_hashtable __ro_after_init;
 static struct hlist_head *mountpoint_hashtable __ro_after_init;
@@ -79,15 +79,26 @@ static struct kmem_cache *mnt_cache __ro_after_init;
 static DECLARE_RWSEM(namespace_sem);
 static HLIST_HEAD(unmounted);	/* protected by namespace_sem */
 static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
-static DEFINE_RWLOCK(mnt_ns_tree_lock);
+static struct mnt_namespace *emptied_ns; /* protected by namespace_sem */
+static DEFINE_SEQLOCK(mnt_ns_tree_lock);
+
+#ifdef CONFIG_FSNOTIFY
+LIST_HEAD(notify_list); /* protected by namespace_sem */
+#endif
 
 static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */
+static LIST_HEAD(mnt_ns_list); /* protected by mnt_ns_tree_lock */
+
+enum mount_kattr_flags_t {
+        MOUNT_KATTR_RECURSE             = (1 << 0),
+        MOUNT_KATTR_IDMAP_REPLACE       = (1 << 1),
+};
 
 struct mount_kattr {
         unsigned int attr_set;
         unsigned int attr_clr;
         unsigned int propagation;
         unsigned int lookup_flags;
-        bool recurse;
+        enum mount_kattr_flags_t kflags;
         struct user_namespace *mnt_userns;
         struct mnt_idmap *mnt_idmap;
 };
@@ -106,17 +117,6 @@ EXPORT_SYMBOL_GPL(fs_kobj);
  */
 __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);
 
-static int mnt_ns_cmp(u64 seq, const struct mnt_namespace *ns)
-{
-        u64 seq_b = ns->seq;
-
-        if (seq < seq_b)
-                return -1;
-        if (seq > seq_b)
-                return 1;
-        return 0;
-}
-
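One API note on the mount_kattr hunk above: a lone boolean parameter tends to proliferate into several, so the patch replaces bool recurse with an extensible flags enum (MOUNT_KATTR_IDMAP_REPLACE can then ride along without another signature change). A minimal compilable sketch of that conversion, with all names hypothetical and not from this patch:

    #include <stddef.h>

    enum walk_flags_t {
            WALK_RECURSE = (1 << 0),        /* was: bool recurse */
            WALK_VERBOSE = (1 << 1),        /* later option, same signature */
    };

    struct node {
            struct node *child;
            struct node *next;
    };

    static int walk(struct node *n, int (*visit)(struct node *),
                    enum walk_flags_t flags)
    {
            int err = visit(n);

            if (err || !(flags & WALK_RECURSE))
                    return err;
            for (n = n->child; n && !err; n = n->next)
                    err = walk(n, visit, flags);
            return err;
    }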
 static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node)
 {
         if (!node)
@@ -124,68 +124,89 @@ static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node)
         return rb_entry(node, struct mnt_namespace, mnt_ns_tree_node);
 }
 
-static bool mnt_ns_less(struct rb_node *a, const struct rb_node *b)
+static int mnt_ns_cmp(struct rb_node *a, const struct rb_node *b)
 {
         struct mnt_namespace *ns_a = node_to_mnt_ns(a);
         struct mnt_namespace *ns_b = node_to_mnt_ns(b);
         u64 seq_a = ns_a->seq;
+        u64 seq_b = ns_b->seq;
 
-        return mnt_ns_cmp(seq_a, ns_b) < 0;
+        if (seq_a < seq_b)
+                return -1;
+        if (seq_a > seq_b)
+                return 1;
+        return 0;
+}
+
+static inline void mnt_ns_tree_write_lock(void)
+{
+        write_seqlock(&mnt_ns_tree_lock);
+}
+
+static inline void mnt_ns_tree_write_unlock(void)
+{
+        write_sequnlock(&mnt_ns_tree_lock);
 }
 
 static void mnt_ns_tree_add(struct mnt_namespace *ns)
 {
-        guard(write_lock)(&mnt_ns_tree_lock);
-        rb_add(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_less);
+        struct rb_node *node, *prev;
+
+        mnt_ns_tree_write_lock();
+        node = rb_find_add_rcu(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_cmp);
+        /*
+         * If there's no previous entry, simply add it after the
+         * head; if there is, add it after the previous entry.
+         */
+        prev = rb_prev(&ns->mnt_ns_tree_node);
+        if (!prev)
+                list_add_rcu(&ns->mnt_ns_list, &mnt_ns_list);
+        else
+                list_add_rcu(&ns->mnt_ns_list, &node_to_mnt_ns(prev)->mnt_ns_list);
+        mnt_ns_tree_write_unlock();
+
+        WARN_ON_ONCE(node);
 }
 
 static void mnt_ns_release(struct mnt_namespace *ns)
 {
-        lockdep_assert_not_held(&mnt_ns_tree_lock);
-
         /* keep alive for {list,stat}mount() */
         if (refcount_dec_and_test(&ns->passive)) {
+                fsnotify_mntns_delete(ns);
                 put_user_ns(ns->user_ns);
                 kfree(ns);
         }
 }
 DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, if (_T) mnt_ns_release(_T))
 
+static void mnt_ns_release_rcu(struct rcu_head *rcu)
+{
+        mnt_ns_release(container_of(rcu, struct mnt_namespace, mnt_ns_rcu));
+}
+
 static void mnt_ns_tree_remove(struct mnt_namespace *ns)
 {
         /* remove from global mount namespace list */
         if (!is_anon_ns(ns)) {
-                guard(write_lock)(&mnt_ns_tree_lock);
+                mnt_ns_tree_write_lock();
                 rb_erase(&ns->mnt_ns_tree_node, &mnt_ns_tree);
+                list_bidir_del_rcu(&ns->mnt_ns_list);
+                mnt_ns_tree_write_unlock();
         }
 
-        mnt_ns_release(ns);
+        call_rcu(&ns->mnt_ns_rcu, mnt_ns_release_rcu);
 }
 
-/*
- * Returns the mount namespace which either has the specified id, or has the
- * next smallest id after the specified one.
- */
-static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id)
+static int mnt_ns_find(const void *key, const struct rb_node *node)
 {
-        struct rb_node *node = mnt_ns_tree.rb_node;
-        struct mnt_namespace *ret = NULL;
+        const u64 mnt_ns_id = *(u64 *)key;
+        const struct mnt_namespace *ns = node_to_mnt_ns(node);
 
-        lockdep_assert_held(&mnt_ns_tree_lock);
-
-        while (node) {
-                struct mnt_namespace *n = node_to_mnt_ns(node);
-
-                if (mnt_ns_id <= n->seq) {
-                        ret = node_to_mnt_ns(node);
-                        if (mnt_ns_id == n->seq)
-                                break;
-                        node = node->rb_left;
-                } else {
-                        node = node->rb_right;
-                }
-        }
-        return ret;
+        if (mnt_ns_id < ns->seq)
+                return -1;
+        if (mnt_ns_id > ns->seq)
+                return 1;
+        return 0;
 }
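Several changes in this patch lean on the scope-based cleanup helpers from <linux/cleanup.h>: DEFINE_FREE() above, plus __free(), no_free_ptr(), guard() and scoped_guard() further down. For readers unfamiliar with the idiom, a small sketch; struct foo and its helpers are invented for illustration:

    #include <linux/cleanup.h>
    #include <linux/slab.h>

    struct foo {
            int val;
    };

    static void foo_free(struct foo *f)
    {
            kfree(f);
    }
    DEFINE_FREE(foo_free, struct foo *, if (_T) foo_free(_T))

    static struct foo *foo_create(int val)
    {
            struct foo *f __free(foo_free) = kzalloc(sizeof(*f), GFP_KERNEL);

            if (!f)
                    return NULL;
            if (val < 0)
                    return NULL;            /* foo_free() runs automatically */
            f->val = val;
            return no_free_ptr(f);          /* success: suppress the cleanup */
    }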
@@ -195,18 +216,37 @@ static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id)
  * namespace the @namespace_sem must first be acquired. If the namespace has
  * already shut down before acquiring @namespace_sem, {list,stat}mount() will
  * see that the mount rbtree of the namespace is empty.
+ *
+ * Note the lookup is lockless, protected by a sequence counter. We only
+ * need to guard against false negatives as false positives aren't
+ * possible. So if we didn't find a mount namespace and the sequence
+ * counter has changed we need to retry. If the sequence counter is
+ * still the same we know the search actually failed.
  */
 static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id)
 {
-        struct mnt_namespace *ns;
+        struct mnt_namespace *ns;
+        struct rb_node *node;
+        unsigned int seq;
 
-        guard(read_lock)(&mnt_ns_tree_lock);
-        ns = mnt_ns_find_id_at(mnt_ns_id);
-        if (!ns || ns->seq != mnt_ns_id)
-                return NULL;
+        guard(rcu)();
+        do {
+                seq = read_seqbegin(&mnt_ns_tree_lock);
+                node = rb_find_rcu(&mnt_ns_id, &mnt_ns_tree, mnt_ns_find);
+                if (node)
+                        break;
+        } while (read_seqretry(&mnt_ns_tree_lock, seq));
 
-        refcount_inc(&ns->passive);
-        return ns;
+        if (!node)
+                return NULL;
+
+        /*
+         * The last reference count is put with RCU delay so we can
+         * unconditionally acquire a reference here.
+         */
+        ns = node_to_mnt_ns(node);
+        refcount_inc(&ns->passive);
+        return ns;
 }
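The retry loop in lookup_mnt_ns() is the seqcount read pattern specialized to the observation documented above: hits can be trusted, only misses must be revalidated. A condensed sketch of the same reader shape; struct obj and obj_find_rcu() are hypothetical:

    static DEFINE_SEQLOCK(table_lock);

    static struct obj *obj_lookup(u64 id)
    {
            struct obj *o;
            unsigned int seq;

            guard(rcu)();
            do {
                    seq = read_seqbegin(&table_lock);
                    o = obj_find_rcu(id);   /* hypothetical RCU-safe search */
                    if (o)
                            break;          /* hits never need validation */
            } while (read_seqretry(&table_lock, seq));

            if (!o)
                    return NULL;
            /* freeing is RCU-delayed, so taking a reference here is safe */
            refcount_inc(&o->ref);
            return o;
    }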
@@ -236,18 +276,19 @@ static inline struct hlist_head *mp_hash(struct dentry *dentry)
 
 static int mnt_alloc_id(struct mount *mnt)
 {
-        int res = ida_alloc(&mnt_id_ida, GFP_KERNEL);
+        int res;
 
-        if (res < 0)
-                return res;
-        mnt->mnt_id = res;
-        mnt->mnt_id_unique = atomic64_inc_return(&mnt_id_ctr);
-        return 0;
+        xa_lock(&mnt_id_xa);
+        res = __xa_alloc(&mnt_id_xa, &mnt->mnt_id, mnt, XA_LIMIT(1, INT_MAX), GFP_KERNEL);
+        if (!res)
+                mnt->mnt_id_unique = ++mnt_id_ctr;
+        xa_unlock(&mnt_id_xa);
+        return res;
 }
 
 static void mnt_free_id(struct mount *mnt)
 {
-        ida_free(&mnt_id_ida, mnt->mnt_id);
+        xa_erase(&mnt_id_xa, mnt->mnt_id);
 }
 
 /*
@@ -315,12 +356,13 @@ static struct mount *alloc_vfsmnt(const char *name)
         if (err)
                 goto out_free_cache;
 
-        if (name) {
+        if (name)
                 mnt->mnt_devname = kstrdup_const(name, GFP_KERNEL_ACCOUNT);
-                if (!mnt->mnt_devname)
-                        goto out_free_id;
-        }
+        else
+                mnt->mnt_devname = "none";
+        if (!mnt->mnt_devname)
+                goto out_free_id;
 
 #ifdef CONFIG_SMP
         mnt->mnt_pcp = alloc_percpu(struct mnt_pcp);
@@ -339,11 +381,11 @@ static struct mount *alloc_vfsmnt(const char *name)
                 INIT_LIST_HEAD(&mnt->mnt_list);
                 INIT_LIST_HEAD(&mnt->mnt_expire);
                 INIT_LIST_HEAD(&mnt->mnt_share);
-                INIT_LIST_HEAD(&mnt->mnt_slave_list);
-                INIT_LIST_HEAD(&mnt->mnt_slave);
+                INIT_HLIST_HEAD(&mnt->mnt_slave_list);
+                INIT_HLIST_NODE(&mnt->mnt_slave);
                 INIT_HLIST_NODE(&mnt->mnt_mp_list);
-                INIT_LIST_HEAD(&mnt->mnt_umounting);
                 INIT_HLIST_HEAD(&mnt->mnt_stuck_children);
+                RB_CLEAR_NODE(&mnt->mnt_node);
                 mnt->mnt.mnt_idmap = &nop_mnt_idmap;
         }
         return mnt;
@@ -746,15 +788,11 @@ int __legitimize_mnt(struct vfsmount *bastard, unsigned seq)
                 return 0;
         mnt = real_mount(bastard);
         mnt_add_count(mnt, 1);
-        smp_mb();                       // see mntput_no_expire()
+        smp_mb();                       // see mntput_no_expire() and do_umount()
         if (likely(!read_seqretry(&mount_lock, seq)))
                 return 0;
-        if (bastard->mnt_flags & MNT_SYNC_UMOUNT) {
-                mnt_add_count(mnt, -1);
-                return 1;
-        }
         lock_mount_hash();
-        if (unlikely(bastard->mnt_flags & MNT_DOOMED)) {
+        if (unlikely(bastard->mnt_flags & (MNT_SYNC_UMOUNT | MNT_DOOMED))) {
                 mnt_add_count(mnt, -1);
                 unlock_mount_hash();
                 return 1;
@@ -856,7 +894,7 @@ struct vfsmount *lookup_mnt(const struct path *path)
  * namespace not just a mount that happens to have some specified
  * parent mount.
  */
-bool __is_local_mountpoint(struct dentry *dentry)
+bool __is_local_mountpoint(const struct dentry *dentry)
 {
         struct mnt_namespace *ns = current->nsproxy->mnt_ns;
         struct mount *mnt, *n;
@@ -873,42 +911,48 @@ bool __is_local_mountpoint(struct dentry *dentry)
         return is_covered;
 }
 
-static struct mountpoint *lookup_mountpoint(struct dentry *dentry)
+struct pinned_mountpoint {
+        struct hlist_node node;
+        struct mountpoint *mp;
+};
+
+static bool lookup_mountpoint(struct dentry *dentry, struct pinned_mountpoint *m)
 {
         struct hlist_head *chain = mp_hash(dentry);
         struct mountpoint *mp;
 
         hlist_for_each_entry(mp, chain, m_hash) {
                 if (mp->m_dentry == dentry) {
-                        mp->m_count++;
-                        return mp;
+                        hlist_add_head(&m->node, &mp->m_list);
+                        m->mp = mp;
+                        return true;
                 }
         }
-        return NULL;
+        return false;
 }
 
-static struct mountpoint *get_mountpoint(struct dentry *dentry)
+static int get_mountpoint(struct dentry *dentry, struct pinned_mountpoint *m)
 {
-        struct mountpoint *mp, *new = NULL;
+        struct mountpoint *mp __free(kfree) = NULL;
+        bool found;
         int ret;
 
         if (d_mountpoint(dentry)) {
                 /* might be worth a WARN_ON() */
                 if (d_unlinked(dentry))
-                        return ERR_PTR(-ENOENT);
+                        return -ENOENT;
 mountpoint:
                 read_seqlock_excl(&mount_lock);
-                mp = lookup_mountpoint(dentry);
+                found = lookup_mountpoint(dentry, m);
                 read_sequnlock_excl(&mount_lock);
-                if (mp)
-                        goto done;
+                if (found)
+                        return 0;
         }
 
-        if (!new)
-                new = kmalloc(sizeof(struct mountpoint), GFP_KERNEL);
-        if (!new)
-                return ERR_PTR(-ENOMEM);
-
+        if (!mp)
+                mp = kmalloc(sizeof(struct mountpoint), GFP_KERNEL);
+        if (!mp)
+                return -ENOMEM;
 
         /* Exactly one process may set d_mounted */
         ret = d_set_mounted(dentry);
@@ -918,34 +962,28 @@ mountpoint:
                 goto mountpoint;
 
         /* The dentry is not available as a mountpoint? */
-        mp = ERR_PTR(ret);
         if (ret)
-                goto done;
+                return ret;
 
         /* Add the new mountpoint to the hash table */
         read_seqlock_excl(&mount_lock);
-        new->m_dentry = dget(dentry);
-        new->m_count = 1;
-        hlist_add_head(&new->m_hash, mp_hash(dentry));
-        INIT_HLIST_HEAD(&new->m_list);
+        mp->m_dentry = dget(dentry);
+        hlist_add_head(&mp->m_hash, mp_hash(dentry));
+        INIT_HLIST_HEAD(&mp->m_list);
+        hlist_add_head(&m->node, &mp->m_list);
+        m->mp = no_free_ptr(mp);
         read_sequnlock_excl(&mount_lock);
-
-        mp = new;
-        new = NULL;
-done:
-        kfree(new);
-        return mp;
+        return 0;
 }
 
 /*
  * vfsmount lock must be held. Additionally, the caller is responsible
  * for serializing calls for given disposal list.
  */
-static void __put_mountpoint(struct mountpoint *mp, struct list_head *list)
+static void maybe_free_mountpoint(struct mountpoint *mp, struct list_head *list)
 {
-        if (!--mp->m_count) {
+        if (hlist_empty(&mp->m_list)) {
                 struct dentry *dentry = mp->m_dentry;
-                BUG_ON(!hlist_empty(&mp->m_list));
                 spin_lock(&dentry->d_lock);
                 dentry->d_flags &= ~DCACHE_MOUNTED;
                 spin_unlock(&dentry->d_lock);
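Note the bookkeeping change in the hunks above: a mountpoint no longer carries an m_count; it stays alive exactly as long as its m_list is non-empty, and a caller pins it by threading a stack-allocated pinned_mountpoint into that list. Reduced to its essentials, the scheme looks like this (names hypothetical):

    struct pinned_obj {
            struct hlist_node node;
            struct obj *o;
    };

    static void pin_obj(struct obj *o, struct pinned_obj *p)
    {
            /* membership on o->holders *is* the reference; no counter */
            hlist_add_head(&p->node, &o->holders);
            p->o = o;
    }

    static void unpin_obj(struct pinned_obj *p)
    {
            if (p->o) {
                    hlist_del(&p->node);
                    if (hlist_empty(&p->o->holders))
                            obj_free(p->o); /* the last unpin reaps the object */
            }
    }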
@@ -955,10 +993,15 @@ static void __put_mountpoint(struct mountpoint *mp, struct list_head *list)
         }
 }
 
-/* called with namespace_lock and vfsmount lock */
-static void put_mountpoint(struct mountpoint *mp)
+/*
+ * locks: mount_lock [read_seqlock_excl], namespace_sem [excl]
+ */
+static void unpin_mountpoint(struct pinned_mountpoint *m)
 {
-        __put_mountpoint(mp, &ex_mountpoints);
+        if (m->mp) {
+                hlist_del(&m->node);
+                maybe_free_mountpoint(m->mp, &ex_mountpoints);
+        }
 }
 
 static inline int check_mnt(struct mount *mnt)
@@ -966,6 +1009,17 @@ static inline int check_mnt(struct mount *mnt)
         return mnt->mnt_ns == current->nsproxy->mnt_ns;
 }
 
+static inline bool check_anonymous_mnt(struct mount *mnt)
+{
+        u64 seq;
+
+        if (!is_anon_ns(mnt->mnt_ns))
+                return false;
+
+        seq = mnt->mnt_ns->seq_origin;
+        return !seq || (seq == current->nsproxy->mnt_ns->seq);
+}
+
 /*
  * vfsmount lock must be held for write
  */
@@ -989,11 +1043,14 @@ static void __touch_mnt_namespace(struct mnt_namespace *ns)
 }
 
 /*
- * vfsmount lock must be held for write
+ * locks: mount_lock[write_seqlock]
 */
-static struct mountpoint *unhash_mnt(struct mount *mnt)
+static void __umount_mnt(struct mount *mnt, struct list_head *shrink_list)
 {
         struct mountpoint *mp;
+        struct mount *parent = mnt->mnt_parent;
+        if (unlikely(parent->overmount == mnt))
+                parent->overmount = NULL;
         mnt->mnt_parent = mnt;
         mnt->mnt_mountpoint = mnt->mnt.mnt_root;
         list_del_init(&mnt->mnt_child);
@@ -1001,15 +1058,15 @@ static struct mountpoint *unhash_mnt(struct mount *mnt)
         hlist_del_init(&mnt->mnt_mp_list);
         mp = mnt->mnt_mp;
         mnt->mnt_mp = NULL;
-        return mp;
+        maybe_free_mountpoint(mp, shrink_list);
 }
 
 /*
- * vfsmount lock must be held for write
+ * locks: mount_lock[write_seqlock], namespace_sem[excl] (for ex_mountpoints)
 */
 static void umount_mnt(struct mount *mnt)
 {
-        put_mountpoint(unhash_mnt(mnt));
+        __umount_mnt(mnt, &ex_mountpoints);
 }
 
 /*
@@ -1019,43 +1076,17 @@ void mnt_set_mountpoint(struct mount *mnt,
                         struct mountpoint *mp,
                         struct mount *child_mnt)
 {
-        mp->m_count++;
-        mnt_add_count(mnt, 1);  /* essentially, that's mntget */
         child_mnt->mnt_mountpoint = mp->m_dentry;
         child_mnt->mnt_parent = mnt;
         child_mnt->mnt_mp = mp;
         hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list);
 }
 
-/**
- * mnt_set_mountpoint_beneath - mount a mount beneath another one
- *
- * @new_parent: the source mount
- * @top_mnt:    the mount beneath which @new_parent is mounted
- * @new_mp:     the new mountpoint of @top_mnt on @new_parent
- *
- * Remove @top_mnt from its current mountpoint @top_mnt->mnt_mp and
- * parent @top_mnt->mnt_parent and mount it on top of @new_parent at
- * @new_mp. And mount @new_parent on the old parent and old
- * mountpoint of @top_mnt.
- *
- * Context: This function expects namespace_lock() and lock_mount_hash()
- * to have been acquired in that order.
- */
-static void mnt_set_mountpoint_beneath(struct mount *new_parent,
-                                       struct mount *top_mnt,
-                                       struct mountpoint *new_mp)
-{
-        struct mount *old_top_parent = top_mnt->mnt_parent;
-        struct mountpoint *old_top_mp = top_mnt->mnt_mp;
-
-        mnt_set_mountpoint(old_top_parent, old_top_mp, new_parent);
-        mnt_change_mountpoint(new_parent, new_mp, top_mnt);
-}
-
-
-static void __attach_mnt(struct mount *mnt, struct mount *parent)
+static void make_visible(struct mount *mnt)
 {
+        struct mount *parent = mnt->mnt_parent;
+        if (unlikely(mnt->mnt_mountpoint == parent->mnt.mnt_root))
+                parent->overmount = mnt;
         hlist_add_head_rcu(&mnt->mnt_hash,
                            m_hash(&parent->mnt, mnt->mnt_mountpoint));
         list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
@@ -1067,51 +1098,34 @@ static void __attach_mnt(struct mount *mnt, struct mount *parent)
  * @parent:  the parent
  * @mnt:     the new mount
  * @mp:      the new mountpoint
- * @beneath: whether to mount @mnt beneath or on top of @parent
  *
- * If @beneath is false, mount @mnt at @mp on @parent. Then attach @mnt
+ * Mount @mnt at @mp on @parent. Then attach @mnt
  * to @parent's child mount list and to @mount_hashtable.
 *
- * If @beneath is true, remove @mnt from its current parent and
- * mountpoint and mount it on @mp on @parent, and mount @parent on the
- * old parent and old mountpoint of @mnt. Finally, attach @parent to
- * @mnt_hashtable and @parent->mnt_parent->mnt_mounts.
- *
- * Note, when __attach_mnt() is called @mnt->mnt_parent already points
+ * Note, when make_visible() is called @mnt->mnt_parent already points
  * to the correct parent.
 *
 * Context: This function expects namespace_lock() and lock_mount_hash()
 * to have been acquired in that order.
 */
 static void attach_mnt(struct mount *mnt, struct mount *parent,
-                       struct mountpoint *mp, bool beneath)
+                       struct mountpoint *mp)
 {
-        if (beneath)
-                mnt_set_mountpoint_beneath(mnt, parent, mp);
-        else
-                mnt_set_mountpoint(parent, mp, mnt);
-        /*
-         * Note, @mnt->mnt_parent has to be used. If @mnt was mounted
-         * beneath @parent then @mnt will need to be attached to
-         * @parent's old parent, not @parent. IOW, @mnt->mnt_parent
-         * isn't the same mount as @parent.
-         */
-        __attach_mnt(mnt, mnt->mnt_parent);
+        mnt_set_mountpoint(parent, mp, mnt);
+        make_visible(mnt);
 }
 
 void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt)
 {
         struct mountpoint *old_mp = mnt->mnt_mp;
-        struct mount *old_parent = mnt->mnt_parent;
 
         list_del_init(&mnt->mnt_child);
         hlist_del_init(&mnt->mnt_mp_list);
         hlist_del_init_rcu(&mnt->mnt_hash);
 
-        attach_mnt(mnt, parent, mp, false);
+        attach_mnt(mnt, parent, mp);
 
-        put_mountpoint(old_mp);
-        mnt_add_count(old_parent, -1);
+        maybe_free_mountpoint(old_mp, &ex_mountpoints);
 }
 
 static inline struct mount *node_to_mount(struct rb_node *node)
@@ -1123,45 +1137,29 @@ static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt)
 {
         struct rb_node **link = &ns->mounts.rb_node;
         struct rb_node *parent = NULL;
+        bool mnt_first_node = true, mnt_last_node = true;
 
-        WARN_ON(mnt->mnt.mnt_flags & MNT_ONRB);
+        WARN_ON(mnt_ns_attached(mnt));
         mnt->mnt_ns = ns;
         while (*link) {
                 parent = *link;
-                if (mnt->mnt_id_unique < node_to_mount(parent)->mnt_id_unique)
+                if (mnt->mnt_id_unique < node_to_mount(parent)->mnt_id_unique) {
                         link = &parent->rb_left;
-                else
+                        mnt_last_node = false;
+                } else {
                         link = &parent->rb_right;
+                        mnt_first_node = false;
+                }
         }
+
+        if (mnt_last_node)
+                ns->mnt_last_node = &mnt->mnt_node;
+        if (mnt_first_node)
+                ns->mnt_first_node = &mnt->mnt_node;
         rb_link_node(&mnt->mnt_node, parent, link);
         rb_insert_color(&mnt->mnt_node, &ns->mounts);
-        mnt->mnt.mnt_flags |= MNT_ONRB;
-}
 
-/*
- * vfsmount lock must be held for write
- */
-static void commit_tree(struct mount *mnt)
-{
-        struct mount *parent = mnt->mnt_parent;
-        struct mount *m;
-        LIST_HEAD(head);
-        struct mnt_namespace *n = parent->mnt_ns;
-
-        BUG_ON(parent == mnt);
-
-        list_add_tail(&head, &mnt->mnt_list);
-        while (!list_empty(&head)) {
-                m = list_first_entry(&head, typeof(*m), mnt_list);
-                list_del(&m->mnt_list);
-
-                mnt_add_to_ns(n, m);
-        }
-        n->nr_mounts += n->pending_mounts;
-        n->pending_mounts = 0;
-
-        __attach_mnt(mnt, parent);
-        touch_mnt_namespace(n);
+        mnt_notify_add(mnt);
 }
 
 static struct mount *next_mnt(struct mount *p, struct mount *root)
@@ -1190,6 +1188,27 @@ static struct mount *skip_mnt_tree(struct mount *p)
         return p;
 }
 
+/*
+ * vfsmount lock must be held for write
+ */
+static void commit_tree(struct mount *mnt)
+{
+        struct mnt_namespace *n = mnt->mnt_parent->mnt_ns;
+
+        if (!mnt_ns_attached(mnt)) {
+                for (struct mount *m = mnt; m; m = next_mnt(m, mnt))
+                        if (unlikely(mnt_ns_attached(m)))
+                                m = skip_mnt_tree(m);
+                        else
+                                mnt_add_to_ns(n, m);
+                n->nr_mounts += n->pending_mounts;
+                n->pending_mounts = 0;
+        }
+
+        make_visible(mnt);
+        touch_mnt_namespace(n);
+}
+
 /**
  * vfs_create_mount - Create a mount for a configured superblock
  * @fc: The configuration context with the superblock attached
@@ -1206,7 +1225,7 @@ struct vfsmount *vfs_create_mount(struct fs_context *fc)
         if (!fc->root)
                 return ERR_PTR(-EINVAL);
 
-        mnt = alloc_vfsmnt(fc->source ?: "none");
+        mnt = alloc_vfsmnt(fc->source);
         if (!mnt)
                 return ERR_PTR(-ENOMEM);
 
@@ -1237,6 +1256,15 @@ struct vfsmount *fc_mount(struct fs_context *fc)
 }
 EXPORT_SYMBOL(fc_mount);
 
+struct vfsmount *fc_mount_longterm(struct fs_context *fc)
+{
+        struct vfsmount *mnt = fc_mount(fc);
+        if (!IS_ERR(mnt))
+                real_mount(mnt)->mnt_ns = MNT_NS_INTERNAL;
+        return mnt;
+}
+EXPORT_SYMBOL(fc_mount_longterm);
+
 struct vfsmount *vfs_kern_mount(struct file_system_type *type,
                                 int flags, const char *name,
                                 void *data)
@@ -1267,21 +1295,6 @@ struct vfsmount *vfs_kern_mount(struct file_system_type *type,
 }
 EXPORT_SYMBOL_GPL(vfs_kern_mount);
 
-struct vfsmount *
-vfs_submount(const struct dentry *mountpoint, struct file_system_type *type,
-             const char *name, void *data)
-{
-        /* Until it is worked out how to pass the user namespace
-         * through from the parent mount to the submount don't support
-         * unprivileged mounts with submounts.
-         */
-        if (mountpoint->d_sb->s_user_ns != &init_user_ns)
-                return ERR_PTR(-EPERM);
-
-        return vfs_kern_mount(type, SB_SUBMOUNT, name, data);
-}
-EXPORT_SYMBOL_GPL(vfs_submount);
-
 static struct mount *clone_mnt(struct mount *old, struct dentry *root,
                                         int flag)
 {
@@ -1293,7 +1306,10 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
         if (!mnt)
                 return ERR_PTR(-ENOMEM);
 
-        if (flag & (CL_SLAVE | CL_PRIVATE | CL_SHARED_TO_SLAVE))
+        mnt->mnt.mnt_flags = READ_ONCE(old->mnt.mnt_flags) &
+                                ~MNT_INTERNAL_FLAGS;
+
+        if (flag & (CL_SLAVE | CL_PRIVATE))
                 mnt->mnt_group_id = 0; /* not a peer of original */
         else
                 mnt->mnt_group_id = old->mnt_group_id;
@@ -1304,8 +1320,8 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
                 goto out_free;
         }
 
-        mnt->mnt.mnt_flags = old->mnt.mnt_flags;
-        mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL|MNT_ONRB);
+        if (mnt->mnt_group_id)
+                set_mnt_shared(mnt);
 
         atomic_inc(&sb->s_active);
         mnt->mnt.mnt_idmap = mnt_idmap_get(mnt_idmap(&old->mnt));
@@ -1318,30 +1334,19 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
         list_add_tail(&mnt->mnt_instance, &sb->s_mounts);
         unlock_mount_hash();
 
-        if ((flag & CL_SLAVE) ||
-            ((flag & CL_SHARED_TO_SLAVE) && IS_MNT_SHARED(old))) {
-                list_add(&mnt->mnt_slave, &old->mnt_slave_list);
+        if (flag & CL_PRIVATE)  // we are done with it
+                return mnt;
+
+        if (peers(mnt, old))
+                list_add(&mnt->mnt_share, &old->mnt_share);
+
+        if ((flag & CL_SLAVE) && old->mnt_group_id) {
+                hlist_add_head(&mnt->mnt_slave, &old->mnt_slave_list);
                 mnt->mnt_master = old;
-                CLEAR_MNT_SHARED(mnt);
-        } else if (!(flag & CL_PRIVATE)) {
-                if ((flag & CL_MAKE_SHARED) || IS_MNT_SHARED(old))
-                        list_add(&mnt->mnt_share, &old->mnt_share);
-                if (IS_MNT_SLAVE(old))
-                        list_add(&mnt->mnt_slave, &old->mnt_slave);
+        } else if (IS_MNT_SLAVE(old)) {
+                hlist_add_behind(&mnt->mnt_slave, &old->mnt_slave);
                 mnt->mnt_master = old->mnt_master;
-        } else {
-                CLEAR_MNT_SHARED(mnt);
-        }
-        if (flag & CL_MAKE_SHARED)
-                set_mnt_shared(mnt);
-
-        /* stick the duplicate mount on the same expiry list
-         * as the original if that was on one */
-        if (flag & CL_EXPIRE) {
-                if (!list_empty(&old->mnt_expire))
-                        list_add(&mnt->mnt_expire, &old->mnt_expire);
         }
-
         return mnt;
 
 out_free:
@@ -1434,11 +1439,13 @@ static void mntput_no_expire(struct mount *mnt)
         rcu_read_unlock();
 
         list_del(&mnt->mnt_instance);
+        if (unlikely(!list_empty(&mnt->mnt_expire)))
+                list_del(&mnt->mnt_expire);
 
         if (unlikely(!list_empty(&mnt->mnt_mounts))) {
                 struct mount *p, *tmp;
                 list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts, mnt_child) {
-                        __put_mountpoint(unhash_mnt(p), &list);
+                        __umount_mnt(p, &list);
                         hlist_add_head(&p->mnt_umount, &mnt->mnt_stuck_children);
                 }
         }
@@ -1635,23 +1642,19 @@ const struct seq_operations mounts_op = {
 int may_umount_tree(struct vfsmount *m)
 {
         struct mount *mnt = real_mount(m);
-        int actual_refs = 0;
-        int minimum_refs = 0;
-        struct mount *p;
-        BUG_ON(!m);
+        bool busy = false;
 
         /* write lock needed for mnt_get_count */
         lock_mount_hash();
-        for (p = mnt; p; p = next_mnt(p, mnt)) {
-                actual_refs += mnt_get_count(p);
-                minimum_refs += 2;
+        for (struct mount *p = mnt; p; p = next_mnt(p, mnt)) {
+                if (mnt_get_count(p) > (p == mnt ? 2 : 1)) {
+                        busy = true;
+                        break;
+                }
         }
         unlock_mount_hash();
 
-        if (actual_refs > minimum_refs)
-                return 0;
-
-        return 1;
+        return !busy;
 }
 
 EXPORT_SYMBOL(may_umount_tree);
@@ -1683,17 +1686,80 @@ int may_umount(struct vfsmount *mnt)
 
 EXPORT_SYMBOL(may_umount);
 
+#ifdef CONFIG_FSNOTIFY
+static void mnt_notify(struct mount *p)
+{
+        if (!p->prev_ns && p->mnt_ns) {
+                fsnotify_mnt_attach(p->mnt_ns, &p->mnt);
+        } else if (p->prev_ns && !p->mnt_ns) {
+                fsnotify_mnt_detach(p->prev_ns, &p->mnt);
+        } else if (p->prev_ns == p->mnt_ns) {
+                fsnotify_mnt_move(p->mnt_ns, &p->mnt);
+        } else {
+                fsnotify_mnt_detach(p->prev_ns, &p->mnt);
+                fsnotify_mnt_attach(p->mnt_ns, &p->mnt);
+        }
+        p->prev_ns = p->mnt_ns;
+}
+
+static void notify_mnt_list(void)
+{
+        struct mount *m, *tmp;
+        /*
+         * Notify about mounts that were added/reparented/detached/remain
+         * connected after unmount.
+         */
+        list_for_each_entry_safe(m, tmp, &notify_list, to_notify) {
+                mnt_notify(m);
+                list_del_init(&m->to_notify);
+        }
+}
+
+static bool need_notify_mnt_list(void)
+{
+        return !list_empty(&notify_list);
+}
+#else
+static void notify_mnt_list(void)
+{
+}
+
+static bool need_notify_mnt_list(void)
+{
+        return false;
+}
+#endif
+
+static void free_mnt_ns(struct mnt_namespace *);
 static void namespace_unlock(void)
 {
         struct hlist_head head;
         struct hlist_node *p;
         struct mount *m;
+        struct mnt_namespace *ns = emptied_ns;
         LIST_HEAD(list);
 
         hlist_move_list(&unmounted, &head);
         list_splice_init(&ex_mountpoints, &list);
+        emptied_ns = NULL;
 
-        up_write(&namespace_sem);
+        if (need_notify_mnt_list()) {
+                /*
+                 * No point blocking out concurrent readers while notifications
+                 * are sent. This will also allow statmount()/listmount() to run
+                 * concurrently.
+                 */
+                downgrade_write(&namespace_sem);
+                notify_mnt_list();
+                up_read(&namespace_sem);
+        } else {
+                up_write(&namespace_sem);
+        }
+
+        if (unlikely(ns)) {
+                /* Make sure we notice when we leak mounts. */
+                VFS_WARN_ON_ONCE(!mnt_ns_empty(ns));
+                free_mnt_ns(ns);
+        }
 
         shrink_dentry_list(&list);
 
@@ -1713,6 +1779,8 @@ static inline void namespace_lock(void)
         down_write(&namespace_sem);
 }
 
+DEFINE_GUARD(namespace_lock, struct rw_semaphore *, namespace_lock(), namespace_unlock())
+
 enum umount_tree_flags {
         UMOUNT_SYNC = 1,
         UMOUNT_PROPAGATE = 2,
@@ -1763,10 +1831,9 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
         /* Gather the mounts to umount */
         for (p = mnt; p; p = next_mnt(p, mnt)) {
                 p->mnt.mnt_flags |= MNT_UMOUNT;
-                if (p->mnt.mnt_flags & MNT_ONRB)
-                        move_from_ns(p, &tmp_list);
-                else
-                        list_move(&p->mnt_list, &tmp_list);
+                if (mnt_ns_attached(p))
+                        move_from_ns(p);
+                list_add_tail(&p->mnt_list, &tmp_list);
         }
 
         /* Hide the mounts from mnt_mounts */
@@ -1795,7 +1862,6 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
                 disconnect = disconnect_mount(p, how);
 
                 if (mnt_has_parent(p)) {
-                        mnt_add_count(p->mnt_parent, -1);
                         if (!disconnect) {
                                 /* Don't forget about p */
                                 list_add_tail(&p->mnt_child, &p->mnt_parent->mnt_mounts);
@@ -1806,6 +1872,19 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
                 change_mnt_propagation(p, MS_PRIVATE);
                 if (disconnect)
                         hlist_add_head(&p->mnt_umount, &unmounted);
+
+                /*
+                 * At this point p->mnt_ns is NULL, notification will be queued
+                 * only if
+                 *
+                 *  - p->prev_ns is non-NULL *and*
+                 *  - p->prev_ns->n_fsnotify_marks is non-NULL
+                 *
+                 * This will preclude queuing the mount if this is a cleanup
+                 * after a failed copy_tree() or destruction of an anonymous
+                 * namespace, etc.
+                 */
+                mnt_notify_add(p);
         }
 }
 
@@ -1859,7 +1938,7 @@ static int do_umount(struct mount *mnt, int flags)
                  * all race cases, but it's a slowpath.
                  */
                 lock_mount_hash();
-                if (mnt_get_count(mnt) != 2) {
+                if (!list_empty(&mnt->mnt_mounts) || mnt_get_count(mnt) != 2) {
                         unlock_mount_hash();
                         return -EBUSY;
                 }
@@ -1905,24 +1984,27 @@ static int do_umount(struct mount *mnt, int flags)
         namespace_lock();
         lock_mount_hash();
 
-        /* Recheck MNT_LOCKED with the locks held */
+        /* Repeat the earlier racy checks, now that we are holding the locks */
         retval = -EINVAL;
+        if (!check_mnt(mnt))
+                goto out;
+
         if (mnt->mnt.mnt_flags & MNT_LOCKED)
                 goto out;
 
+        if (!mnt_has_parent(mnt)) /* not the absolute root */
+                goto out;
+
         event++;
         if (flags & MNT_DETACH) {
-                if (mnt->mnt.mnt_flags & MNT_ONRB ||
-                    !list_empty(&mnt->mnt_list))
-                        umount_tree(mnt, UMOUNT_PROPAGATE);
+                umount_tree(mnt, UMOUNT_PROPAGATE);
                 retval = 0;
         } else {
+                smp_mb(); // paired with __legitimize_mnt()
                 shrink_submounts(mnt);
                 retval = -EBUSY;
                 if (!propagate_mount_busy(mnt, 2)) {
-                        if (mnt->mnt.mnt_flags & MNT_ONRB ||
-                            !list_empty(&mnt->mnt_list))
-                                umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
+                        umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
                         retval = 0;
                 }
         }
@@ -1940,29 +2022,28 @@ out:
  * detach_mounts allows lazily unmounting those mounts instead of
  * leaking them.
  *
- * The caller may hold dentry->d_inode->i_mutex.
+ * The caller may hold dentry->d_inode->i_rwsem.
 */
 void __detach_mounts(struct dentry *dentry)
 {
-        struct mountpoint *mp;
+        struct pinned_mountpoint mp = {};
         struct mount *mnt;
 
         namespace_lock();
         lock_mount_hash();
-        mp = lookup_mountpoint(dentry);
-        if (!mp)
+        if (!lookup_mountpoint(dentry, &mp))
                 goto out_unlock;
 
         event++;
-        while (!hlist_empty(&mp->m_list)) {
-                mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list);
+        while (mp.node.next) {
+                mnt = hlist_entry(mp.node.next, struct mount, mnt_mp_list);
                 if (mnt->mnt.mnt_flags & MNT_UMOUNT) {
                         umount_mnt(mnt);
                         hlist_add_head(&mnt->mnt_umount, &unmounted);
                 } else
                         umount_tree(mnt, UMOUNT_CONNECTED);
         }
-        put_mountpoint(mp);
+        unpin_mountpoint(&mp);
 out_unlock:
         unlock_mount_hash();
         namespace_unlock();
@@ -1988,6 +2069,7 @@ static void warn_mandlock(void)
 static int can_umount(const struct path *path, int flags)
 {
         struct mount *mnt = real_mount(path->mnt);
+        struct super_block *sb = path->dentry->d_sb;
 
         if (!may_mount())
                 return -EPERM;
@@ -1997,7 +2079,7 @@ static int can_umount(const struct path *path, int flags)
                 return -EINVAL;
         if (mnt->mnt.mnt_flags & MNT_LOCKED) /* Check optimistically */
                 return -EINVAL;
-        if (flags & MNT_FORCE && !capable(CAP_SYS_ADMIN))
+        if (flags & MNT_FORCE && !ns_capable(sb->s_user_ns, CAP_SYS_ADMIN))
                 return -EPERM;
         return 0;
 }
@@ -2055,9 +2137,15 @@ SYSCALL_DEFINE1(oldumount, char __user *, name)
 
 static bool is_mnt_ns_file(struct dentry *dentry)
 {
+        struct ns_common *ns;
+
         /* Is this a proxy for a mount namespace?
         */
-        return dentry->d_op == &ns_dentry_operations &&
-                dentry->d_fsdata == &mntns_operations;
+        if (dentry->d_op != &ns_dentry_operations)
+                return false;
+
+        ns = d_inode(dentry)->i_private;
+
+        return ns->ops == &mntns_operations;
 }
 
@@ -2065,30 +2153,34 @@ struct ns_common *from_mnt_ns(struct mnt_namespace *mnt)
 {
         return &mnt->ns;
 }
 
-struct mnt_namespace *__lookup_next_mnt_ns(struct mnt_namespace *mntns, bool previous)
+struct mnt_namespace *get_sequential_mnt_ns(struct mnt_namespace *mntns, bool previous)
 {
-        guard(read_lock)(&mnt_ns_tree_lock);
+        guard(rcu)();
+
         for (;;) {
-                struct rb_node *node;
+                struct list_head *list;
 
                 if (previous)
-                        node = rb_prev(&mntns->mnt_ns_tree_node);
+                        list = rcu_dereference(list_bidir_prev_rcu(&mntns->mnt_ns_list));
                 else
-                        node = rb_next(&mntns->mnt_ns_tree_node);
-                if (!node)
+                        list = rcu_dereference(list_next_rcu(&mntns->mnt_ns_list));
+                if (list_is_head(list, &mnt_ns_list))
                         return ERR_PTR(-ENOENT);
 
-                mntns = node_to_mnt_ns(node);
-                node = &mntns->mnt_ns_tree_node;
+                mntns = list_entry_rcu(list, struct mnt_namespace, mnt_ns_list);
+
+                /*
+                 * The last passive reference count is put with RCU
+                 * delay so accessing the mount namespace is not just
+                 * safe but all relevant members are still valid.
+                 */
                 if (!ns_capable_noaudit(mntns->user_ns, CAP_SYS_ADMIN))
                         continue;
 
                 /*
-                 * Holding mnt_ns_tree_lock prevents the mount namespace from
-                 * being freed but it may well be on its deathbed. We want an
-                 * active reference, not just a passive one here as we're
-                 * persisting the mount namespace.
+                 * We need an active reference count as we're persisting
+                 * the mount namespace and it might already be on its
+                 * deathbed.
                  */
                 if (!refcount_inc_not_zero(&mntns->ns.count))
                         continue;
@@ -2097,16 +2189,24 @@ struct mnt_namespace *__lookup_next_mnt_ns(struct mnt_namespace *mntns, bool previous)
         }
 }
 
+struct mnt_namespace *mnt_ns_from_dentry(struct dentry *dentry)
+{
+        if (!is_mnt_ns_file(dentry))
+                return NULL;
+
+        return to_mnt_ns(get_proc_ns(dentry->d_inode));
+}
+
 static bool mnt_ns_loop(struct dentry *dentry)
 {
         /* Could bind mounting the mount namespace inode cause a
          * mount namespace loop?
         */
-        struct mnt_namespace *mnt_ns;
-        if (!is_mnt_ns_file(dentry))
+        struct mnt_namespace *mnt_ns = mnt_ns_from_dentry(dentry);
+
+        if (!mnt_ns)
                 return false;
 
-        mnt_ns = to_mnt_ns(get_proc_ns(dentry->d_inode));
         return current->nsproxy->mnt_ns->seq >= mnt_ns->seq;
 }
 
@@ -2127,7 +2227,6 @@ struct mount *copy_tree(struct mount *src_root, struct dentry *dentry,
                 return dst_mnt;
 
         src_parent = src_root;
-        dst_mnt->mnt_mountpoint = src_root->mnt_mountpoint;
 
         list_for_each_entry(src_root_child, &src_root->mnt_mounts, mnt_child) {
                 if (!is_subdir(src_root_child->mnt_mountpoint, dentry))
@@ -2162,8 +2261,16 @@ struct mount *copy_tree(struct mount *src_root, struct dentry *dentry,
                         if (IS_ERR(dst_mnt))
                                 goto out;
                         lock_mount_hash();
-                        list_add_tail(&dst_mnt->mnt_list, &res->mnt_list);
-                        attach_mnt(dst_mnt, dst_parent, src_parent->mnt_mp, false);
+                        if (src_mnt->mnt.mnt_flags & MNT_LOCKED)
+                                dst_mnt->mnt.mnt_flags |= MNT_LOCKED;
+                        if (unlikely(flag & CL_EXPIRE)) {
+                                /* stick the duplicate mount on the same expiry
+                                 * list as the original if that was on one */
+                                if (!list_empty(&src_mnt->mnt_expire))
+                                        list_add(&dst_mnt->mnt_expire,
+                                                 &src_mnt->mnt_expire);
+                        }
+                        attach_mnt(dst_mnt, dst_parent, src_parent->mnt_mp);
                         unlock_mount_hash();
                 }
         }
@@ -2178,54 +2285,97 @@ out:
         return dst_mnt;
 }
 
-/* Caller should check returned pointer for errors */
+static inline bool extend_array(struct path **res, struct path **to_free,
+                                unsigned n, unsigned *count, unsigned new_count)
+{
+        struct path *p;
+
+        if (likely(n < *count))
+                return true;
+        p = kmalloc_array(new_count, sizeof(struct path), GFP_KERNEL);
+        if (p && *count)
+                memcpy(p, *res, *count * sizeof(struct path));
+        *count = new_count;
+        kfree(*to_free);
+        *to_free = *res = p;
+        return p;
+}
 
-struct vfsmount *collect_mounts(const struct path *path)
+struct path *collect_paths(const struct path *path,
+                           struct path *prealloc, unsigned count)
 {
-        struct mount *tree;
-        namespace_lock();
-        if (!check_mnt(real_mount(path->mnt)))
-                tree = ERR_PTR(-EINVAL);
-        else
-                tree = copy_tree(real_mount(path->mnt), path->dentry,
-                                 CL_COPY_ALL | CL_PRIVATE);
-        namespace_unlock();
-        if (IS_ERR(tree))
-                return ERR_CAST(tree);
-        return &tree->mnt;
+        struct mount *root = real_mount(path->mnt);
+        struct mount *child;
+        struct path *res = prealloc, *to_free = NULL;
+        unsigned n = 0;
+
+        guard(rwsem_read)(&namespace_sem);
+
+        if (!check_mnt(root))
+                return ERR_PTR(-EINVAL);
+        if (!extend_array(&res, &to_free, 0, &count, 32))
+                return ERR_PTR(-ENOMEM);
+        res[n++] = *path;
+        list_for_each_entry(child, &root->mnt_mounts, mnt_child) {
+                if (!is_subdir(child->mnt_mountpoint, path->dentry))
+                        continue;
+                for (struct mount *m = child; m; m = next_mnt(m, child)) {
+                        if (!extend_array(&res, &to_free, n, &count, 2 * count))
+                                return ERR_PTR(-ENOMEM);
+                        res[n].mnt = &m->mnt;
+                        res[n].dentry = m->mnt.mnt_root;
+                        n++;
+                }
+        }
+        if (!extend_array(&res, &to_free, n, &count, count + 1))
+                return ERR_PTR(-ENOMEM);
+        memset(res + n, 0, (count - n) * sizeof(struct path));
+        for (struct path *p = res; p->mnt; p++)
+                path_get(p);
+        return res;
+}
+
+void drop_collected_paths(struct path *paths, struct path *prealloc)
+{
+        for (struct path *p = paths; p->mnt; p++)
+                path_put(p);
+        if (paths != prealloc)
+                kfree(paths);
 }
 
-static void free_mnt_ns(struct mnt_namespace *);
 static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *, bool);
 
 void dissolve_on_fput(struct vfsmount *mnt)
 {
-        struct mnt_namespace *ns;
-        namespace_lock();
-        lock_mount_hash();
-        ns = real_mount(mnt)->mnt_ns;
-        if (ns) {
-                if (is_anon_ns(ns))
-                        umount_tree(real_mount(mnt), UMOUNT_CONNECTED);
-                else
-                        ns = NULL;
+        struct mount *m = real_mount(mnt);
+
+        /*
+         * m used to be the root of anon namespace; if it still is one,
+         * we need to dissolve the mount tree and free that namespace.
+         * Let's try to avoid taking namespace_sem if we can determine
+         * that there's nothing to do without it - rcu_read_lock() is
+         * enough to make anon_ns_root() memory-safe and once m has
+         * left its namespace, it's no longer our concern, since it will
+         * never become a root of anon ns again.
+         */
+
+        scoped_guard(rcu) {
+                if (!anon_ns_root(m))
+                        return;
         }
-        unlock_mount_hash();
-        namespace_unlock();
-        if (ns)
-                free_mnt_ns(ns);
-}
 
-void drop_collected_mounts(struct vfsmount *mnt)
-{
-        namespace_lock();
-        lock_mount_hash();
-        umount_tree(real_mount(mnt), 0);
-        unlock_mount_hash();
-        namespace_unlock();
+        scoped_guard(namespace_lock, &namespace_sem) {
+                if (!anon_ns_root(m))
+                        return;
+
+                emptied_ns = m->mnt_ns;
+                lock_mount_hash();
+                umount_tree(m, UMOUNT_CONNECTED);
+                unlock_mount_hash();
+        }
 }
 
-bool has_locked_children(struct mount *mnt, struct dentry *dentry)
+static bool __has_locked_children(struct mount *mnt, struct dentry *dentry)
 {
         struct mount *child;
@@ -2239,6 +2389,38 @@ static bool __has_locked_children(struct mount *mnt, struct dentry *dentry)
         return false;
 }
 
+bool has_locked_children(struct mount *mnt, struct dentry *dentry)
+{
+        bool res;
+
+        read_seqlock_excl(&mount_lock);
+        res = __has_locked_children(mnt, dentry);
+        read_sequnlock_excl(&mount_lock);
+        return res;
+}
+
+/*
+ * Check that there aren't references to earlier/same mount namespaces in the
+ * specified subtree.  Such references can act as pins for mount namespaces
+ * that aren't checked by the mount-cycle checking code, thereby allowing
+ * cycles to be made.
+ */
+static bool check_for_nsfs_mounts(struct mount *subtree)
+{
+        struct mount *p;
+        bool ret = false;
+
+        lock_mount_hash();
+        for (p = subtree; p; p = next_mnt(p, subtree))
+                if (mnt_ns_loop(p->mnt.mnt_root))
+                        goto out;
+
+        ret = true;
+out:
+        unlock_mount_hash();
+        return ret;
+}
+
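dissolve_on_fput() above uses an optimistic double-check: a cheap lockless test under RCU to bail out early, repeated under the real lock before committing. As a generic sketch of that shape (needs_dissolving(), dissolve() and big_sem are hypothetical):

    static void maybe_dissolve(struct obj *o)
    {
            scoped_guard(rcu) {
                    if (!needs_dissolving(o))       /* lockless pre-check */
                            return;
            }
            scoped_guard(rwsem_write, &big_sem) {
                    if (!needs_dissolving(o))       /* re-check under lock */
                            return;
                    dissolve(o);
            }
    }

The pre-check can only produce a stale "yes"; the authoritative answer is re-derived under the lock, so a false positive merely costs one lock round trip.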
 /**
  * clone_private_mount - create a private clone of a path
  * @path: path to clone
@@ -2247,6 +2429,8 @@ static bool check_for_nsfs_mounts(struct mount *subtree)
  * will not be attached anywhere in the namespace and will be private (i.e.
  * changes to the originating mount won't be propagated into this).
  *
+ * This assumes caller has called or done the equivalent of may_mount().
+ *
  * Release with mntput().
  */
 struct vfsmount *clone_private_mount(const struct path *path)
@@ -2254,48 +2438,42 @@ struct vfsmount *clone_private_mount(const struct path *path)
         struct mount *old_mnt = real_mount(path->mnt);
         struct mount *new_mnt;
 
-        down_read(&namespace_sem);
+        guard(rwsem_read)(&namespace_sem);
+
         if (IS_MNT_UNBINDABLE(old_mnt))
-                goto invalid;
+                return ERR_PTR(-EINVAL);
 
-        if (!check_mnt(old_mnt))
-                goto invalid;
+        /*
+         * Make sure the source mount is acceptable.
+         * Anything mounted in our mount namespace is allowed.
+         * Otherwise, it must be the root of an anonymous mount
+         * namespace, and we need to make sure no namespace
+         * loops get created.
+         */
+        if (!check_mnt(old_mnt)) {
+                if (!anon_ns_root(old_mnt))
+                        return ERR_PTR(-EINVAL);
 
-        if (has_locked_children(old_mnt, path->dentry))
-                goto invalid;
+                if (!check_for_nsfs_mounts(old_mnt))
+                        return ERR_PTR(-EINVAL);
+        }
 
-        new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE);
-        up_read(&namespace_sem);
+        if (!ns_capable(old_mnt->mnt_ns->user_ns, CAP_SYS_ADMIN))
+                return ERR_PTR(-EPERM);
 
+        if (__has_locked_children(old_mnt, path->dentry))
+                return ERR_PTR(-EINVAL);
+
+        new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE);
         if (IS_ERR(new_mnt))
-                return ERR_CAST(new_mnt);
+                return ERR_PTR(-EINVAL);
 
         /* Longterm mount to be removed by kern_unmount*() */
         new_mnt->mnt_ns = MNT_NS_INTERNAL;
+
         return &new_mnt->mnt;
-
-invalid:
-        up_read(&namespace_sem);
-        return ERR_PTR(-EINVAL);
 }
 EXPORT_SYMBOL_GPL(clone_private_mount);
 
-int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg,
-                   struct vfsmount *root)
-{
-        struct mount *mnt;
-        int res = f(root, arg);
-        if (res)
-                return res;
-        list_for_each_entry(mnt, &real_mount(root)->mnt_list, mnt_list) {
-                res = f(&mnt->mnt, arg);
-                if (res)
-                        return res;
-        }
-        return 0;
-}
-
 static void lock_mnt_tree(struct mount *mnt)
 {
         struct mount *p;
@@ -2317,7 +2495,7 @@ static void lock_mnt_tree(struct mount *mnt)
                 if (flags & MNT_NOEXEC)
                         flags |= MNT_LOCK_NOEXEC;
                 /* Don't allow unprivileged users to reveal what is under a mount */
-                if (list_empty(&p->mnt_expire))
+                if (list_empty(&p->mnt_expire) && p != mnt)
                         flags |= MNT_LOCKED;
                 p->mnt.mnt_flags = flags;
         }
@@ -2338,7 +2516,7 @@ static int invent_group_ids(struct mount *mnt, bool recurse)
         struct mount *p;
 
         for (p = mnt; p; p = recurse ? next_mnt(p, mnt) : NULL) {
-                if (!p->mnt_group_id && !IS_MNT_SHARED(p)) {
+                if (!p->mnt_group_id) {
                         int err = mnt_alloc_group_id(p);
                         if (err) {
                                 cleanup_group_ids(mnt, p);
@@ -2374,16 +2552,15 @@ int count_mounts(struct mnt_namespace *ns, struct mount *mnt)
 }
 
 enum mnt_tree_flags_t {
-        MNT_TREE_MOVE = BIT(0),
-        MNT_TREE_BENEATH = BIT(1),
+        MNT_TREE_BENEATH = BIT(0),
+        MNT_TREE_PROPAGATION = BIT(1),
 };
 
 /**
  * attach_recursive_mnt - attach a source mount tree
  * @source_mnt: mount tree to be attached
- * @top_mnt:    mount that @source_mnt will be mounted on or mounted beneath
+ * @dest_mnt:   mount that @source_mnt will be mounted on
  * @dest_mp:    the mountpoint @source_mnt will be mounted at
- * @flags:      modify how @source_mnt is supposed to be attached
 *
 * NOTE: the table below explains the semantics when a source mount
 * of a given type is attached to a destination mount of a given type.
@@ -2446,26 +2623,31 @@ enum mnt_tree_flags_t {
  * Otherwise a negative error code is returned.
  */
 static int attach_recursive_mnt(struct mount *source_mnt,
-                                struct mount *top_mnt,
-                                struct mountpoint *dest_mp,
-                                enum mnt_tree_flags_t flags)
+                                struct mount *dest_mnt,
+                                struct mountpoint *dest_mp)
 {
         struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
         HLIST_HEAD(tree_list);
-        struct mnt_namespace *ns = top_mnt->mnt_ns;
-        struct mountpoint *smp;
-        struct mount *child, *dest_mnt, *p;
+        struct mnt_namespace *ns = dest_mnt->mnt_ns;
+        struct pinned_mountpoint root = {};
+        struct mountpoint *shorter = NULL;
+        struct mount *child, *p;
+        struct mount *top;
         struct hlist_node *n;
         int err = 0;
-        bool moving = flags & MNT_TREE_MOVE, beneath = flags & MNT_TREE_BENEATH;
+        bool moving = mnt_has_parent(source_mnt);
 
         /*
          * Preallocate a mountpoint in case the new mounts need to be
         * mounted beneath mounts on the same mountpoint.
         */
-        smp = get_mountpoint(source_mnt->mnt.mnt_root);
-        if (IS_ERR(smp))
-                return PTR_ERR(smp);
+        for (top = source_mnt; unlikely(top->overmount); top = top->overmount) {
+                if (!shorter && is_mnt_ns_file(top->mnt.mnt_root))
+                        shorter = top->mnt_mp;
+        }
+        err = get_mountpoint(top->mnt.mnt_root, &root);
+        if (err)
+                return err;
 
         /* Is there space to add these mounts to the mount namespace? */
         if (!moving) {
@@ -2474,11 +2656,6 @@ static int attach_recursive_mnt(struct mount *source_mnt,
                 goto out;
         }
 
-        if (beneath)
-                dest_mnt = top_mnt->mnt_parent;
-        else
-                dest_mnt = top_mnt;
-
         if (IS_MNT_SHARED(dest_mnt)) {
                 err = invent_group_ids(source_mnt, true);
                 if (err)
@@ -2495,41 +2672,50 @@ static int attach_recursive_mnt(struct mount *source_mnt,
         }
 
         if (moving) {
-                if (beneath)
-                        dest_mp = smp;
-                unhash_mnt(source_mnt);
-                attach_mnt(source_mnt, top_mnt, dest_mp, beneath);
-                touch_mnt_namespace(source_mnt->mnt_ns);
+                umount_mnt(source_mnt);
+                mnt_notify_add(source_mnt);
+                /* if the mount is moved, it should no longer be expired
+                 * automatically */
+                list_del_init(&source_mnt->mnt_expire);
         } else {
                 if (source_mnt->mnt_ns) {
-                        LIST_HEAD(head);
-
                         /* move from anon - the caller will destroy */
+                        emptied_ns = source_mnt->mnt_ns;
                         for (p = source_mnt; p; p = next_mnt(p, source_mnt))
-                                move_from_ns(p, &head);
-                        list_del_init(&head);
+                                move_from_ns(p);
                 }
-                if (beneath)
-                        mnt_set_mountpoint_beneath(source_mnt, top_mnt, smp);
-                else
-                        mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
-                commit_tree(source_mnt);
         }
 
+        mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
+
+        /*
+         * Now the original copy is in the same state as the secondaries -
+         * its root attached to mountpoint, but not hashed and all mounts
+         * in it are either in our namespace or in no namespace at all.
+         * Add the original to the list of copies and deal with the
+         * rest of work for all of them uniformly.
+         */
+        hlist_add_head(&source_mnt->mnt_hash, &tree_list);
+
         hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) {
                 struct mount *q;
                 hlist_del_init(&child->mnt_hash);
-                q = __lookup_mnt(&child->mnt_parent->mnt,
-                                 child->mnt_mountpoint);
-                if (q)
-                        mnt_change_mountpoint(child, smp, q);
                 /* Notice when we are propagating across user namespaces */
                 if (child->mnt_parent->mnt_ns->user_ns != user_ns)
                         lock_mnt_tree(child);
-                child->mnt.mnt_flags &= ~MNT_LOCKED;
+                q = __lookup_mnt(&child->mnt_parent->mnt,
+                                 child->mnt_mountpoint);
+                if (q) {
+                        struct mountpoint *mp = root.mp;
+                        struct mount *r = child;
+                        while (unlikely(r->overmount))
+                                r = r->overmount;
+                        if (unlikely(shorter) && child != source_mnt)
+                                mp = shorter;
+                        mnt_change_mountpoint(r, mp, q);
+                }
                 commit_tree(child);
         }
-        put_mountpoint(smp);
+        unpin_mountpoint(&root);
         unlock_mount_hash();
 
         return 0;
@@ -2546,7 +2732,7 @@ static int attach_recursive_mnt(struct mount *source_mnt,
         ns->pending_mounts = 0;
 
         read_seqlock_excl(&mount_lock);
-        put_mountpoint(smp);
+        unpin_mountpoint(&root);
         read_sequnlock_excl(&mount_lock);
 
         return err;
@@ -2586,79 +2772,82 @@ static int attach_recursive_mnt(struct mount *source_mnt,
  * Return: Either the target mountpoint on the top mount or the top
  * mount's mountpoint.
  */
-static struct mountpoint *do_lock_mount(struct path *path, bool beneath)
+static int do_lock_mount(struct path *path, struct pinned_mountpoint *pinned, bool beneath)
 {
         struct vfsmount *mnt = path->mnt;
         struct dentry *dentry;
-        struct mountpoint *mp = ERR_PTR(-ENOENT);
+        struct path under = {};
+        int err = -ENOENT;
 
         for (;;) {
-                struct mount *m;
+                struct mount *m = real_mount(mnt);
 
                 if (beneath) {
-                        m = real_mount(mnt);
+                        path_put(&under);
                         read_seqlock_excl(&mount_lock);
-                        dentry = dget(m->mnt_mountpoint);
+                        under.mnt = mntget(&m->mnt_parent->mnt);
+                        under.dentry = dget(m->mnt_mountpoint);
                         read_sequnlock_excl(&mount_lock);
+                        dentry = under.dentry;
                 } else {
                         dentry = path->dentry;
                 }
 
                 inode_lock(dentry->d_inode);
-                if (unlikely(cant_mount(dentry))) {
-                        inode_unlock(dentry->d_inode);
-                        goto out;
-                }
-
                 namespace_lock();
 
-                if (beneath && (!is_mounted(mnt) || m->mnt_mountpoint != dentry)) {
+                if (unlikely(cant_mount(dentry) || !is_mounted(mnt)))
+                        break;  // not to be mounted on
+
+                if (beneath && unlikely(m->mnt_mountpoint != dentry ||
+                                        &m->mnt_parent->mnt != under.mnt)) {
                         namespace_unlock();
                         inode_unlock(dentry->d_inode);
-                        goto out;
+                        continue;       // got moved
                 }
 
                 mnt = lookup_mnt(path);
-                if (likely(!mnt))
+                if (unlikely(mnt)) {
+                        namespace_unlock();
+                        inode_unlock(dentry->d_inode);
+                        path_put(path);
+                        path->mnt = mnt;
+                        path->dentry = dget(mnt->mnt_root);
+                        continue;       // got overmounted
+                }
+                err = get_mountpoint(dentry, pinned);
+                if (err)
                         break;
-
-                namespace_unlock();
-                inode_unlock(dentry->d_inode);
-                if (beneath)
-                        dput(dentry);
-                path_put(path);
-                path->mnt = mnt;
-                path->dentry = dget(mnt->mnt_root);
-        }
-
-        mp = get_mountpoint(dentry);
-        if (IS_ERR(mp)) {
-                namespace_unlock();
-                inode_unlock(dentry->d_inode);
+                if (beneath) {
+                        /*
+                         * @under duplicates the references that will stay
+                         * at least until namespace_unlock(), so the path_put()
+                         * below is safe (and OK to do under namespace_lock -
+                         * we are not dropping the final references here).
+                         */
+                        path_put(&under);
+                }
+                return 0;
         }
-
-out:
+        namespace_unlock();
+        inode_unlock(dentry->d_inode);
         if (beneath)
-                dput(dentry);
-
-        return mp;
+                path_put(&under);
+        return err;
 }
 
-static inline struct mountpoint *lock_mount(struct path *path)
+static inline int lock_mount(struct path *path, struct pinned_mountpoint *m)
 {
-        return do_lock_mount(path, false);
+        return do_lock_mount(path, m, false);
 }
 
-static void unlock_mount(struct mountpoint *where)
+static void unlock_mount(struct pinned_mountpoint *m)
 {
-        struct dentry *dentry = where->m_dentry;
-
+        inode_unlock(m->mp->m_dentry->d_inode);
         read_seqlock_excl(&mount_lock);
-        put_mountpoint(where);
+        unpin_mountpoint(m);
         read_sequnlock_excl(&mount_lock);
-
         namespace_unlock();
-        inode_unlock(dentry->d_inode);
 }
 
 static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp)
@@ -2670,7 +2859,7 @@ static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp)
             d_is_dir(mnt->mnt.mnt_root))
                 return -ENOTDIR;
 
-        return attach_recursive_mnt(mnt, p, mp, 0);
+        return attach_recursive_mnt(mnt, p, mp);
 }
 
 /*
@@ -2709,44 +2898,106 @@ static int do_change_type(struct path *path, int ms_flags)
                 return -EINVAL;
 
         namespace_lock();
+        if (!check_mnt(mnt)) {
+                err = -EINVAL;
+                goto out_unlock;
+        }
         if (type == MS_SHARED) {
                 err = invent_group_ids(mnt, recurse);
                 if (err)
                         goto out_unlock;
         }
 
-        lock_mount_hash();
         for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
                 change_mnt_propagation(m, type);
-        unlock_mount_hash();
 
 out_unlock:
         namespace_unlock();
         return err;
 }
+/* may_copy_tree() - check if a mount tree can be copied
+ * @path: path to the mount tree to be copied
+ *
+ * This helper checks if the caller may copy the mount tree starting
+ * from @path->mnt. The caller may copy the mount tree under the
+ * following circumstances:
+ *
+ * (1) The caller is located in the mount namespace of the mount tree.
+ *     This also implies that the mount does not belong to an anonymous
+ *     mount namespace.
+ * (2) The caller tries to copy an nsfs mount referring to a mount
+ *     namespace, i.e., the caller is trying to copy a mount namespace
+ *     entry from nsfs.
+ * (3) The caller tries to copy a pidfs mount referring to a pidfd.
+ * (4) The caller is trying to copy a mount tree that belongs to an
+ *     anonymous mount namespace.
+ *
+ *     For that to be safe, this helper enforces that the origin mount
+ *     namespace the anonymous mount namespace was created from is the
+ *     same as the caller's mount namespace by comparing the sequence
+ *     numbers.
+ *
+ *     This is not strictly necessary. The current semantics of the new
+ *     mount api enforce that the caller must be located in the same
+ *     mount namespace as the mount tree it interacts with. Using the
+ *     origin sequence number preserves these semantics even for
+ *     anonymous mount namespaces. However, one could envision extending
+ *     the api to directly operate across mount namespaces if needed.
+ *
+ *     The ownership of a non-anonymous mount namespace such as the
+ *     caller's cannot change.
+ *     => We know that the caller's mount namespace is stable.
+ *
+ *     If the origin sequence number of the anonymous mount namespace is
+ *     the same as the sequence number of the caller's mount namespace.
+ *     => The owning namespaces are the same.
+ *
+ *     ==> The earlier capability check on the owning namespace of the
+ *         caller's mount namespace ensures that the caller has the
+ *         ability to copy the mount tree.
+ *
+ * Returns true if the mount tree can be copied, false otherwise.
+ */
+static inline bool may_copy_tree(struct path *path)
+{
+        struct mount *mnt = real_mount(path->mnt);
+        const struct dentry_operations *d_op;
+
+        if (check_mnt(mnt))
+                return true;
+
+        d_op = path->dentry->d_op;
+        if (d_op == &ns_dentry_operations)
+                return true;
+
+        if (d_op == &pidfs_dentry_operations)
+                return true;
+
+        if (!is_mounted(path->mnt))
+                return false;
+
+        return check_anonymous_mnt(mnt);
+}
+
+
 static struct mount *__do_loopback(struct path *old_path, int recurse)
 {
-        struct mount *mnt = ERR_PTR(-EINVAL), *old = real_mount(old_path->mnt);
+        struct mount *old = real_mount(old_path->mnt);
 
         if (IS_MNT_UNBINDABLE(old))
-                return mnt;
+                return ERR_PTR(-EINVAL);
 
-        if (!check_mnt(old) && old_path->dentry->d_op != &ns_dentry_operations)
-                return mnt;
+        if (!may_copy_tree(old_path))
+                return ERR_PTR(-EINVAL);
 
-        if (!recurse && has_locked_children(old, old_path->dentry))
-                return mnt;
+        if (!recurse && __has_locked_children(old, old_path->dentry))
+                return ERR_PTR(-EINVAL);
 
         if (recurse)
-                mnt = copy_tree(old, old_path->dentry, CL_COPY_MNT_NS_FILE);
+                return copy_tree(old, old_path->dentry, CL_COPY_MNT_NS_FILE);
         else
-                mnt = clone_mnt(old, old_path->dentry, 0);
-
-        if (!IS_ERR(mnt))
-                mnt->mnt.mnt_flags &= ~MNT_LOCKED;
-
-        return mnt;
+                return clone_mnt(old, old_path->dentry, 0);
 }
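Stripped of the surrounding checks, the ownership rule that both the comment and check_anonymous_mnt() describe comes down to a single comparison. A condensed restatement, not a drop-in kernel helper:

    static bool caller_owns_anon_tree(const struct mnt_namespace *anon_ns,
                                      const struct mnt_namespace *caller_ns)
    {
            /*
             * An anonymous namespace is usable if it was detached from the
             * caller's namespace (matching origin sequence number) or if it
             * never had an origin at all.
             */
            return !anon_ns->seq_origin || anon_ns->seq_origin == caller_ns->seq;
    }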
 /*
@@ -2757,7 +3008,7 @@ static int do_loopback(struct path *path, const char *old_name,
 {
         struct path old_path;
         struct mount *mnt = NULL, *parent;
-        struct mountpoint *mp;
+        struct pinned_mountpoint mp = {};
         int err;
         if (!old_name || !*old_name)
                 return -EINVAL;
@@ -2769,11 +3020,9 @@ static int do_loopback(struct path *path, const char *old_name,
         if (mnt_ns_loop(old_path.dentry))
                 goto out;
 
-        mp = lock_mount(path);
-        if (IS_ERR(mp)) {
-                err = PTR_ERR(mp);
+        err = lock_mount(path, &mp);
+        if (err)
                 goto out;
-        }
 
         parent = real_mount(path->mnt);
         if (!check_mnt(parent))
@@ -2785,14 +3034,14 @@ static int do_loopback(struct path *path, const char *old_name,
                 goto out2;
         }
 
-        err = graft_tree(mnt, parent, mp);
+        err = graft_tree(mnt, parent, mp.mp);
         if (err) {
                 lock_mount_hash();
                 umount_tree(mnt, UMOUNT_SYNC);
                 unlock_mount_hash();
         }
 out2:
-        unlock_mount(mp);
+        unlock_mount(&mp);
 out:
         path_put(&old_path);
         return err;
@@ -2800,15 +3049,30 @@ out:
 
 static struct file *open_detached_copy(struct path *path, bool recursive)
 {
-        struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
-        struct mnt_namespace *ns = alloc_mnt_ns(user_ns, true);
+        struct mnt_namespace *ns, *mnt_ns = current->nsproxy->mnt_ns, *src_mnt_ns;
+        struct user_namespace *user_ns = mnt_ns->user_ns;
         struct mount *mnt, *p;
         struct file *file;
 
+        ns = alloc_mnt_ns(user_ns, true);
         if (IS_ERR(ns))
                 return ERR_CAST(ns);
 
         namespace_lock();
+
+        /*
+         * Record the sequence number of the source mount namespace.
+         * This needs to hold namespace_sem to ensure that the mount
+         * doesn't get attached.
+         */
+        if (is_mounted(path->mnt)) {
+                src_mnt_ns = real_mount(path->mnt)->mnt_ns;
+                if (is_anon_ns(src_mnt_ns))
+                        ns->seq_origin = src_mnt_ns->seq_origin;
+                else
+                        ns->seq_origin = src_mnt_ns->seq;
+        }
+
         mnt = __do_loopback(path, recursive);
         if (IS_ERR(mnt)) {
                 namespace_unlock();
@@ -2836,24 +3100,22 @@ static struct file *open_detached_copy(struct path *path, bool recursive)
         return file;
 }
 
-SYSCALL_DEFINE3(open_tree, int, dfd, const char __user *, filename, unsigned, flags)
+static struct file *vfs_open_tree(int dfd, const char __user *filename, unsigned int flags)
 {
-        struct file *file;
-        struct path path;
+        int ret;
+        struct path path __free(path_put) = {};
         int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW;
         bool detached = flags & OPEN_TREE_CLONE;
-        int error;
-        int fd;
 
         BUILD_BUG_ON(OPEN_TREE_CLOEXEC != O_CLOEXEC);
 
         if (flags & ~(AT_EMPTY_PATH | AT_NO_AUTOMOUNT | AT_RECURSIVE |
                       AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLONE |
                       OPEN_TREE_CLOEXEC))
-                return -EINVAL;
+                return ERR_PTR(-EINVAL);
 
         if ((flags & (AT_RECURSIVE | OPEN_TREE_CLONE)) == AT_RECURSIVE)
-                return -EINVAL;
+                return ERR_PTR(-EINVAL);
 
         if (flags & AT_NO_AUTOMOUNT)
                 lookup_flags &= ~LOOKUP_AUTOMOUNT;
@@ -2863,27 +3125,32 @@ SYSCALL_DEFINE3(open_tree, int, dfd, const char __user *, filename, unsigned, flags)
                 lookup_flags |= LOOKUP_EMPTY;
 
         if (detached && !may_mount())
-                return -EPERM;
+                return ERR_PTR(-EPERM);
+
+        ret = user_path_at(dfd, filename, lookup_flags, &path);
+        if (unlikely(ret))
+                return ERR_PTR(ret);
+
+        if (detached)
+                return open_detached_copy(&path, flags & AT_RECURSIVE);
+
+        return dentry_open(&path, O_PATH, current_cred());
+}
+
+SYSCALL_DEFINE3(open_tree, int, dfd, const char __user *, filename, unsigned, flags)
+{
+        int fd;
+        struct file *file __free(fput) = NULL;
+
+        file = vfs_open_tree(dfd, filename, flags);
+        if (IS_ERR(file))
+                return PTR_ERR(file);
 
         fd = get_unused_fd_flags(flags & O_CLOEXEC);
         if (fd < 0)
                 return fd;
 
-        error = user_path_at(dfd, filename, lookup_flags, &path);
-        if (unlikely(error)) {
-                file = ERR_PTR(error);
-        } else {
-                if (detached)
-                        file = open_detached_copy(&path, flags & AT_RECURSIVE);
-                else
-                        file = dentry_open(&path, O_PATH, current_cred());
-                path_put(&path);
-        }
-        if (IS_ERR(file)) {
-                put_unused_fd(fd);
-                return PTR_ERR(file);
-        }
-        fd_install(fd, file);
+        fd_install(fd, no_free_ptr(file));
         return fd;
 }
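For context, this is how the reworked syscall is consumed from userspace: OPEN_TREE_CLONE detaches a (recursively copied) mount tree as a floating tree tied to a file descriptor, which move_mount() can later attach. A minimal usage sketch, assuming a modern glibc and kernel; the /mnt paths are placeholders and appropriate privilege (typically CAP_SYS_ADMIN) is required:

    #define _GNU_SOURCE
    #include <fcntl.h>              /* AT_FDCWD */
    #include <linux/mount.h>        /* OPEN_TREE_*, MOVE_MOUNT_* */
    #include <sys/syscall.h>
    #include <unistd.h>

    #ifndef AT_RECURSIVE
    #define AT_RECURSIVE 0x8000
    #endif

    int main(void)
    {
            int fd = syscall(SYS_open_tree, AT_FDCWD, "/mnt/src",
                             OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC | AT_RECURSIVE);
            if (fd < 0)
                    return 1;

            /* attach the detached copy; "" plus F_EMPTY_PATH means "fd itself" */
            if (syscall(SYS_move_mount, fd, "", AT_FDCWD, "/mnt/dst",
                        MOVE_MOUNT_F_EMPTY_PATH) < 0)
                    return 1;
            return 0;
    }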
- */ -static bool check_for_nsfs_mounts(struct mount *subtree) -{ - struct mount *p; - bool ret = false; - - lock_mount_hash(); - for (p = subtree; p; p = next_mnt(p, subtree)) - if (mnt_ns_loop(p->mnt.mnt_root)) - goto out; - - ret = true; -out: - unlock_mount_hash(); - return ret; -} - static int do_set_group(struct path *from_path, struct path *to_path) { struct mount *from, *to; @@ -3132,7 +3377,7 @@ static int do_set_group(struct path *from_path, struct path *to_path) goto out; /* From mount should not have locked children in place of To's root */ - if (has_locked_children(from, to->mnt.mnt_root)) + if (__has_locked_children(from, to->mnt.mnt_root)) goto out; /* Setting sharing groups is only allowed on private mounts */ @@ -3144,18 +3389,14 @@ static int do_set_group(struct path *from_path, struct path *to_path) goto out; if (IS_MNT_SLAVE(from)) { - struct mount *m = from->mnt_master; - - list_add(&to->mnt_slave, &m->mnt_slave_list); - to->mnt_master = m; + hlist_add_behind(&to->mnt_slave, &from->mnt_slave); + to->mnt_master = from->mnt_master; } if (IS_MNT_SHARED(from)) { to->mnt_group_id = from->mnt_group_id; list_add(&to->mnt_share, &from->mnt_share); - lock_mount_hash(); set_mnt_shared(to); - unlock_mount_hash(); } err = 0; @@ -3171,18 +3412,36 @@ out: * Check if path is overmounted, i.e., if there's a mount on top of * @path->mnt with @path->dentry as mountpoint. * - * Context: This function expects namespace_lock() to be held. + * Context: namespace_sem must be held at least shared. + * MUST NOT be called under lock_mount_hash() (there one should just + * call __lookup_mnt() and check if it returns NULL). * Return: If path is overmounted true is returned, false if not. */ static inline bool path_overmounted(const struct path *path) { + unsigned seq = read_seqbegin(&mount_lock); + bool no_child; + rcu_read_lock(); - if (unlikely(__lookup_mnt(path->mnt, path->dentry))) { - rcu_read_unlock(); - return true; - } + no_child = !__lookup_mnt(path->mnt, path->dentry); rcu_read_unlock(); - return false; + if (need_seqretry(&mount_lock, seq)) { + read_seqlock_excl(&mount_lock); + no_child = !__lookup_mnt(path->mnt, path->dentry); + read_sequnlock_excl(&mount_lock); + } + return unlikely(!no_child); +} + +/* + * Check if there is a possibly empty chain of descent from p1 to p2. + * Locks: namespace_sem (shared) or mount_lock (read_seqlock_excl). + */ +static bool mount_is_ancestor(const struct mount *p1, const struct mount *p2) +{ + while (p2 != p1 && mnt_has_parent(p2)) + p2 = p2->mnt_parent; + return p2 == p1; } /** @@ -3236,9 +3495,8 @@ static int can_move_mount_beneath(const struct path *from, if (parent_mnt_to == current->nsproxy->mnt_ns->root) return -EINVAL; - for (struct mount *p = mnt_from; mnt_has_parent(p); p = p->mnt_parent) - if (p == mnt_to) - return -EINVAL; + if (mount_is_ancestor(mnt_to, mnt_from)) + return -EINVAL; /* * If the parent mount propagates to the child mount this would @@ -3261,52 +3519,114 @@ static int can_move_mount_beneath(const struct path *from, * @mnt_from itself. This defeats the whole purpose of mounting * @mnt_from beneath @mnt_to. 
*/ - if (propagation_would_overmount(parent_mnt_to, mnt_from, mp)) + if (check_mnt(mnt_from) && + propagation_would_overmount(parent_mnt_to, mnt_from, mp)) return -EINVAL; return 0; } -static int do_move_mount(struct path *old_path, struct path *new_path, - bool beneath) +/* may_use_mount() - check if a mount tree can be used + * @mnt: vfsmount to be used + * + * This helper checks if the caller may use the mount tree starting + * from @path->mnt. The caller may use the mount tree under the + * following circumstances: + * + * (1) The caller is located in the mount namespace of the mount tree. + * This also implies that the mount does not belong to an anonymous + * mount namespace. + * (2) The caller is trying to use a mount tree that belongs to an + * anonymous mount namespace. + * + * For that to be safe, this helper enforces that the origin mount + * namespace the anonymous mount namespace was created from is the + * same as the caller's mount namespace by comparing the sequence + * numbers. + * + * The ownership of a non-anonymous mount namespace such as the + * caller's cannot change. + * => We know that the caller's mount namespace is stable. + * + * If the origin sequence number of the anonymous mount namespace is + * the same as the sequence number of the caller's mount namespace. + * => The owning namespaces are the same. + * + * ==> The earlier capability check on the owning namespace of the + * caller's mount namespace ensures that the caller has the + * ability to use the mount tree. + * + * Returns true if the mount tree can be used, false otherwise. + */ +static inline bool may_use_mount(struct mount *mnt) +{ + if (check_mnt(mnt)) + return true; + + /* + * Make sure that noone unmounted the target path or somehow + * managed to get their hands on something purely kernel + * internal. + */ + if (!is_mounted(&mnt->mnt)) + return false; + + return check_anonymous_mnt(mnt); +} + +static int do_move_mount(struct path *old_path, + struct path *new_path, enum mnt_tree_flags_t flags) { struct mnt_namespace *ns; struct mount *p; struct mount *old; struct mount *parent; - struct mountpoint *mp, *old_mp; + struct pinned_mountpoint mp; int err; - bool attached; - enum mnt_tree_flags_t flags = 0; + bool beneath = flags & MNT_TREE_BENEATH; - mp = do_lock_mount(new_path, beneath); - if (IS_ERR(mp)) - return PTR_ERR(mp); + err = do_lock_mount(new_path, &mp, beneath); + if (err) + return err; old = real_mount(old_path->mnt); p = real_mount(new_path->mnt); parent = old->mnt_parent; - attached = mnt_has_parent(old); - if (attached) - flags |= MNT_TREE_MOVE; - old_mp = old->mnt_mp; ns = old->mnt_ns; err = -EINVAL; - /* The mountpoint must be in our namespace. */ - if (!check_mnt(p)) - goto out; - - /* The thing moved must be mounted... */ - if (!is_mounted(&old->mnt)) - goto out; - - /* ... and either ours or the root of anon namespace */ - if (!(attached ? check_mnt(old) : is_anon_ns(ns))) - goto out; - if (old->mnt.mnt_flags & MNT_LOCKED) - goto out; + if (check_mnt(old)) { + /* if the source is in our namespace... */ + /* ... it should be detachable from parent */ + if (!mnt_has_parent(old) || IS_MNT_LOCKED(old)) + goto out; + /* ... and the target should be in our namespace */ + if (!check_mnt(p)) + goto out; + /* parent of the source should not be shared */ + if (IS_MNT_SHARED(parent)) + goto out; + } else { + /* + * otherwise the source must be the root of some anon namespace. 
+ */ + if (!anon_ns_root(old)) + goto out; + /* + * Bail out early if the target is within the same namespace - + * subsequent checks would've rejected that, but they lose + * some corner cases if we check it early. + */ + if (ns == p->mnt_ns) + goto out; + /* + * Target should be either in our namespace or in an acceptable + * anon namespace, sensu check_anonymous_mnt(). + */ + if (!may_use_mount(p)) + goto out; + } if (!path_mounted(old_path)) goto out; @@ -3314,20 +3634,14 @@ static int do_move_mount(struct path *old_path, struct path *new_path, if (d_is_dir(new_path->dentry) != d_is_dir(old_path->dentry)) goto out; - /* - * Don't move a mount residing in a shared parent. - */ - if (attached && IS_MNT_SHARED(parent)) - goto out; if (beneath) { - err = can_move_mount_beneath(old_path, new_path, mp); + err = can_move_mount_beneath(old_path, new_path, mp.mp); if (err) goto out; err = -EINVAL; p = p->mnt_parent; - flags |= MNT_TREE_BENEATH; } /* @@ -3339,27 +3653,12 @@ static int do_move_mount(struct path *old_path, struct path *new_path, err = -ELOOP; if (!check_for_nsfs_mounts(old)) goto out; - for (; mnt_has_parent(p); p = p->mnt_parent) - if (p == old) - goto out; - - err = attach_recursive_mnt(old, real_mount(new_path->mnt), mp, flags); - if (err) + if (mount_is_ancestor(old, p)) goto out; - /* if the mount is moved, it should no longer be expire - * automatically */ - list_del_init(&old->mnt_expire); - if (attached) - put_mountpoint(old_mp); + err = attach_recursive_mnt(old, p, mp.mp); out: - unlock_mount(mp); - if (!err) { - if (attached) - mntput_no_expire(parent); - else - free_mnt_ns(ns); - } + unlock_mount(&mp); return err; } @@ -3375,7 +3674,7 @@ static int do_move_mount_old(struct path *path, const char *old_name) if (err) return err; - err = do_move_mount(&old_path, path, false); + err = do_move_mount(&old_path, path, 0); path_put(&old_path); return err; } @@ -3420,7 +3719,7 @@ static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint, unsigned int mnt_flags) { struct vfsmount *mnt; - struct mountpoint *mp; + struct pinned_mountpoint mp = {}; struct super_block *sb = fc->root->d_sb; int error; @@ -3441,13 +3740,12 @@ static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint, mnt_warn_timestamp_expiry(mountpoint, mnt); - mp = lock_mount(mountpoint); - if (IS_ERR(mp)) { - mntput(mnt); - return PTR_ERR(mp); + error = lock_mount(mountpoint, &mp); + if (!error) { + error = do_add_mount(real_mount(mnt), mp.mp, + mountpoint, mnt_flags); + unlock_mount(&mp); } - error = do_add_mount(real_mount(mnt), mp, mountpoint, mnt_flags); - unlock_mount(mp); if (error < 0) mntput(mnt); return error; @@ -3515,7 +3813,7 @@ static int do_new_mount(struct path *path, const char *fstype, int sb_flags, int finish_automount(struct vfsmount *m, const struct path *path) { struct dentry *dentry = path->dentry; - struct mountpoint *mp; + struct pinned_mountpoint mp = {}; struct mount *mnt; int err; @@ -3525,10 +3823,6 @@ int finish_automount(struct vfsmount *m, const struct path *path) return PTR_ERR(m); mnt = real_mount(m); - /* The new mount record should have at least 2 refs to prevent it being - * expired before we get a chance to add it - */ - BUG_ON(mnt_get_count(mnt) < 2); if (m->mnt_sb == path->mnt->mnt_sb && m->mnt_root == dentry) { @@ -3551,30 +3845,21 @@ int finish_automount(struct vfsmount *m, const struct path *path) err = 0; goto discard_locked; } - mp = get_mountpoint(dentry); - if (IS_ERR(mp)) { - err = PTR_ERR(mp); + err = get_mountpoint(dentry, &mp); + if 
(err) goto discard_locked; - } - err = do_add_mount(mnt, mp, path, path->mnt->mnt_flags | MNT_SHRINKABLE); - unlock_mount(mp); + err = do_add_mount(mnt, mp.mp, path, + path->mnt->mnt_flags | MNT_SHRINKABLE); + unlock_mount(&mp); if (unlikely(err)) goto discard; - mntput(m); return 0; discard_locked: namespace_unlock(); inode_unlock(dentry->d_inode); discard: - /* remove m from any expiration list it may be on */ - if (!list_empty(&mnt->mnt_expire)) { - namespace_lock(); - list_del_init(&mnt->mnt_expire); - namespace_unlock(); - } - mntput(m); mntput(m); return err; } @@ -3586,11 +3871,9 @@ discard: */ void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list) { - namespace_lock(); - + read_seqlock_excl(&mount_lock); list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list); - - namespace_unlock(); + read_sequnlock_excl(&mount_lock); } EXPORT_SYMBOL(mnt_set_expiry); @@ -3612,11 +3895,14 @@ void mark_mounts_for_expiry(struct list_head *mounts) /* extract from the expiration list every vfsmount that matches the * following criteria: + * - already mounted * - only referenced by its parent vfsmount * - still marked for expiry (marked on the last call here; marks are * cleared by mntput()) */ list_for_each_entry_safe(mnt, next, mounts, mnt_expire) { + if (!is_mounted(&mnt->mnt)) + continue; if (!xchg(&mnt->mnt_expiry_mark, 1) || propagate_mount_busy(mnt, 1)) continue; @@ -3835,7 +4121,7 @@ int path_mount(const char *dev_name, struct path *path, data_page); } -long do_mount(const char *dev_name, const char __user *dir_name, +int do_mount(const char *dev_name, const char __user *dir_name, const char *type_page, unsigned long flags, void *data_page) { struct path path; @@ -3905,6 +4191,7 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a refcount_set(&new_ns->ns.count, 1); refcount_set(&new_ns->passive, 1); new_ns->mounts = RB_ROOT; + INIT_LIST_HEAD(&new_ns->mnt_ns_list); RB_CLEAR_NODE(&new_ns->mnt_ns_tree_node); init_waitqueue_head(&new_ns->poll); new_ns->user_ns = get_user_ns(user_ns); @@ -3940,7 +4227,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, /* First pass: copy the tree topology */ copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE; if (user_ns != ns->user_ns) - copy_flags |= CL_SHARED_TO_SLAVE; + copy_flags |= CL_SLAVE; new = copy_tree(old, old->mnt.mnt_root, copy_flags); if (IS_ERR(new)) { namespace_unlock(); @@ -3984,7 +4271,6 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, while (p->mnt.mnt_root != q->mnt.mnt_root) p = next_mnt(skip_mnt_tree(p), old); } - mnt_ns_tree_add(new_ns); namespace_unlock(); if (rootmnt) @@ -3992,6 +4278,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, if (pwdmnt) mntput(pwdmnt); + mnt_ns_tree_add(new_ns); return new_ns; } @@ -4215,6 +4502,21 @@ err_unlock: return ret; } +static inline int vfs_move_mount(struct path *from_path, struct path *to_path, + enum mnt_tree_flags_t mflags) +{ + int ret; + + ret = security_move_mount(from_path, to_path); + if (ret) + return ret; + + if (mflags & MNT_TREE_PROPAGATION) + return do_set_group(from_path, to_path); + + return do_move_mount(from_path, to_path, mflags); +} + /* * Move a mount from one place to another. 
In combination with * fsopen()/fsmount() this is used to install a new mount and in combination @@ -4228,8 +4530,12 @@ SYSCALL_DEFINE5(move_mount, int, to_dfd, const char __user *, to_pathname, unsigned int, flags) { - struct path from_path, to_path; - unsigned int lflags; + struct path to_path __free(path_put) = {}; + struct path from_path __free(path_put) = {}; + struct filename *to_name __free(putname) = NULL; + struct filename *from_name __free(putname) = NULL; + unsigned int lflags, uflags; + enum mnt_tree_flags_t mflags = 0; int ret = 0; if (!may_mount()) @@ -4242,43 +4548,53 @@ SYSCALL_DEFINE5(move_mount, (MOVE_MOUNT_BENEATH | MOVE_MOUNT_SET_GROUP)) return -EINVAL; - /* If someone gives a pathname, they aren't permitted to move - * from an fd that requires unmount as we can't get at the flag - * to clear it afterwards. - */ + if (flags & MOVE_MOUNT_SET_GROUP) mflags |= MNT_TREE_PROPAGATION; + if (flags & MOVE_MOUNT_BENEATH) mflags |= MNT_TREE_BENEATH; + lflags = 0; if (flags & MOVE_MOUNT_F_SYMLINKS) lflags |= LOOKUP_FOLLOW; if (flags & MOVE_MOUNT_F_AUTOMOUNTS) lflags |= LOOKUP_AUTOMOUNT; - if (flags & MOVE_MOUNT_F_EMPTY_PATH) lflags |= LOOKUP_EMPTY; - - ret = user_path_at(from_dfd, from_pathname, lflags, &from_path); - if (ret < 0) - return ret; + uflags = 0; + if (flags & MOVE_MOUNT_F_EMPTY_PATH) uflags = AT_EMPTY_PATH; + from_name = getname_maybe_null(from_pathname, uflags); + if (IS_ERR(from_name)) + return PTR_ERR(from_name); lflags = 0; if (flags & MOVE_MOUNT_T_SYMLINKS) lflags |= LOOKUP_FOLLOW; if (flags & MOVE_MOUNT_T_AUTOMOUNTS) lflags |= LOOKUP_AUTOMOUNT; - if (flags & MOVE_MOUNT_T_EMPTY_PATH) lflags |= LOOKUP_EMPTY; + uflags = 0; + if (flags & MOVE_MOUNT_T_EMPTY_PATH) uflags = AT_EMPTY_PATH; + to_name = getname_maybe_null(to_pathname, uflags); + if (IS_ERR(to_name)) + return PTR_ERR(to_name); + + if (!to_name && to_dfd >= 0) { + CLASS(fd_raw, f_to)(to_dfd); + if (fd_empty(f_to)) + return -EBADF; + + to_path = fd_file(f_to)->f_path; + path_get(&to_path); + } else { + ret = filename_lookup(to_dfd, to_name, lflags, &to_path, NULL); + if (ret) + return ret; + } - ret = user_path_at(to_dfd, to_pathname, lflags, &to_path); - if (ret < 0) - goto out_from; + if (!from_name && from_dfd >= 0) { + CLASS(fd_raw, f_from)(from_dfd); + if (fd_empty(f_from)) + return -EBADF; - ret = security_move_mount(&from_path, &to_path); - if (ret < 0) - goto out_to; + return vfs_move_mount(&fd_file(f_from)->f_path, &to_path, mflags); + } - if (flags & MOVE_MOUNT_SET_GROUP) - ret = do_set_group(&from_path, &to_path); - else - ret = do_move_mount(&from_path, &to_path, - (flags & MOVE_MOUNT_BENEATH)); + ret = filename_lookup(from_dfd, from_name, lflags, &from_path, NULL); + if (ret) + return ret; -out_to: - path_put(&to_path); -out_from: - path_put(&from_path); - return ret; + return vfs_move_mount(&from_path, &to_path, mflags); } /* @@ -4336,7 +4652,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, { struct path new, old, root; struct mount *new_mnt, *root_mnt, *old_mnt, *root_parent, *ex_parent; - struct mountpoint *old_mp, *root_mp; + struct pinned_mountpoint old_mp = {}; int error; if (!may_mount()) @@ -4357,9 +4673,8 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, goto out2; get_fs_root(current->fs, &root); - old_mp = lock_mount(&old); - error = PTR_ERR(old_mp); - if (IS_ERR(old_mp)) + error = lock_mount(&old, &old_mp); + if (error) goto out3; error = -EINVAL; @@ -4386,11 +4701,11 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, if (!path_mounted(&root)) 
goto out4; /* not a mountpoint */ if (!mnt_has_parent(root_mnt)) - goto out4; /* not attached */ + goto out4; /* absolute root */ if (!path_mounted(&new)) goto out4; /* not a mountpoint */ if (!mnt_has_parent(new_mnt)) - goto out4; /* not attached */ + goto out4; /* absolute root */ /* make sure we can reach put_old from new_root */ if (!is_path_reachable(old_mnt, old.dentry, &new)) goto out4; @@ -4399,27 +4714,25 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, goto out4; lock_mount_hash(); umount_mnt(new_mnt); - root_mp = unhash_mnt(root_mnt); /* we'll need its mountpoint */ if (root_mnt->mnt.mnt_flags & MNT_LOCKED) { new_mnt->mnt.mnt_flags |= MNT_LOCKED; root_mnt->mnt.mnt_flags &= ~MNT_LOCKED; } - /* mount old root on put_old */ - attach_mnt(root_mnt, old_mnt, old_mp, false); /* mount new_root on / */ - attach_mnt(new_mnt, root_parent, root_mp, false); - mnt_add_count(root_parent, -1); + attach_mnt(new_mnt, root_parent, root_mnt->mnt_mp); + umount_mnt(root_mnt); + /* mount old root on put_old */ + attach_mnt(root_mnt, old_mnt, old_mp.mp); touch_mnt_namespace(current->nsproxy->mnt_ns); /* A moved mount should not expire automatically */ list_del_init(&new_mnt->mnt_expire); - put_mountpoint(root_mp); unlock_mount_hash(); + mnt_notify_add(root_mnt); + mnt_notify_add(new_mnt); chroot_fs_refs(&root, &new); error = 0; out4: - unlock_mount(old_mp); - if (!error) - mntput_no_expire(ex_parent); + unlock_mount(&old_mp); out3: path_put(&root); out2: @@ -4458,11 +4771,10 @@ static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt) return -EINVAL; /* - * Once a mount has been idmapped we don't allow it to change its - * mapping. It makes things simpler and callers can just create - * another bind-mount they can idmap if they want to. + * We only allow an mount to change it's idmapping if it has + * never been accessible to userspace. */ - if (is_idmapped_mnt(m)) + if (!(kattr->kflags & MOUNT_KATTR_IDMAP_REPLACE) && is_idmapped_mnt(m)) return -EPERM; /* The underlying filesystem doesn't support idmapped mounts yet. */ @@ -4522,7 +4834,7 @@ static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt) break; } - if (!kattr->recurse) + if (!(kattr->kflags & MOUNT_KATTR_RECURSE)) return 0; } @@ -4552,18 +4864,16 @@ static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt) static void do_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt) { + struct mnt_idmap *old_idmap; + if (!kattr->mnt_idmap) return; - /* - * Pairs with smp_load_acquire() in mnt_idmap(). - * - * Since we only allow a mount to change the idmapping once and - * verified this in can_idmap_mount() we know that the mount has - * @nop_mnt_idmap attached to it. So there's no need to drop any - * references. - */ + old_idmap = mnt_idmap(&mnt->mnt); + + /* Pairs with smp_load_acquire() in mnt_idmap(). 
*/ smp_store_release(&mnt->mnt.mnt_idmap, mnt_idmap_get(kattr->mnt_idmap)); + mnt_idmap_put(old_idmap); } static void mount_setattr_commit(struct mount_kattr *kattr, struct mount *mnt) @@ -4583,7 +4893,7 @@ static void mount_setattr_commit(struct mount_kattr *kattr, struct mount *mnt) if (kattr->propagation) change_mnt_propagation(m, kattr->propagation); - if (!kattr->recurse) + if (!(kattr->kflags & MOUNT_KATTR_RECURSE)) break; } touch_mnt_namespace(mnt->mnt_ns); @@ -4613,7 +4923,7 @@ static int do_mount_setattr(struct path *path, struct mount_kattr *kattr) */ namespace_lock(); if (kattr->propagation == MS_SHARED) { - err = invent_group_ids(mnt, kattr->recurse); + err = invent_group_ids(mnt, kattr->kflags & MOUNT_KATTR_RECURSE); if (err) { namespace_unlock(); return err; @@ -4624,22 +4934,7 @@ static int do_mount_setattr(struct path *path, struct mount_kattr *kattr) err = -EINVAL; lock_mount_hash(); - /* Ensure that this isn't anything purely vfs internal. */ - if (!is_mounted(&mnt->mnt)) - goto out; - - /* - * If this is an attached mount make sure it's located in the callers - * mount namespace. If it's not don't let the caller interact with it. - * - * If this mount doesn't have a parent it's most often simply a - * detached mount with an anonymous mount namespace. IOW, something - * that's simply not attached yet. But there are apparently also users - * that do change mount properties on the rootfs itself. That obviously - * neither has a parent nor is it a detached mount so we cannot - * unconditionally check for detached mounts. - */ - if ((mnt_has_parent(mnt) || !is_anon_ns(mnt->mnt_ns)) && !check_mnt(mnt)) + if (!anon_ns_root(mnt) && !check_mnt(mnt)) goto out; /* @@ -4664,7 +4959,7 @@ out: } static int build_mount_idmapped(const struct mount_attr *attr, size_t usize, - struct mount_kattr *kattr, unsigned int flags) + struct mount_kattr *kattr) { struct ns_common *ns; struct user_namespace *mnt_userns; @@ -4672,13 +4967,23 @@ static int build_mount_idmapped(const struct mount_attr *attr, size_t usize, if (!((attr->attr_set | attr->attr_clr) & MOUNT_ATTR_IDMAP)) return 0; - /* - * We currently do not support clearing an idmapped mount. If this ever - * is a use-case we can revisit this but for now let's keep it simple - * and not allow it. - */ - if (attr->attr_clr & MOUNT_ATTR_IDMAP) - return -EINVAL; + if (attr->attr_clr & MOUNT_ATTR_IDMAP) { + /* + * We can only remove an idmapping if it's never been + * exposed to userspace. + */ + if (!(kattr->kflags & MOUNT_KATTR_IDMAP_REPLACE)) + return -EINVAL; + + /* + * Removal of idmappings is equivalent to setting + * nop_mnt_idmap. 
+ */ + if (!(attr->attr_set & MOUNT_ATTR_IDMAP)) { + kattr->mnt_idmap = &nop_mnt_idmap; + return 0; + } + } if (attr->userns_fd > INT_MAX) return -EINVAL; @@ -4715,22 +5020,8 @@ static int build_mount_idmapped(const struct mount_attr *attr, size_t usize, } static int build_mount_kattr(const struct mount_attr *attr, size_t usize, - struct mount_kattr *kattr, unsigned int flags) + struct mount_kattr *kattr) { - unsigned int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW; - - if (flags & AT_NO_AUTOMOUNT) - lookup_flags &= ~LOOKUP_AUTOMOUNT; - if (flags & AT_SYMLINK_NOFOLLOW) - lookup_flags &= ~LOOKUP_FOLLOW; - if (flags & AT_EMPTY_PATH) - lookup_flags |= LOOKUP_EMPTY; - - *kattr = (struct mount_kattr) { - .lookup_flags = lookup_flags, - .recurse = !!(flags & AT_RECURSIVE), - }; - if (attr->propagation & ~MOUNT_SETATTR_PROPAGATION_FLAGS) return -EINVAL; if (hweight32(attr->propagation & MOUNT_SETATTR_PROPAGATION_FLAGS) > 1) @@ -4778,35 +5069,28 @@ static int build_mount_kattr(const struct mount_attr *attr, size_t usize, return -EINVAL; } - return build_mount_idmapped(attr, usize, kattr, flags); + return build_mount_idmapped(attr, usize, kattr); } static void finish_mount_kattr(struct mount_kattr *kattr) { - put_user_ns(kattr->mnt_userns); - kattr->mnt_userns = NULL; + if (kattr->mnt_userns) { + put_user_ns(kattr->mnt_userns); + kattr->mnt_userns = NULL; + } if (kattr->mnt_idmap) mnt_idmap_put(kattr->mnt_idmap); } -SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path, - unsigned int, flags, struct mount_attr __user *, uattr, - size_t, usize) +static int wants_mount_setattr(struct mount_attr __user *uattr, size_t usize, + struct mount_kattr *kattr) { - int err; - struct path target; + int ret; struct mount_attr attr; - struct mount_kattr kattr; BUILD_BUG_ON(sizeof(struct mount_attr) != MOUNT_ATTR_SIZE_VER0); - if (flags & ~(AT_EMPTY_PATH | - AT_RECURSIVE | - AT_SYMLINK_NOFOLLOW | - AT_NO_AUTOMOUNT)) - return -EINVAL; - if (unlikely(usize > PAGE_SIZE)) return -E2BIG; if (unlikely(usize < MOUNT_ATTR_SIZE_VER0)) @@ -4815,18 +5099,54 @@ SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path, if (!may_mount()) return -EPERM; - err = copy_struct_from_user(&attr, sizeof(attr), uattr, usize); - if (err) - return err; + ret = copy_struct_from_user(&attr, sizeof(attr), uattr, usize); + if (ret) + return ret; /* Don't bother walking through the mounts if this is a nop. */ if (attr.attr_set == 0 && attr.attr_clr == 0 && attr.propagation == 0) - return 0; + return 0; /* Tell caller to not bother. 
*/ - err = build_mount_kattr(&attr, usize, &kattr, flags); - if (err) + ret = build_mount_kattr(&attr, usize, kattr); + if (ret < 0) + return ret; + + return 1; +} + +SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path, + unsigned int, flags, struct mount_attr __user *, uattr, + size_t, usize) +{ + int err; + struct path target; + struct mount_kattr kattr; + unsigned int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW; + + if (flags & ~(AT_EMPTY_PATH | + AT_RECURSIVE | + AT_SYMLINK_NOFOLLOW | + AT_NO_AUTOMOUNT)) + return -EINVAL; + + if (flags & AT_NO_AUTOMOUNT) + lookup_flags &= ~LOOKUP_AUTOMOUNT; + if (flags & AT_SYMLINK_NOFOLLOW) + lookup_flags &= ~LOOKUP_FOLLOW; + if (flags & AT_EMPTY_PATH) + lookup_flags |= LOOKUP_EMPTY; + + kattr = (struct mount_kattr) { + .lookup_flags = lookup_flags, + }; + + if (flags & AT_RECURSIVE) + kattr.kflags |= MOUNT_KATTR_RECURSE; + + err = wants_mount_setattr(uattr, usize, &kattr); + if (err <= 0) return err; err = user_path_at(dfd, path, kattr.lookup_flags, &target); @@ -4838,6 +5158,45 @@ SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path, return err; } +SYSCALL_DEFINE5(open_tree_attr, int, dfd, const char __user *, filename, + unsigned, flags, struct mount_attr __user *, uattr, + size_t, usize) +{ + struct file __free(fput) *file = NULL; + int fd; + + if (!uattr && usize) + return -EINVAL; + + file = vfs_open_tree(dfd, filename, flags); + if (IS_ERR(file)) + return PTR_ERR(file); + + if (uattr) { + int ret; + struct mount_kattr kattr = {}; + + kattr.kflags = MOUNT_KATTR_IDMAP_REPLACE; + if (flags & AT_RECURSIVE) + kattr.kflags |= MOUNT_KATTR_RECURSE; + + ret = wants_mount_setattr(uattr, usize, &kattr); + if (ret > 0) { + ret = do_mount_setattr(&file->f_path, &kattr); + finish_mount_kattr(&kattr); + } + if (ret) + return ret; + } + + fd = get_unused_fd_flags(flags & O_CLOEXEC); + if (fd < 0) + return fd; + + fd_install(fd, no_free_ptr(file)); + return fd; +} + int show_path(struct seq_file *m, struct dentry *root) { if (root->d_sb->s_op->show_path) @@ -4861,10 +5220,13 @@ struct kstatmount { struct statmount __user *buf; size_t bufsize; struct vfsmount *mnt; + struct mnt_idmap *idmap; u64 mask; struct path root; - struct statmount sm; struct seq_file seq; + + /* Must be last --ends in a flexible-array member. */ + struct statmount sm; }; static u64 mnt_to_attr_flags(struct vfsmount *mnt) @@ -4936,7 +5298,7 @@ static void statmount_mnt_basic(struct kstatmount *s) s->sm.mnt_parent_id_old = m->mnt_parent->mnt_id; s->sm.mnt_attr = mnt_to_attr_flags(&m->mnt); s->sm.mnt_propagation = mnt_to_propagation_flags(m); - s->sm.mnt_peer_group = IS_MNT_SHARED(m) ? m->mnt_group_id : 0; + s->sm.mnt_peer_group = m->mnt_group_id; s->sm.mnt_master = IS_MNT_SLAVE(m) ? 
m->mnt_master->mnt_group_id : 0; } @@ -5017,7 +5379,7 @@ static int statmount_sb_source(struct kstatmount *s, struct seq_file *seq) seq->buf[seq->count] = '\0'; seq->count = start; seq_commit(seq, string_unescape_inplace(seq->buf + start, UNESCAPE_OCTAL)); - } else if (r->mnt_devname) { + } else { seq_puts(seq, r->mnt_devname); } return 0; @@ -5033,26 +5395,29 @@ static int statmount_mnt_opts(struct kstatmount *s, struct seq_file *seq) { struct vfsmount *mnt = s->mnt; struct super_block *sb = mnt->mnt_sb; + size_t start = seq->count; int err; - if (sb->s_op->show_options) { - size_t start = seq->count; + err = security_sb_show_options(seq, sb); + if (err) + return err; + if (sb->s_op->show_options) { err = sb->s_op->show_options(seq, mnt->mnt_root); if (err) return err; + } - if (unlikely(seq_has_overflowed(seq))) - return -EAGAIN; + if (unlikely(seq_has_overflowed(seq))) + return -EAGAIN; - if (seq->count == start) - return 0; + if (seq->count == start) + return 0; - /* skip leading comma */ - memmove(seq->buf + start, seq->buf + start + 1, - seq->count - start - 1); - seq->count--; - } + /* skip leading comma */ + memmove(seq->buf + start, seq->buf + start + 1, + seq->count - start - 1); + seq->count--; return 0; } @@ -5127,47 +5492,101 @@ static int statmount_opt_sec_array(struct kstatmount *s, struct seq_file *seq) return 0; } +static inline int statmount_mnt_uidmap(struct kstatmount *s, struct seq_file *seq) +{ + int ret; + + ret = statmount_mnt_idmap(s->idmap, seq, true); + if (ret < 0) + return ret; + + s->sm.mnt_uidmap_num = ret; + /* + * Always raise STATMOUNT_MNT_UIDMAP even if there are no valid + * mappings. This allows userspace to distinguish between a + * non-idmapped mount and an idmapped mount where none of the + * individual mappings are valid in the caller's idmapping. + */ + if (is_valid_mnt_idmap(s->idmap)) + s->sm.mask |= STATMOUNT_MNT_UIDMAP; + return 0; +} + +static inline int statmount_mnt_gidmap(struct kstatmount *s, struct seq_file *seq) +{ + int ret; + + ret = statmount_mnt_idmap(s->idmap, seq, false); + if (ret < 0) + return ret; + + s->sm.mnt_gidmap_num = ret; + /* + * Always raise STATMOUNT_MNT_GIDMAP even if there are no valid + * mappings. This allows userspace to distinguish between a + * non-idmapped mount and an idmapped mount where none of the + * individual mappings are valid in the caller's idmapping. 
+ */ + if (is_valid_mnt_idmap(s->idmap)) + s->sm.mask |= STATMOUNT_MNT_GIDMAP; + return 0; +} + static int statmount_string(struct kstatmount *s, u64 flag) { int ret = 0; size_t kbufsize; struct seq_file *seq = &s->seq; struct statmount *sm = &s->sm; - u32 start = seq->count; + u32 start, *offp; + + /* Reserve an empty string at the beginning for any unset offsets */ + if (!seq->count) + seq_putc(seq, 0); + + start = seq->count; switch (flag) { case STATMOUNT_FS_TYPE: - sm->fs_type = start; + offp = &sm->fs_type; ret = statmount_fs_type(s, seq); break; case STATMOUNT_MNT_ROOT: - sm->mnt_root = start; + offp = &sm->mnt_root; ret = statmount_mnt_root(s, seq); break; case STATMOUNT_MNT_POINT: - sm->mnt_point = start; + offp = &sm->mnt_point; ret = statmount_mnt_point(s, seq); break; case STATMOUNT_MNT_OPTS: - sm->mnt_opts = start; + offp = &sm->mnt_opts; ret = statmount_mnt_opts(s, seq); break; case STATMOUNT_OPT_ARRAY: - sm->opt_array = start; + offp = &sm->opt_array; ret = statmount_opt_array(s, seq); break; case STATMOUNT_OPT_SEC_ARRAY: - sm->opt_sec_array = start; + offp = &sm->opt_sec_array; ret = statmount_opt_sec_array(s, seq); break; case STATMOUNT_FS_SUBTYPE: - sm->fs_subtype = start; + offp = &sm->fs_subtype; statmount_fs_subtype(s, seq); break; case STATMOUNT_SB_SOURCE: - sm->sb_source = start; + offp = &sm->sb_source; ret = statmount_sb_source(s, seq); break; + case STATMOUNT_MNT_UIDMAP: + sm->mnt_uidmap = start; + ret = statmount_mnt_uidmap(s, seq); + break; + case STATMOUNT_MNT_GIDMAP: + sm->mnt_gidmap = start; + ret = statmount_mnt_gidmap(s, seq); + break; default: WARN_ON_ONCE(true); return -EINVAL; @@ -5193,6 +5612,7 @@ static int statmount_string(struct kstatmount *s, u64 flag) seq->buf[seq->count++] = '\0'; sm->mask |= flag; + *offp = start; return 0; } @@ -5242,7 +5662,7 @@ static int grab_requested_root(struct mnt_namespace *ns, struct path *root) * We have to find the first mount in our ns and use that, however it * may not exist, so handle that properly. */ - if (RB_EMPTY_ROOT(&ns->mounts)) + if (mnt_ns_empty(ns)) return -ENOENT; first = child = ns->root; @@ -5259,6 +5679,23 @@ static int grab_requested_root(struct mnt_namespace *ns, struct path *root) return 0; } +/* This must be updated whenever a new flag is added */ +#define STATMOUNT_SUPPORTED (STATMOUNT_SB_BASIC | \ + STATMOUNT_MNT_BASIC | \ + STATMOUNT_PROPAGATE_FROM | \ + STATMOUNT_MNT_ROOT | \ + STATMOUNT_MNT_POINT | \ + STATMOUNT_FS_TYPE | \ + STATMOUNT_MNT_NS_ID | \ + STATMOUNT_MNT_OPTS | \ + STATMOUNT_FS_SUBTYPE | \ + STATMOUNT_SB_SOURCE | \ + STATMOUNT_OPT_ARRAY | \ + STATMOUNT_OPT_SEC_ARRAY | \ + STATMOUNT_SUPPORTED_MASK | \ + STATMOUNT_MNT_UIDMAP | \ + STATMOUNT_MNT_GIDMAP) + static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id, struct mnt_namespace *ns) { @@ -5267,7 +5704,7 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id, int err; /* Has the namespace already been emptied? */ - if (mnt_ns_id && RB_EMPTY_ROOT(&ns->mounts)) + if (mnt_ns_id && mnt_ns_empty(ns)) return -ENOENT; s->mnt = lookup_mnt_in_ns(mnt_id, ns); @@ -5292,12 +5729,29 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id, return err; s->root = root; - if (s->mask & STATMOUNT_SB_BASIC) - statmount_sb_basic(s); + /* + * Note that mount properties in mnt->mnt_flags, mnt->mnt_idmap + * can change concurrently as we only hold the read-side of the + * namespace semaphore and mount properties may change with only + * the mount lock held. 
+ * + * We could sample the mount lock sequence counter to detect + * those changes and retry. But it's not worth it. Worst that + * happens is that the mnt->mnt_idmap pointer is already changed + * while mnt->mnt_flags isn't or vica versa. So what. + * + * Both mnt->mnt_flags and mnt->mnt_idmap are set and retrieved + * via READ_ONCE()/WRITE_ONCE() and guard against theoretical + * torn read/write. That's all we care about right now. + */ + s->idmap = mnt_idmap(s->mnt); if (s->mask & STATMOUNT_MNT_BASIC) statmount_mnt_basic(s); + if (s->mask & STATMOUNT_SB_BASIC) + statmount_sb_basic(s); + if (s->mask & STATMOUNT_PROPAGATE_FROM) statmount_propagate_from(s); @@ -5325,12 +5779,26 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id, if (!err && s->mask & STATMOUNT_SB_SOURCE) err = statmount_string(s, STATMOUNT_SB_SOURCE); + if (!err && s->mask & STATMOUNT_MNT_UIDMAP) + err = statmount_string(s, STATMOUNT_MNT_UIDMAP); + + if (!err && s->mask & STATMOUNT_MNT_GIDMAP) + err = statmount_string(s, STATMOUNT_MNT_GIDMAP); + if (!err && s->mask & STATMOUNT_MNT_NS_ID) statmount_mnt_ns_id(s, ns); + if (!err && s->mask & STATMOUNT_SUPPORTED_MASK) { + s->sm.mask |= STATMOUNT_SUPPORTED_MASK; + s->sm.supported_mask = STATMOUNT_SUPPORTED; + } + if (err) return err; + /* Are there bits in the return mask not present in STATMOUNT_SUPPORTED? */ + WARN_ON_ONCE(~STATMOUNT_SUPPORTED & s->sm.mask); + return 0; } @@ -5348,7 +5816,8 @@ static inline bool retry_statmount(const long ret, size_t *seq_size) #define STATMOUNT_STRING_REQ (STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT | \ STATMOUNT_FS_TYPE | STATMOUNT_MNT_OPTS | \ STATMOUNT_FS_SUBTYPE | STATMOUNT_SB_SOURCE | \ - STATMOUNT_OPT_ARRAY | STATMOUNT_OPT_SEC_ARRAY) + STATMOUNT_OPT_ARRAY | STATMOUNT_OPT_SEC_ARRAY | \ + STATMOUNT_MNT_UIDMAP | STATMOUNT_MNT_GIDMAP) static int prepare_kstatmount(struct kstatmount *ks, struct mnt_id_req *kreq, struct statmount __user *buf, size_t bufsize, @@ -5525,9 +5994,9 @@ static ssize_t do_listmount(struct mnt_namespace *ns, u64 mnt_parent_id, if (!last_mnt_id) { if (reverse) - first = node_to_mount(rb_last(&ns->mounts)); + first = node_to_mount(ns->mnt_last_node); else - first = node_to_mount(rb_first(&ns->mounts)); + first = node_to_mount(ns->mnt_first_node); } else { if (reverse) first = mnt_find_id_at_reverse(ns, last_mnt_id - 1); @@ -5594,6 +6063,10 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req, !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN)) return -ENOENT; + /* + * We only need to guard against mount topology changes as + * listmount() doesn't care about any mount properties. 
+ */ scoped_guard(rwsem_read, &namespace_sem) ret = do_listmount(ns, kreq.mnt_id, last_mnt_id, kmnt_ids, nr_mnt_ids, (flags & LISTMOUNT_REVERSE)); @@ -5617,9 +6090,11 @@ static void __init init_mount_tree(void) if (IS_ERR(mnt)) panic("Can't create rootfs"); - ns = alloc_mnt_ns(&init_user_ns, false); + ns = alloc_mnt_ns(&init_user_ns, true); if (IS_ERR(ns)) panic("Can't allocate initial namespace"); + ns->seq = atomic64_inc_return(&mnt_ns_seq); + ns->ns.inum = PROC_MNT_INIT_INO; m = real_mount(mnt); ns->root = m; ns->nr_mounts = 1; @@ -5629,7 +6104,6 @@ static void __init init_mount_tree(void) root.mnt = mnt; root.dentry = mnt->mnt_root; - mnt->mnt_flags |= MNT_LOCKED; set_fs_pwd(current->fs, &root); set_fs_root(current->fs, &root); @@ -5676,8 +6150,12 @@ void put_mnt_ns(struct mnt_namespace *ns) { if (!refcount_dec_and_test(&ns->ns.count)) return; - drop_collected_mounts(&ns->root->mnt); - free_mnt_ns(ns); + namespace_lock(); + emptied_ns = ns; + lock_mount_hash(); + umount_tree(ns->root, 0); + unlock_mount_hash(); + namespace_unlock(); } struct vfsmount *kern_mount(struct file_system_type *type) @@ -5927,7 +6405,7 @@ const struct proc_ns_operations mntns_operations = { }; #ifdef CONFIG_SYSCTL -static struct ctl_table fs_namespace_sysctls[] = { +static const struct ctl_table fs_namespace_sysctls[] = { { .procname = "mount-max", .data = &sysctl_mount_max, |
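The diff is easier to follow with the userspace view of the reworked paths in mind. The sketch below is an editorial illustration, not part of the patch: it exercises vfs_open_tree()/open_detached_copy() via open_tree(OPEN_TREE_CLONE) and then the reworked do_move_mount() on an anonymous-namespace root via move_mount(). It assumes a libc that exposes SYS_open_tree/SYS_move_mount and a <linux/mount.h> with the OPEN_TREE_* and MOVE_MOUNT_* flags; /mnt/src and /mnt/dst are placeholder paths.

/*
 * Illustrative only: clone /mnt/src as a detached (anonymous) tree,
 * then attach it at /mnt/dst. Requires CAP_SYS_ADMIN in the caller's
 * user namespace.
 */
#define _GNU_SOURCE
#include <fcntl.h>          /* AT_FDCWD; AT_RECURSIVE needs a recent glibc */
#include <linux/mount.h>    /* OPEN_TREE_CLONE, MOVE_MOUNT_F_EMPTY_PATH */
#include <stdio.h>
#include <stdlib.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	int fd_tree;

	/* Recursive detached copy; lands in an anonymous mount namespace. */
	fd_tree = syscall(SYS_open_tree, AT_FDCWD, "/mnt/src",
			  OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC | AT_RECURSIVE);
	if (fd_tree < 0) {
		perror("open_tree");
		return EXIT_FAILURE;
	}

	/*
	 * An empty "from" path plus MOVE_MOUNT_F_EMPTY_PATH selects fd_tree
	 * itself, i.e. the fd-only branch the rewritten move_mount() takes
	 * before calling vfs_move_mount().
	 */
	if (syscall(SYS_move_mount, fd_tree, "", AT_FDCWD, "/mnt/dst",
		    MOVE_MOUNT_F_EMPTY_PATH) < 0) {
		perror("move_mount");
		close(fd_tree);
		return EXIT_FAILURE;
	}

	close(fd_tree);
	return EXIT_SUCCESS;
}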
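The new open_tree_attr() syscall added above is vfs_open_tree() plus wants_mount_setattr()/do_mount_setattr() in one step, with MOUNT_KATTR_IDMAP_REPLACE set, so attributes (including an idmapping) are applied before the tree is ever exposed to userspace. A minimal sketch, assuming no libc wrapper exists yet; the syscall number is a guess for x86_64 (467) and must be checked against your unistd.h, and userns_fd is a user-namespace fd obtained elsewhere:

/* Illustrative only: detached copy with an idmapping applied atomically. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/mount.h>    /* struct mount_attr, MOUNT_ATTR_IDMAP */
#include <sys/syscall.h>
#include <unistd.h>

#ifndef __NR_open_tree_attr
#define __NR_open_tree_attr 467	/* assumption: x86_64; verify locally */
#endif

static int open_idmapped_tree(const char *path, int userns_fd)
{
	struct mount_attr attr = {
		.attr_set  = MOUNT_ATTR_IDMAP,
		.userns_fd = userns_fd,	/* assumed set up by the caller */
	};

	/*
	 * Unlike open_tree() + mount_setattr(), the tree never exists
	 * without the idmapping, which is what lets the kernel permit
	 * idmap replacement here (MOUNT_KATTR_IDMAP_REPLACE).
	 */
	return syscall(__NR_open_tree_attr, AT_FDCWD, path,
		       OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC | AT_RECURSIVE,
		       &attr, sizeof(attr));
}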
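The statmount() side of the series can be exercised as below. Again an illustrative sketch rather than anything from the patch: it assumes a <linux/mount.h> that already carries STATMOUNT_MNT_UIDMAP and the mnt_uidmap/mnt_uidmap_num fields added above, and that the caller holds a 64-bit mount ID (e.g. from statx() with STATX_MNT_ID_UNIQUE). Per statmount_mnt_uidmap() above, the flag is raised only for idmapped mounts, even when none of the individual mappings are valid in the caller's idmapping.

/* Illustrative only: dump the uid mappings statmount() reports. */
#define _GNU_SOURCE
#include <linux/mount.h>    /* struct mnt_id_req, struct statmount */
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef __NR_statmount
#define __NR_statmount 457	/* assumption: x86_64; verify locally */
#endif

static void print_uidmap(__u64 mnt_id)
{
	static char buf[1 << 16] __attribute__((aligned(8)));
	struct statmount *sm = (struct statmount *)buf;
	struct mnt_id_req req = {
		.size   = MNT_ID_REQ_SIZE_VER0,
		.mnt_id = mnt_id,	/* 64-bit ID, not the old 32-bit one */
		.param  = STATMOUNT_MNT_UIDMAP,
	};
	const char *s;
	__u32 i;

	if (syscall(__NR_statmount, &req, sm, sizeof(buf), 0) < 0) {
		perror("statmount");
		return;
	}

	if (!(sm->mask & STATMOUNT_MNT_UIDMAP)) {
		printf("mount is not idmapped\n");
		return;
	}

	/* mnt_uidmap_num NUL-separated mapping strings follow at the offset. */
	s = sm->str + sm->mnt_uidmap;
	for (i = 0; i < sm->mnt_uidmap_num; i++) {
		printf("uidmap: %s\n", s);
		s += strlen(s) + 1;
	}
}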