diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2015-04-18 18:20:31 +0300 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2015-04-18 18:20:31 +0300 |
commit | 8f502d5b9e3362971f58dad5d468f070340336e1 (patch) | |
tree | fa819b5acdfc43c01e3696991ffe90836f265a34 | |
parent | 06a60deca87dba8e2c186ea7f12ea87d6785188e (diff) | |
parent | e0c9c0afd2fc958ffa34b697972721d81df8a56f (diff) | |
download | linux-8f502d5b9e3362971f58dad5d468f070340336e1.tar.xz |
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace
Pull usernamespace mount fixes from Eric Biederman:
"Way back in October Andrey Vagin reported that umount(MNT_DETACH)
could be used to defeat MNT_LOCKED. As I worked to fix this I
discovered that combined with mount propagation and an appropriate
selection of shared subtrees a reference to a directory on an
unmounted filesystem is not necessary.
That MNT_DETACH is allowed in user namespace in a form that can break
MNT_LOCKED comes from my early misunderstanding what MNT_DETACH does.
To avoid breaking existing userspace the conflict between MNT_DETACH
and MNT_LOCKED is fixed by leaving mounts that are locked to their
parents in the mount hash table until the last reference goes away.
While investigating this issue I also found an issue with
__detach_mounts. The code was unnecessarily and incorrectly
triggering mount propagation. Resulting in too many mounts going away
when a directory is deleted, and too many cpu cycles are burned while
doing that.
Looking some more I realized that __detach_mounts by only keeping
mounts connected that were MNT_LOCKED it had the potential to still
leak information so I tweaked the code to keep everything locked
together that possibly could be.
This code was almost ready last cycle but Al invented fs_pin which
slightly simplifies this code but required rewrites and retesting, and
I have not been in top form for a while so it took me a while to get
all of that done. Similiarly this pull request is late because I have
been feeling absolutely miserable all week.
The issue of being able to escape a bind mount has not yet been
addressed, as the fixes are not yet mature"
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace:
mnt: Update detach_mounts to leave mounts connected
mnt: Fix the error check in __detach_mounts
mnt: Honor MNT_LOCKED when detaching mounts
fs_pin: Allow for the possibility that m_list or s_list go unused.
mnt: Factor umount_mnt from umount_tree
mnt: Factor out unhash_mnt from detach_mnt and umount_tree
mnt: Fail collect_mounts when applied to unmounted mounts
mnt: Don't propagate unmounts to locked mounts
mnt: On an unmount propagate clearing of MNT_LOCKED
mnt: Delay removal from the mount hash.
mnt: Add MNT_UMOUNT flag
mnt: In umount_tree reuse mnt_list instead of mnt_hash
mnt: Don't propagate umounts in __detach_mounts
mnt: Improve the umount_tree flags
mnt: Use hlist_move_list in namespace_unlock
-rw-r--r-- | fs/fs_pin.c | 4 | ||||
-rw-r--r-- | fs/namespace.c | 142 | ||||
-rw-r--r-- | fs/pnode.c | 60 | ||||
-rw-r--r-- | fs/pnode.h | 7 | ||||
-rw-r--r-- | include/linux/fs_pin.h | 2 | ||||
-rw-r--r-- | include/linux/mount.h | 1 |
6 files changed, 159 insertions, 57 deletions
diff --git a/fs/fs_pin.c b/fs/fs_pin.c index b06c98796afb..611b5408f6ec 100644 --- a/fs/fs_pin.c +++ b/fs/fs_pin.c @@ -9,8 +9,8 @@ static DEFINE_SPINLOCK(pin_lock); void pin_remove(struct fs_pin *pin) { spin_lock(&pin_lock); - hlist_del(&pin->m_list); - hlist_del(&pin->s_list); + hlist_del_init(&pin->m_list); + hlist_del_init(&pin->s_list); spin_unlock(&pin_lock); spin_lock_irq(&pin->wait.lock); pin->done = 1; diff --git a/fs/namespace.c b/fs/namespace.c index 82ef1405260e..1f4f9dac6e5a 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -632,14 +632,17 @@ struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry) */ struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry) { - struct mount *p, *res; - res = p = __lookup_mnt(mnt, dentry); + struct mount *p, *res = NULL; + p = __lookup_mnt(mnt, dentry); if (!p) goto out; + if (!(p->mnt.mnt_flags & MNT_UMOUNT)) + res = p; hlist_for_each_entry_continue(p, mnt_hash) { if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry) break; - res = p; + if (!(p->mnt.mnt_flags & MNT_UMOUNT)) + res = p; } out: return res; @@ -795,10 +798,8 @@ static void __touch_mnt_namespace(struct mnt_namespace *ns) /* * vfsmount lock must be held for write */ -static void detach_mnt(struct mount *mnt, struct path *old_path) +static void unhash_mnt(struct mount *mnt) { - old_path->dentry = mnt->mnt_mountpoint; - old_path->mnt = &mnt->mnt_parent->mnt; mnt->mnt_parent = mnt; mnt->mnt_mountpoint = mnt->mnt.mnt_root; list_del_init(&mnt->mnt_child); @@ -811,6 +812,26 @@ static void detach_mnt(struct mount *mnt, struct path *old_path) /* * vfsmount lock must be held for write */ +static void detach_mnt(struct mount *mnt, struct path *old_path) +{ + old_path->dentry = mnt->mnt_mountpoint; + old_path->mnt = &mnt->mnt_parent->mnt; + unhash_mnt(mnt); +} + +/* + * vfsmount lock must be held for write + */ +static void umount_mnt(struct mount *mnt) +{ + /* old mountpoint will be dropped when we can do that */ + mnt->mnt_ex_mountpoint = mnt->mnt_mountpoint; + unhash_mnt(mnt); +} + +/* + * vfsmount lock must be held for write + */ void mnt_set_mountpoint(struct mount *mnt, struct mountpoint *mp, struct mount *child_mnt) @@ -1078,6 +1099,13 @@ static void mntput_no_expire(struct mount *mnt) rcu_read_unlock(); list_del(&mnt->mnt_instance); + + if (unlikely(!list_empty(&mnt->mnt_mounts))) { + struct mount *p, *tmp; + list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts, mnt_child) { + umount_mnt(p); + } + } unlock_mount_hash(); if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) { @@ -1298,17 +1326,15 @@ static HLIST_HEAD(unmounted); /* protected by namespace_sem */ static void namespace_unlock(void) { - struct hlist_head head = unmounted; + struct hlist_head head; - if (likely(hlist_empty(&head))) { - up_write(&namespace_sem); - return; - } + hlist_move_list(&unmounted, &head); - head.first->pprev = &head.first; - INIT_HLIST_HEAD(&unmounted); up_write(&namespace_sem); + if (likely(hlist_empty(&head))) + return; + synchronize_rcu(); group_pin_kill(&head); @@ -1319,49 +1345,63 @@ static inline void namespace_lock(void) down_write(&namespace_sem); } +enum umount_tree_flags { + UMOUNT_SYNC = 1, + UMOUNT_PROPAGATE = 2, + UMOUNT_CONNECTED = 4, +}; /* * mount_lock must be held * namespace_sem must be held for write - * how = 0 => just this tree, don't propagate - * how = 1 => propagate; we know that nobody else has reference to any victims - * how = 2 => lazy umount */ -void umount_tree(struct mount *mnt, int how) +static void umount_tree(struct mount *mnt, enum umount_tree_flags how) { - HLIST_HEAD(tmp_list); + LIST_HEAD(tmp_list); struct mount *p; + if (how & UMOUNT_PROPAGATE) + propagate_mount_unlock(mnt); + + /* Gather the mounts to umount */ for (p = mnt; p; p = next_mnt(p, mnt)) { - hlist_del_init_rcu(&p->mnt_hash); - hlist_add_head(&p->mnt_hash, &tmp_list); + p->mnt.mnt_flags |= MNT_UMOUNT; + list_move(&p->mnt_list, &tmp_list); } - hlist_for_each_entry(p, &tmp_list, mnt_hash) + /* Hide the mounts from mnt_mounts */ + list_for_each_entry(p, &tmp_list, mnt_list) { list_del_init(&p->mnt_child); + } - if (how) + /* Add propogated mounts to the tmp_list */ + if (how & UMOUNT_PROPAGATE) propagate_umount(&tmp_list); - while (!hlist_empty(&tmp_list)) { - p = hlist_entry(tmp_list.first, struct mount, mnt_hash); - hlist_del_init_rcu(&p->mnt_hash); + while (!list_empty(&tmp_list)) { + bool disconnect; + p = list_first_entry(&tmp_list, struct mount, mnt_list); list_del_init(&p->mnt_expire); list_del_init(&p->mnt_list); __touch_mnt_namespace(p->mnt_ns); p->mnt_ns = NULL; - if (how < 2) + if (how & UMOUNT_SYNC) p->mnt.mnt_flags |= MNT_SYNC_UMOUNT; - pin_insert_group(&p->mnt_umount, &p->mnt_parent->mnt, &unmounted); + disconnect = !(((how & UMOUNT_CONNECTED) && + mnt_has_parent(p) && + (p->mnt_parent->mnt.mnt_flags & MNT_UMOUNT)) || + IS_MNT_LOCKED_AND_LAZY(p)); + + pin_insert_group(&p->mnt_umount, &p->mnt_parent->mnt, + disconnect ? &unmounted : NULL); if (mnt_has_parent(p)) { - hlist_del_init(&p->mnt_mp_list); - put_mountpoint(p->mnt_mp); mnt_add_count(p->mnt_parent, -1); - /* old mountpoint will be dropped when we can do that */ - p->mnt_ex_mountpoint = p->mnt_mountpoint; - p->mnt_mountpoint = p->mnt.mnt_root; - p->mnt_parent = p; - p->mnt_mp = NULL; + if (!disconnect) { + /* Don't forget about p */ + list_add_tail(&p->mnt_child, &p->mnt_parent->mnt_mounts); + } else { + umount_mnt(p); + } } change_mnt_propagation(p, MS_PRIVATE); } @@ -1447,14 +1487,14 @@ static int do_umount(struct mount *mnt, int flags) if (flags & MNT_DETACH) { if (!list_empty(&mnt->mnt_list)) - umount_tree(mnt, 2); + umount_tree(mnt, UMOUNT_PROPAGATE); retval = 0; } else { shrink_submounts(mnt); retval = -EBUSY; if (!propagate_mount_busy(mnt, 2)) { if (!list_empty(&mnt->mnt_list)) - umount_tree(mnt, 1); + umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC); retval = 0; } } @@ -1480,13 +1520,20 @@ void __detach_mounts(struct dentry *dentry) namespace_lock(); mp = lookup_mountpoint(dentry); - if (!mp) + if (IS_ERR_OR_NULL(mp)) goto out_unlock; lock_mount_hash(); while (!hlist_empty(&mp->m_list)) { mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list); - umount_tree(mnt, 2); + if (mnt->mnt.mnt_flags & MNT_UMOUNT) { + struct mount *p, *tmp; + list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts, mnt_child) { + hlist_add_head(&p->mnt_umount.s_list, &unmounted); + umount_mnt(p); + } + } + else umount_tree(mnt, UMOUNT_CONNECTED); } unlock_mount_hash(); put_mountpoint(mp); @@ -1648,7 +1695,7 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, out: if (res) { lock_mount_hash(); - umount_tree(res, 0); + umount_tree(res, UMOUNT_SYNC); unlock_mount_hash(); } return q; @@ -1660,8 +1707,11 @@ struct vfsmount *collect_mounts(struct path *path) { struct mount *tree; namespace_lock(); - tree = copy_tree(real_mount(path->mnt), path->dentry, - CL_COPY_ALL | CL_PRIVATE); + if (!check_mnt(real_mount(path->mnt))) + tree = ERR_PTR(-EINVAL); + else + tree = copy_tree(real_mount(path->mnt), path->dentry, + CL_COPY_ALL | CL_PRIVATE); namespace_unlock(); if (IS_ERR(tree)) return ERR_CAST(tree); @@ -1672,7 +1722,7 @@ void drop_collected_mounts(struct vfsmount *mnt) { namespace_lock(); lock_mount_hash(); - umount_tree(real_mount(mnt), 0); + umount_tree(real_mount(mnt), UMOUNT_SYNC); unlock_mount_hash(); namespace_unlock(); } @@ -1855,7 +1905,7 @@ static int attach_recursive_mnt(struct mount *source_mnt, out_cleanup_ids: while (!hlist_empty(&tree_list)) { child = hlist_entry(tree_list.first, struct mount, mnt_hash); - umount_tree(child, 0); + umount_tree(child, UMOUNT_SYNC); } unlock_mount_hash(); cleanup_group_ids(source_mnt, NULL); @@ -2035,7 +2085,7 @@ static int do_loopback(struct path *path, const char *old_name, err = graft_tree(mnt, parent, mp); if (err) { lock_mount_hash(); - umount_tree(mnt, 0); + umount_tree(mnt, UMOUNT_SYNC); unlock_mount_hash(); } out2: @@ -2406,7 +2456,7 @@ void mark_mounts_for_expiry(struct list_head *mounts) while (!list_empty(&graveyard)) { mnt = list_first_entry(&graveyard, struct mount, mnt_expire); touch_mnt_namespace(mnt->mnt_ns); - umount_tree(mnt, 1); + umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC); } unlock_mount_hash(); namespace_unlock(); @@ -2477,7 +2527,7 @@ static void shrink_submounts(struct mount *mnt) m = list_first_entry(&graveyard, struct mount, mnt_expire); touch_mnt_namespace(m->mnt_ns); - umount_tree(m, 1); + umount_tree(m, UMOUNT_PROPAGATE|UMOUNT_SYNC); } } } diff --git a/fs/pnode.c b/fs/pnode.c index 260ac8f898a4..6367e1e435c6 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -362,6 +362,46 @@ int propagate_mount_busy(struct mount *mnt, int refcnt) } /* + * Clear MNT_LOCKED when it can be shown to be safe. + * + * mount_lock lock must be held for write + */ +void propagate_mount_unlock(struct mount *mnt) +{ + struct mount *parent = mnt->mnt_parent; + struct mount *m, *child; + + BUG_ON(parent == mnt); + + for (m = propagation_next(parent, parent); m; + m = propagation_next(m, parent)) { + child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint); + if (child) + child->mnt.mnt_flags &= ~MNT_LOCKED; + } +} + +/* + * Mark all mounts that the MNT_LOCKED logic will allow to be unmounted. + */ +static void mark_umount_candidates(struct mount *mnt) +{ + struct mount *parent = mnt->mnt_parent; + struct mount *m; + + BUG_ON(parent == mnt); + + for (m = propagation_next(parent, parent); m; + m = propagation_next(m, parent)) { + struct mount *child = __lookup_mnt_last(&m->mnt, + mnt->mnt_mountpoint); + if (child && (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m))) { + SET_MNT_MARK(child); + } + } +} + +/* * NOTE: unmounting 'mnt' naturally propagates to all other mounts its * parent propagates to. */ @@ -378,13 +418,16 @@ static void __propagate_umount(struct mount *mnt) struct mount *child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint); /* - * umount the child only if the child has no - * other children + * umount the child only if the child has no children + * and the child is marked safe to unmount. */ - if (child && list_empty(&child->mnt_mounts)) { + if (!child || !IS_MNT_MARKED(child)) + continue; + CLEAR_MNT_MARK(child); + if (list_empty(&child->mnt_mounts)) { list_del_init(&child->mnt_child); - hlist_del_init_rcu(&child->mnt_hash); - hlist_add_before_rcu(&child->mnt_hash, &mnt->mnt_hash); + child->mnt.mnt_flags |= MNT_UMOUNT; + list_move_tail(&child->mnt_list, &mnt->mnt_list); } } } @@ -396,11 +439,14 @@ static void __propagate_umount(struct mount *mnt) * * vfsmount lock must be held for write */ -int propagate_umount(struct hlist_head *list) +int propagate_umount(struct list_head *list) { struct mount *mnt; - hlist_for_each_entry(mnt, list, mnt_hash) + list_for_each_entry_reverse(mnt, list, mnt_list) + mark_umount_candidates(mnt); + + list_for_each_entry(mnt, list, mnt_list) __propagate_umount(mnt); return 0; } diff --git a/fs/pnode.h b/fs/pnode.h index 4a246358b031..7114ce6e6b9e 100644 --- a/fs/pnode.h +++ b/fs/pnode.h @@ -19,6 +19,9 @@ #define IS_MNT_MARKED(m) ((m)->mnt.mnt_flags & MNT_MARKED) #define SET_MNT_MARK(m) ((m)->mnt.mnt_flags |= MNT_MARKED) #define CLEAR_MNT_MARK(m) ((m)->mnt.mnt_flags &= ~MNT_MARKED) +#define IS_MNT_LOCKED(m) ((m)->mnt.mnt_flags & MNT_LOCKED) +#define IS_MNT_LOCKED_AND_LAZY(m) \ + (((m)->mnt.mnt_flags & (MNT_LOCKED|MNT_SYNC_UMOUNT)) == MNT_LOCKED) #define CL_EXPIRE 0x01 #define CL_SLAVE 0x02 @@ -40,14 +43,14 @@ static inline void set_mnt_shared(struct mount *mnt) void change_mnt_propagation(struct mount *, int); int propagate_mnt(struct mount *, struct mountpoint *, struct mount *, struct hlist_head *); -int propagate_umount(struct hlist_head *); +int propagate_umount(struct list_head *); int propagate_mount_busy(struct mount *, int); +void propagate_mount_unlock(struct mount *); void mnt_release_group_id(struct mount *); int get_dominating_id(struct mount *mnt, const struct path *root); unsigned int mnt_get_count(struct mount *mnt); void mnt_set_mountpoint(struct mount *, struct mountpoint *, struct mount *); -void umount_tree(struct mount *, int); struct mount *copy_tree(struct mount *, struct dentry *, int); bool is_path_reachable(struct mount *, struct dentry *, const struct path *root); diff --git a/include/linux/fs_pin.h b/include/linux/fs_pin.h index 9dc4e0384bfb..3886b3bffd7f 100644 --- a/include/linux/fs_pin.h +++ b/include/linux/fs_pin.h @@ -13,6 +13,8 @@ struct vfsmount; static inline void init_fs_pin(struct fs_pin *p, void (*kill)(struct fs_pin *)) { init_waitqueue_head(&p->wait); + INIT_HLIST_NODE(&p->s_list); + INIT_HLIST_NODE(&p->m_list); p->kill = kill; } diff --git a/include/linux/mount.h b/include/linux/mount.h index bca086d62b1a..f822c3c11377 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -61,6 +61,7 @@ struct mnt_namespace; #define MNT_DOOMED 0x1000000 #define MNT_SYNC_UMOUNT 0x2000000 #define MNT_MARKED 0x4000000 +#define MNT_UMOUNT 0x8000000 struct vfsmount { struct dentry *mnt_root; /* root of the mounted tree */ |