summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2015-04-18 18:20:31 +0300
committerLinus Torvalds <torvalds@linux-foundation.org>2015-04-18 18:20:31 +0300
commit8f502d5b9e3362971f58dad5d468f070340336e1 (patch)
treefa819b5acdfc43c01e3696991ffe90836f265a34
parent06a60deca87dba8e2c186ea7f12ea87d6785188e (diff)
parente0c9c0afd2fc958ffa34b697972721d81df8a56f (diff)
downloadlinux-8f502d5b9e3362971f58dad5d468f070340336e1.tar.xz
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace
Pull usernamespace mount fixes from Eric Biederman: "Way back in October Andrey Vagin reported that umount(MNT_DETACH) could be used to defeat MNT_LOCKED. As I worked to fix this I discovered that combined with mount propagation and an appropriate selection of shared subtrees a reference to a directory on an unmounted filesystem is not necessary. That MNT_DETACH is allowed in user namespace in a form that can break MNT_LOCKED comes from my early misunderstanding what MNT_DETACH does. To avoid breaking existing userspace the conflict between MNT_DETACH and MNT_LOCKED is fixed by leaving mounts that are locked to their parents in the mount hash table until the last reference goes away. While investigating this issue I also found an issue with __detach_mounts. The code was unnecessarily and incorrectly triggering mount propagation. Resulting in too many mounts going away when a directory is deleted, and too many cpu cycles are burned while doing that. Looking some more I realized that __detach_mounts by only keeping mounts connected that were MNT_LOCKED it had the potential to still leak information so I tweaked the code to keep everything locked together that possibly could be. This code was almost ready last cycle but Al invented fs_pin which slightly simplifies this code but required rewrites and retesting, and I have not been in top form for a while so it took me a while to get all of that done. Similiarly this pull request is late because I have been feeling absolutely miserable all week. The issue of being able to escape a bind mount has not yet been addressed, as the fixes are not yet mature" * 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace: mnt: Update detach_mounts to leave mounts connected mnt: Fix the error check in __detach_mounts mnt: Honor MNT_LOCKED when detaching mounts fs_pin: Allow for the possibility that m_list or s_list go unused. mnt: Factor umount_mnt from umount_tree mnt: Factor out unhash_mnt from detach_mnt and umount_tree mnt: Fail collect_mounts when applied to unmounted mounts mnt: Don't propagate unmounts to locked mounts mnt: On an unmount propagate clearing of MNT_LOCKED mnt: Delay removal from the mount hash. mnt: Add MNT_UMOUNT flag mnt: In umount_tree reuse mnt_list instead of mnt_hash mnt: Don't propagate umounts in __detach_mounts mnt: Improve the umount_tree flags mnt: Use hlist_move_list in namespace_unlock
-rw-r--r--fs/fs_pin.c4
-rw-r--r--fs/namespace.c142
-rw-r--r--fs/pnode.c60
-rw-r--r--fs/pnode.h7
-rw-r--r--include/linux/fs_pin.h2
-rw-r--r--include/linux/mount.h1
6 files changed, 159 insertions, 57 deletions
diff --git a/fs/fs_pin.c b/fs/fs_pin.c
index b06c98796afb..611b5408f6ec 100644
--- a/fs/fs_pin.c
+++ b/fs/fs_pin.c
@@ -9,8 +9,8 @@ static DEFINE_SPINLOCK(pin_lock);
void pin_remove(struct fs_pin *pin)
{
spin_lock(&pin_lock);
- hlist_del(&pin->m_list);
- hlist_del(&pin->s_list);
+ hlist_del_init(&pin->m_list);
+ hlist_del_init(&pin->s_list);
spin_unlock(&pin_lock);
spin_lock_irq(&pin->wait.lock);
pin->done = 1;
diff --git a/fs/namespace.c b/fs/namespace.c
index 82ef1405260e..1f4f9dac6e5a 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -632,14 +632,17 @@ struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
*/
struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry)
{
- struct mount *p, *res;
- res = p = __lookup_mnt(mnt, dentry);
+ struct mount *p, *res = NULL;
+ p = __lookup_mnt(mnt, dentry);
if (!p)
goto out;
+ if (!(p->mnt.mnt_flags & MNT_UMOUNT))
+ res = p;
hlist_for_each_entry_continue(p, mnt_hash) {
if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry)
break;
- res = p;
+ if (!(p->mnt.mnt_flags & MNT_UMOUNT))
+ res = p;
}
out:
return res;
@@ -795,10 +798,8 @@ static void __touch_mnt_namespace(struct mnt_namespace *ns)
/*
* vfsmount lock must be held for write
*/
-static void detach_mnt(struct mount *mnt, struct path *old_path)
+static void unhash_mnt(struct mount *mnt)
{
- old_path->dentry = mnt->mnt_mountpoint;
- old_path->mnt = &mnt->mnt_parent->mnt;
mnt->mnt_parent = mnt;
mnt->mnt_mountpoint = mnt->mnt.mnt_root;
list_del_init(&mnt->mnt_child);
@@ -811,6 +812,26 @@ static void detach_mnt(struct mount *mnt, struct path *old_path)
/*
* vfsmount lock must be held for write
*/
+static void detach_mnt(struct mount *mnt, struct path *old_path)
+{
+ old_path->dentry = mnt->mnt_mountpoint;
+ old_path->mnt = &mnt->mnt_parent->mnt;
+ unhash_mnt(mnt);
+}
+
+/*
+ * vfsmount lock must be held for write
+ */
+static void umount_mnt(struct mount *mnt)
+{
+ /* old mountpoint will be dropped when we can do that */
+ mnt->mnt_ex_mountpoint = mnt->mnt_mountpoint;
+ unhash_mnt(mnt);
+}
+
+/*
+ * vfsmount lock must be held for write
+ */
void mnt_set_mountpoint(struct mount *mnt,
struct mountpoint *mp,
struct mount *child_mnt)
@@ -1078,6 +1099,13 @@ static void mntput_no_expire(struct mount *mnt)
rcu_read_unlock();
list_del(&mnt->mnt_instance);
+
+ if (unlikely(!list_empty(&mnt->mnt_mounts))) {
+ struct mount *p, *tmp;
+ list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts, mnt_child) {
+ umount_mnt(p);
+ }
+ }
unlock_mount_hash();
if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) {
@@ -1298,17 +1326,15 @@ static HLIST_HEAD(unmounted); /* protected by namespace_sem */
static void namespace_unlock(void)
{
- struct hlist_head head = unmounted;
+ struct hlist_head head;
- if (likely(hlist_empty(&head))) {
- up_write(&namespace_sem);
- return;
- }
+ hlist_move_list(&unmounted, &head);
- head.first->pprev = &head.first;
- INIT_HLIST_HEAD(&unmounted);
up_write(&namespace_sem);
+ if (likely(hlist_empty(&head)))
+ return;
+
synchronize_rcu();
group_pin_kill(&head);
@@ -1319,49 +1345,63 @@ static inline void namespace_lock(void)
down_write(&namespace_sem);
}
+enum umount_tree_flags {
+ UMOUNT_SYNC = 1,
+ UMOUNT_PROPAGATE = 2,
+ UMOUNT_CONNECTED = 4,
+};
/*
* mount_lock must be held
* namespace_sem must be held for write
- * how = 0 => just this tree, don't propagate
- * how = 1 => propagate; we know that nobody else has reference to any victims
- * how = 2 => lazy umount
*/
-void umount_tree(struct mount *mnt, int how)
+static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
{
- HLIST_HEAD(tmp_list);
+ LIST_HEAD(tmp_list);
struct mount *p;
+ if (how & UMOUNT_PROPAGATE)
+ propagate_mount_unlock(mnt);
+
+ /* Gather the mounts to umount */
for (p = mnt; p; p = next_mnt(p, mnt)) {
- hlist_del_init_rcu(&p->mnt_hash);
- hlist_add_head(&p->mnt_hash, &tmp_list);
+ p->mnt.mnt_flags |= MNT_UMOUNT;
+ list_move(&p->mnt_list, &tmp_list);
}
- hlist_for_each_entry(p, &tmp_list, mnt_hash)
+ /* Hide the mounts from mnt_mounts */
+ list_for_each_entry(p, &tmp_list, mnt_list) {
list_del_init(&p->mnt_child);
+ }
- if (how)
+ /* Add propogated mounts to the tmp_list */
+ if (how & UMOUNT_PROPAGATE)
propagate_umount(&tmp_list);
- while (!hlist_empty(&tmp_list)) {
- p = hlist_entry(tmp_list.first, struct mount, mnt_hash);
- hlist_del_init_rcu(&p->mnt_hash);
+ while (!list_empty(&tmp_list)) {
+ bool disconnect;
+ p = list_first_entry(&tmp_list, struct mount, mnt_list);
list_del_init(&p->mnt_expire);
list_del_init(&p->mnt_list);
__touch_mnt_namespace(p->mnt_ns);
p->mnt_ns = NULL;
- if (how < 2)
+ if (how & UMOUNT_SYNC)
p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
- pin_insert_group(&p->mnt_umount, &p->mnt_parent->mnt, &unmounted);
+ disconnect = !(((how & UMOUNT_CONNECTED) &&
+ mnt_has_parent(p) &&
+ (p->mnt_parent->mnt.mnt_flags & MNT_UMOUNT)) ||
+ IS_MNT_LOCKED_AND_LAZY(p));
+
+ pin_insert_group(&p->mnt_umount, &p->mnt_parent->mnt,
+ disconnect ? &unmounted : NULL);
if (mnt_has_parent(p)) {
- hlist_del_init(&p->mnt_mp_list);
- put_mountpoint(p->mnt_mp);
mnt_add_count(p->mnt_parent, -1);
- /* old mountpoint will be dropped when we can do that */
- p->mnt_ex_mountpoint = p->mnt_mountpoint;
- p->mnt_mountpoint = p->mnt.mnt_root;
- p->mnt_parent = p;
- p->mnt_mp = NULL;
+ if (!disconnect) {
+ /* Don't forget about p */
+ list_add_tail(&p->mnt_child, &p->mnt_parent->mnt_mounts);
+ } else {
+ umount_mnt(p);
+ }
}
change_mnt_propagation(p, MS_PRIVATE);
}
@@ -1447,14 +1487,14 @@ static int do_umount(struct mount *mnt, int flags)
if (flags & MNT_DETACH) {
if (!list_empty(&mnt->mnt_list))
- umount_tree(mnt, 2);
+ umount_tree(mnt, UMOUNT_PROPAGATE);
retval = 0;
} else {
shrink_submounts(mnt);
retval = -EBUSY;
if (!propagate_mount_busy(mnt, 2)) {
if (!list_empty(&mnt->mnt_list))
- umount_tree(mnt, 1);
+ umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
retval = 0;
}
}
@@ -1480,13 +1520,20 @@ void __detach_mounts(struct dentry *dentry)
namespace_lock();
mp = lookup_mountpoint(dentry);
- if (!mp)
+ if (IS_ERR_OR_NULL(mp))
goto out_unlock;
lock_mount_hash();
while (!hlist_empty(&mp->m_list)) {
mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list);
- umount_tree(mnt, 2);
+ if (mnt->mnt.mnt_flags & MNT_UMOUNT) {
+ struct mount *p, *tmp;
+ list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts, mnt_child) {
+ hlist_add_head(&p->mnt_umount.s_list, &unmounted);
+ umount_mnt(p);
+ }
+ }
+ else umount_tree(mnt, UMOUNT_CONNECTED);
}
unlock_mount_hash();
put_mountpoint(mp);
@@ -1648,7 +1695,7 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
out:
if (res) {
lock_mount_hash();
- umount_tree(res, 0);
+ umount_tree(res, UMOUNT_SYNC);
unlock_mount_hash();
}
return q;
@@ -1660,8 +1707,11 @@ struct vfsmount *collect_mounts(struct path *path)
{
struct mount *tree;
namespace_lock();
- tree = copy_tree(real_mount(path->mnt), path->dentry,
- CL_COPY_ALL | CL_PRIVATE);
+ if (!check_mnt(real_mount(path->mnt)))
+ tree = ERR_PTR(-EINVAL);
+ else
+ tree = copy_tree(real_mount(path->mnt), path->dentry,
+ CL_COPY_ALL | CL_PRIVATE);
namespace_unlock();
if (IS_ERR(tree))
return ERR_CAST(tree);
@@ -1672,7 +1722,7 @@ void drop_collected_mounts(struct vfsmount *mnt)
{
namespace_lock();
lock_mount_hash();
- umount_tree(real_mount(mnt), 0);
+ umount_tree(real_mount(mnt), UMOUNT_SYNC);
unlock_mount_hash();
namespace_unlock();
}
@@ -1855,7 +1905,7 @@ static int attach_recursive_mnt(struct mount *source_mnt,
out_cleanup_ids:
while (!hlist_empty(&tree_list)) {
child = hlist_entry(tree_list.first, struct mount, mnt_hash);
- umount_tree(child, 0);
+ umount_tree(child, UMOUNT_SYNC);
}
unlock_mount_hash();
cleanup_group_ids(source_mnt, NULL);
@@ -2035,7 +2085,7 @@ static int do_loopback(struct path *path, const char *old_name,
err = graft_tree(mnt, parent, mp);
if (err) {
lock_mount_hash();
- umount_tree(mnt, 0);
+ umount_tree(mnt, UMOUNT_SYNC);
unlock_mount_hash();
}
out2:
@@ -2406,7 +2456,7 @@ void mark_mounts_for_expiry(struct list_head *mounts)
while (!list_empty(&graveyard)) {
mnt = list_first_entry(&graveyard, struct mount, mnt_expire);
touch_mnt_namespace(mnt->mnt_ns);
- umount_tree(mnt, 1);
+ umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
}
unlock_mount_hash();
namespace_unlock();
@@ -2477,7 +2527,7 @@ static void shrink_submounts(struct mount *mnt)
m = list_first_entry(&graveyard, struct mount,
mnt_expire);
touch_mnt_namespace(m->mnt_ns);
- umount_tree(m, 1);
+ umount_tree(m, UMOUNT_PROPAGATE|UMOUNT_SYNC);
}
}
}
diff --git a/fs/pnode.c b/fs/pnode.c
index 260ac8f898a4..6367e1e435c6 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -362,6 +362,46 @@ int propagate_mount_busy(struct mount *mnt, int refcnt)
}
/*
+ * Clear MNT_LOCKED when it can be shown to be safe.
+ *
+ * mount_lock lock must be held for write
+ */
+void propagate_mount_unlock(struct mount *mnt)
+{
+ struct mount *parent = mnt->mnt_parent;
+ struct mount *m, *child;
+
+ BUG_ON(parent == mnt);
+
+ for (m = propagation_next(parent, parent); m;
+ m = propagation_next(m, parent)) {
+ child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint);
+ if (child)
+ child->mnt.mnt_flags &= ~MNT_LOCKED;
+ }
+}
+
+/*
+ * Mark all mounts that the MNT_LOCKED logic will allow to be unmounted.
+ */
+static void mark_umount_candidates(struct mount *mnt)
+{
+ struct mount *parent = mnt->mnt_parent;
+ struct mount *m;
+
+ BUG_ON(parent == mnt);
+
+ for (m = propagation_next(parent, parent); m;
+ m = propagation_next(m, parent)) {
+ struct mount *child = __lookup_mnt_last(&m->mnt,
+ mnt->mnt_mountpoint);
+ if (child && (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m))) {
+ SET_MNT_MARK(child);
+ }
+ }
+}
+
+/*
* NOTE: unmounting 'mnt' naturally propagates to all other mounts its
* parent propagates to.
*/
@@ -378,13 +418,16 @@ static void __propagate_umount(struct mount *mnt)
struct mount *child = __lookup_mnt_last(&m->mnt,
mnt->mnt_mountpoint);
/*
- * umount the child only if the child has no
- * other children
+ * umount the child only if the child has no children
+ * and the child is marked safe to unmount.
*/
- if (child && list_empty(&child->mnt_mounts)) {
+ if (!child || !IS_MNT_MARKED(child))
+ continue;
+ CLEAR_MNT_MARK(child);
+ if (list_empty(&child->mnt_mounts)) {
list_del_init(&child->mnt_child);
- hlist_del_init_rcu(&child->mnt_hash);
- hlist_add_before_rcu(&child->mnt_hash, &mnt->mnt_hash);
+ child->mnt.mnt_flags |= MNT_UMOUNT;
+ list_move_tail(&child->mnt_list, &mnt->mnt_list);
}
}
}
@@ -396,11 +439,14 @@ static void __propagate_umount(struct mount *mnt)
*
* vfsmount lock must be held for write
*/
-int propagate_umount(struct hlist_head *list)
+int propagate_umount(struct list_head *list)
{
struct mount *mnt;
- hlist_for_each_entry(mnt, list, mnt_hash)
+ list_for_each_entry_reverse(mnt, list, mnt_list)
+ mark_umount_candidates(mnt);
+
+ list_for_each_entry(mnt, list, mnt_list)
__propagate_umount(mnt);
return 0;
}
diff --git a/fs/pnode.h b/fs/pnode.h
index 4a246358b031..7114ce6e6b9e 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -19,6 +19,9 @@
#define IS_MNT_MARKED(m) ((m)->mnt.mnt_flags & MNT_MARKED)
#define SET_MNT_MARK(m) ((m)->mnt.mnt_flags |= MNT_MARKED)
#define CLEAR_MNT_MARK(m) ((m)->mnt.mnt_flags &= ~MNT_MARKED)
+#define IS_MNT_LOCKED(m) ((m)->mnt.mnt_flags & MNT_LOCKED)
+#define IS_MNT_LOCKED_AND_LAZY(m) \
+ (((m)->mnt.mnt_flags & (MNT_LOCKED|MNT_SYNC_UMOUNT)) == MNT_LOCKED)
#define CL_EXPIRE 0x01
#define CL_SLAVE 0x02
@@ -40,14 +43,14 @@ static inline void set_mnt_shared(struct mount *mnt)
void change_mnt_propagation(struct mount *, int);
int propagate_mnt(struct mount *, struct mountpoint *, struct mount *,
struct hlist_head *);
-int propagate_umount(struct hlist_head *);
+int propagate_umount(struct list_head *);
int propagate_mount_busy(struct mount *, int);
+void propagate_mount_unlock(struct mount *);
void mnt_release_group_id(struct mount *);
int get_dominating_id(struct mount *mnt, const struct path *root);
unsigned int mnt_get_count(struct mount *mnt);
void mnt_set_mountpoint(struct mount *, struct mountpoint *,
struct mount *);
-void umount_tree(struct mount *, int);
struct mount *copy_tree(struct mount *, struct dentry *, int);
bool is_path_reachable(struct mount *, struct dentry *,
const struct path *root);
diff --git a/include/linux/fs_pin.h b/include/linux/fs_pin.h
index 9dc4e0384bfb..3886b3bffd7f 100644
--- a/include/linux/fs_pin.h
+++ b/include/linux/fs_pin.h
@@ -13,6 +13,8 @@ struct vfsmount;
static inline void init_fs_pin(struct fs_pin *p, void (*kill)(struct fs_pin *))
{
init_waitqueue_head(&p->wait);
+ INIT_HLIST_NODE(&p->s_list);
+ INIT_HLIST_NODE(&p->m_list);
p->kill = kill;
}
diff --git a/include/linux/mount.h b/include/linux/mount.h
index bca086d62b1a..f822c3c11377 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -61,6 +61,7 @@ struct mnt_namespace;
#define MNT_DOOMED 0x1000000
#define MNT_SYNC_UMOUNT 0x2000000
#define MNT_MARKED 0x4000000
+#define MNT_UMOUNT 0x8000000
struct vfsmount {
struct dentry *mnt_root; /* root of the mounted tree */