summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- fs/namespace.c 85
-rw-r--r-- include/uapi/linux/sched.h 7
-rw-r--r-- kernel/fork.c 17
-rw-r--r-- kernel/nsproxy.c 21
4 files changed, 94 insertions, 36 deletions
diff --git a/fs/namespace.c b/fs/namespace.c
index 702e93243505..555f0a10de9a 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -4233,8 +4233,8 @@ struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns,
struct user_namespace *user_ns, struct fs_struct *new_fs)
{
struct mnt_namespace *new_ns;
- struct vfsmount *rootmnt __free(mntput) = NULL;
- struct vfsmount *pwdmnt __free(mntput) = NULL;
+ struct path old_root __free(path_put) = {};
+ struct path old_pwd __free(path_put) = {};
struct mount *p, *q;
struct mount *old;
struct mount *new;
@@ -4254,11 +4254,18 @@ struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns,
return new_ns;
guard(namespace_excl)();
- /* First pass: copy the tree topology */
- copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE;
+
+ if (flags & CLONE_EMPTY_MNTNS)
+ copy_flags = 0;
+ else
+ copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE;
if (user_ns != ns->user_ns)
copy_flags |= CL_SLAVE;
- new = copy_tree(old, old->mnt.mnt_root, copy_flags);
+
+ if (flags & CLONE_EMPTY_MNTNS)
+ new = clone_mnt(old, old->mnt.mnt_root, copy_flags);
+ else
+ new = copy_tree(old, old->mnt.mnt_root, copy_flags);
if (IS_ERR(new)) {
emptied_ns = new_ns;
return ERR_CAST(new);
@@ -4269,33 +4276,53 @@ struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns,
}
new_ns->root = new;
- /*
- * Second pass: switch the tsk->fs->* elements and mark new vfsmounts
- * as belonging to new namespace. We have already acquired a private
- * fs_struct, so tsk->fs->lock is not needed.
- */
- p = old;
- q = new;
- while (p) {
- mnt_add_to_ns(new_ns, q);
- new_ns->nr_mounts++;
+ if (flags & CLONE_EMPTY_MNTNS) {
+ /*
+ * Empty mount namespace: only the root mount exists.
+ * Reset root and pwd to the cloned mount's root dentry.
+ */
if (new_fs) {
- if (&p->mnt == new_fs->root.mnt) {
- new_fs->root.mnt = mntget(&q->mnt);
- rootmnt = &p->mnt;
- }
- if (&p->mnt == new_fs->pwd.mnt) {
- new_fs->pwd.mnt = mntget(&q->mnt);
- pwdmnt = &p->mnt;
+ old_root = new_fs->root;
+ old_pwd = new_fs->pwd;
+
+ new_fs->root.mnt = mntget(&new->mnt);
+ new_fs->root.dentry = dget(new->mnt.mnt_root);
+
+ new_fs->pwd.mnt = mntget(&new->mnt);
+ new_fs->pwd.dentry = dget(new->mnt.mnt_root);
+ }
+ mnt_add_to_ns(new_ns, new);
+ new_ns->nr_mounts++;
+ } else {
+ /*
+ * Full copy: walk old and new trees in parallel, switching
+ * the tsk->fs->* elements and marking new vfsmounts as
+ * belonging to new namespace. We have already acquired a
+ * private fs_struct, so tsk->fs->lock is not needed.
+ */
+ p = old;
+ q = new;
+ while (p) {
+ mnt_add_to_ns(new_ns, q);
+ new_ns->nr_mounts++;
+ if (new_fs) {
+ if (&p->mnt == new_fs->root.mnt) {
+ old_root.mnt = new_fs->root.mnt;
+ new_fs->root.mnt = mntget(&q->mnt);
+ }
+ if (&p->mnt == new_fs->pwd.mnt) {
+ old_pwd.mnt = new_fs->pwd.mnt;
+ new_fs->pwd.mnt = mntget(&q->mnt);
+ }
}
+ p = next_mnt(p, old);
+ q = next_mnt(q, new);
+ if (!q)
+ break;
+ // an mntns binding we'd skipped?
+ while (p->mnt.mnt_root != q->mnt.mnt_root)
+ p = next_mnt(skip_mnt_tree(p), old);
}
- p = next_mnt(p, old);
- q = next_mnt(q, new);
- if (!q)
- break;
- // an mntns binding we'd skipped?
- while (p->mnt.mnt_root != q->mnt.mnt_root)
- p = next_mnt(skip_mnt_tree(p), old);
}
ns_tree_add_raw(new_ns);
return new_ns;
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index 359a14cc76a4..4e76fce9f777 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -36,6 +36,7 @@
/* Flags for the clone3() syscall. */
#define CLONE_CLEAR_SIGHAND 0x100000000ULL /* Clear any signal handler and reset to SIG_DFL. */
#define CLONE_INTO_CGROUP 0x200000000ULL /* Clone into a specific cgroup given the right permissions. */
+#define CLONE_EMPTY_MNTNS (1ULL << 37) /* Create an empty mount namespace. */
/*
* cloning flags intersect with CSIGNAL so can be used with unshare and clone3
@@ -43,6 +44,12 @@
*/
#define CLONE_NEWTIME 0x00000080 /* New time namespace */
+/*
+ * unshare flags share the bit space with clone flags but only apply to the
+ * unshare syscall:
+ */
+#define UNSHARE_EMPTY_MNTNS 0x00100000 /* Unshare an empty mount namespace. */
+
#ifndef __ASSEMBLY__
/**
* struct clone_args - arguments for the clone3 syscall
diff --git a/kernel/fork.c b/kernel/fork.c
index 65113a304518..dea6b3454447 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2620,6 +2620,16 @@ pid_t kernel_clone(struct kernel_clone_args *args)
pid_t nr;
/*
+ * Creating an empty mount namespace implies creating a new mount
+ * namespace. Set this before copy_process() so that the
+ * CLONE_NEWNS|CLONE_FS mutual exclusion check works correctly.
+ */
+ if (clone_flags & CLONE_EMPTY_MNTNS) {
+ clone_flags |= CLONE_NEWNS;
+ args->flags = clone_flags;
+ }
+
+ /*
* For legacy clone() calls, CLONE_PIDFD uses the parent_tid argument
* to return the pidfd. Hence, CLONE_PIDFD and CLONE_PARENT_SETTID are
* mutually exclusive. With clone3() CLONE_PIDFD has grown a separate
@@ -2897,7 +2907,8 @@ static bool clone3_args_valid(struct kernel_clone_args *kargs)
{
/* Verify that no unknown flags are passed along. */
if (kargs->flags &
- ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP))
+ ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND |
+ CLONE_INTO_CGROUP | CLONE_EMPTY_MNTNS))
return false;
/*
@@ -3050,7 +3061,7 @@ static int check_unshare_flags(unsigned long unshare_flags)
CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET|
CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP|
- CLONE_NEWTIME))
+ CLONE_NEWTIME | UNSHARE_EMPTY_MNTNS))
return -EINVAL;
/*
* Not implemented, but pretend it works if there is nothing
@@ -3149,6 +3160,8 @@ int ksys_unshare(unsigned long unshare_flags)
/*
* If unsharing namespace, must also unshare filesystem information.
*/
+ if (unshare_flags & UNSHARE_EMPTY_MNTNS)
+ unshare_flags |= CLONE_NEWNS;
if (unshare_flags & CLONE_NEWNS)
unshare_flags |= CLONE_FS;
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 259c4b4f1eeb..1bdc5be2dd20 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -95,7 +95,8 @@ static struct nsproxy *create_new_namespaces(u64 flags,
if (!new_nsp)
return ERR_PTR(-ENOMEM);
- new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, user_ns, new_fs);
+ new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns,
+ user_ns, new_fs);
if (IS_ERR(new_nsp->mnt_ns)) {
err = PTR_ERR(new_nsp->mnt_ns);
goto out_ns;
@@ -212,18 +213,28 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
struct nsproxy **new_nsp, struct cred *new_cred, struct fs_struct *new_fs)
{
struct user_namespace *user_ns;
+ u64 flags = unshare_flags;
int err = 0;
- if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
- CLONE_NEWNET | CLONE_NEWPID | CLONE_NEWCGROUP |
- CLONE_NEWTIME)))
+ if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
+ CLONE_NEWNET | CLONE_NEWPID | CLONE_NEWCGROUP |
+ CLONE_NEWTIME)))
return 0;
user_ns = new_cred ? new_cred->user_ns : current_user_ns();
if (!ns_capable(user_ns, CAP_SYS_ADMIN))
return -EPERM;
- *new_nsp = create_new_namespaces(unshare_flags, current, user_ns,
+ /*
+ * Convert the 32-bit UNSHARE_EMPTY_MNTNS (which aliases
+ * CLONE_PARENT_SETTID) to the unique 64-bit CLONE_EMPTY_MNTNS.
+ */
+ if (flags & UNSHARE_EMPTY_MNTNS) {
+ flags &= ~(u64)UNSHARE_EMPTY_MNTNS;
+ flags |= CLONE_EMPTY_MNTNS;
+ }
+
+ *new_nsp = create_new_namespaces(flags, current, user_ns,
new_fs ? new_fs : current->fs);
if (IS_ERR(*new_nsp)) {
err = PTR_ERR(*new_nsp);