diff options
-rw-r--r-- | fs/namespace.c | 15 | ||||
-rw-r--r-- | fs/nsfs.c | 5 | ||||
-rw-r--r-- | include/linux/mnt_namespace.h | 2 | ||||
-rw-r--r-- | include/linux/nsproxy.h | 24 | ||||
-rw-r--r-- | include/linux/proc_fs.h | 2 | ||||
-rw-r--r-- | include/linux/proc_ns.h | 4 | ||||
-rw-r--r-- | ipc/namespace.c | 7 | ||||
-rw-r--r-- | kernel/cgroup/namespace.c | 5 | ||||
-rw-r--r-- | kernel/nsproxy.c | 305 | ||||
-rw-r--r-- | kernel/pid_namespace.c | 5 | ||||
-rw-r--r-- | kernel/time/namespace.c | 5 | ||||
-rw-r--r-- | kernel/user_namespace.c | 8 | ||||
-rw-r--r-- | kernel/utsname.c | 5 | ||||
-rw-r--r-- | net/core/net_namespace.c | 5 | ||||
-rw-r--r-- | tools/testing/selftests/pidfd/.gitignore | 1 | ||||
-rw-r--r-- | tools/testing/selftests/pidfd/Makefile | 3 | ||||
-rw-r--r-- | tools/testing/selftests/pidfd/config | 6 | ||||
-rw-r--r-- | tools/testing/selftests/pidfd/pidfd_setns_test.c | 473 |
18 files changed, 833 insertions, 47 deletions
diff --git a/fs/namespace.c b/fs/namespace.c index a6baee3c7904..6d499ab254b7 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1786,6 +1786,11 @@ static struct mnt_namespace *to_mnt_ns(struct ns_common *ns) return container_of(ns, struct mnt_namespace, ns); } +struct ns_common *from_mnt_ns(struct mnt_namespace *mnt) +{ + return &mnt->ns; +} + static bool mnt_ns_loop(struct dentry *dentry) { /* Could bind mounting the mount namespace inode cause a @@ -4013,16 +4018,18 @@ static void mntns_put(struct ns_common *ns) put_mnt_ns(to_mnt_ns(ns)); } -static int mntns_install(struct nsproxy *nsproxy, struct ns_common *ns) +static int mntns_install(struct nsset *nsset, struct ns_common *ns) { - struct fs_struct *fs = current->fs; + struct nsproxy *nsproxy = nsset->nsproxy; + struct fs_struct *fs = nsset->fs; struct mnt_namespace *mnt_ns = to_mnt_ns(ns), *old_mnt_ns; + struct user_namespace *user_ns = nsset->cred->user_ns; struct path root; int err; if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) || - !ns_capable(current_user_ns(), CAP_SYS_CHROOT) || - !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) + !ns_capable(user_ns, CAP_SYS_CHROOT) || + !ns_capable(user_ns, CAP_SYS_ADMIN)) return -EPERM; if (is_anon_ns(mnt_ns)) diff --git a/fs/nsfs.c b/fs/nsfs.c index 4f1205725cfe..800c1d0eb0d0 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -229,6 +229,11 @@ int ns_get_name(char *buf, size_t size, struct task_struct *task, return res; } +bool proc_ns_file(const struct file *file) +{ + return file->f_op == &ns_file_operations; +} + struct file *proc_ns_fget(int fd) { struct file *file; diff --git a/include/linux/mnt_namespace.h b/include/linux/mnt_namespace.h index 35942084cd40..8f882f5881e8 100644 --- a/include/linux/mnt_namespace.h +++ b/include/linux/mnt_namespace.h @@ -6,10 +6,12 @@ struct mnt_namespace; struct fs_struct; struct user_namespace; +struct ns_common; extern struct mnt_namespace *copy_mnt_ns(unsigned long, struct mnt_namespace *, struct user_namespace *, struct fs_struct *); extern void put_mnt_ns(struct mnt_namespace *ns); +extern struct ns_common *from_mnt_ns(struct mnt_namespace *); extern const struct file_operations proc_mounts_operations; extern const struct file_operations proc_mountinfo_operations; diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h index 074f395b9ad2..cdb171efc7cb 100644 --- a/include/linux/nsproxy.h +++ b/include/linux/nsproxy.h @@ -42,6 +42,30 @@ struct nsproxy { extern struct nsproxy init_nsproxy; /* + * A structure to encompass all bits needed to install + * a partial or complete new set of namespaces. + * + * If a new user namespace is requested cred will + * point to a modifiable set of credentials. If a pointer + * to a modifiable set is needed nsset_cred() must be + * used and tested. + */ +struct nsset { + unsigned flags; + struct nsproxy *nsproxy; + struct fs_struct *fs; + const struct cred *cred; +}; + +static inline struct cred *nsset_cred(struct nsset *set) +{ + if (set->flags & CLONE_NEWUSER) + return (struct cred *)set->cred; + + return NULL; +} + +/* * the namespaces access rules are: * * 1. only current task is allowed to change tsk->nsproxy pointer or diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 45c05fd9c99d..0cfc44d7ac74 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -179,4 +179,6 @@ static inline struct pid_namespace *proc_pid_ns(const struct inode *inode) return inode->i_sb->s_fs_info; } +bool proc_ns_file(const struct file *file); + #endif /* _LINUX_PROC_FS_H */ diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index 6abe85c34681..75807ecef880 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -8,7 +8,7 @@ #include <linux/ns_common.h> struct pid_namespace; -struct nsproxy; +struct nsset; struct path; struct task_struct; struct inode; @@ -19,7 +19,7 @@ struct proc_ns_operations { int type; struct ns_common *(*get)(struct task_struct *task); void (*put)(struct ns_common *ns); - int (*install)(struct nsproxy *nsproxy, struct ns_common *ns); + int (*install)(struct nsset *nsset, struct ns_common *ns); struct user_namespace *(*owner)(struct ns_common *ns); struct ns_common *(*get_parent)(struct ns_common *ns); } __randomize_layout; diff --git a/ipc/namespace.c b/ipc/namespace.c index b3ca1476ca51..fdc3b5f3f53a 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -177,15 +177,14 @@ static void ipcns_put(struct ns_common *ns) return put_ipc_ns(to_ipc_ns(ns)); } -static int ipcns_install(struct nsproxy *nsproxy, struct ns_common *new) +static int ipcns_install(struct nsset *nsset, struct ns_common *new) { + struct nsproxy *nsproxy = nsset->nsproxy; struct ipc_namespace *ns = to_ipc_ns(new); if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || - !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) + !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)) return -EPERM; - /* Ditch state from the old ipc namespace */ - exit_sem(current); put_ipc_ns(nsproxy->ipc_ns); nsproxy->ipc_ns = get_ipc_ns(ns); return 0; diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index b05f1dd58a62..812a61afd538 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -95,11 +95,12 @@ static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns) return container_of(ns, struct cgroup_namespace, ns); } -static int cgroupns_install(struct nsproxy *nsproxy, struct ns_common *ns) +static int cgroupns_install(struct nsset *nsset, struct ns_common *ns) { + struct nsproxy *nsproxy = nsset->nsproxy; struct cgroup_namespace *cgroup_ns = to_cg_ns(ns); - if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN) || + if (!ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN) || !ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index ed9882108cd2..b03df67621d0 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -19,6 +19,8 @@ #include <net/net_namespace.h> #include <linux/ipc_namespace.h> #include <linux/time_namespace.h> +#include <linux/fs_struct.h> +#include <linux/proc_fs.h> #include <linux/proc_ns.h> #include <linux/file.h> #include <linux/syscalls.h> @@ -257,37 +259,296 @@ void exit_task_namespaces(struct task_struct *p) switch_task_namespaces(p, NULL); } -SYSCALL_DEFINE2(setns, int, fd, int, nstype) +static int check_setns_flags(unsigned long flags) { - struct task_struct *tsk = current; - struct nsproxy *new_nsproxy; - struct file *file; - struct ns_common *ns; - int err; + if (!flags || (flags & ~(CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | + CLONE_NEWNET | CLONE_NEWUSER | CLONE_NEWPID | + CLONE_NEWCGROUP))) + return -EINVAL; - file = proc_ns_fget(fd); - if (IS_ERR(file)) - return PTR_ERR(file); +#ifndef CONFIG_USER_NS + if (flags & CLONE_NEWUSER) + return -EINVAL; +#endif +#ifndef CONFIG_PID_NS + if (flags & CLONE_NEWPID) + return -EINVAL; +#endif +#ifndef CONFIG_UTS_NS + if (flags & CLONE_NEWUTS) + return -EINVAL; +#endif +#ifndef CONFIG_IPC_NS + if (flags & CLONE_NEWIPC) + return -EINVAL; +#endif +#ifndef CONFIG_CGROUPS + if (flags & CLONE_NEWCGROUP) + return -EINVAL; +#endif +#ifndef CONFIG_NET_NS + if (flags & CLONE_NEWNET) + return -EINVAL; +#endif - err = -EINVAL; - ns = get_proc_ns(file_inode(file)); - if (nstype && (ns->ops->type != nstype)) - goto out; + return 0; +} + +static void put_nsset(struct nsset *nsset) +{ + unsigned flags = nsset->flags; + + if (flags & CLONE_NEWUSER) + put_cred(nsset_cred(nsset)); + /* + * We only created a temporary copy if we attached to more than just + * the mount namespace. + */ + if (nsset->fs && (flags & CLONE_NEWNS) && (flags & ~CLONE_NEWNS)) + free_fs_struct(nsset->fs); + if (nsset->nsproxy) + free_nsproxy(nsset->nsproxy); +} + +static int prepare_nsset(unsigned flags, struct nsset *nsset) +{ + struct task_struct *me = current; + + nsset->nsproxy = create_new_namespaces(0, me, current_user_ns(), me->fs); + if (IS_ERR(nsset->nsproxy)) + return PTR_ERR(nsset->nsproxy); - new_nsproxy = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs); - if (IS_ERR(new_nsproxy)) { - err = PTR_ERR(new_nsproxy); + if (flags & CLONE_NEWUSER) + nsset->cred = prepare_creds(); + else + nsset->cred = current_cred(); + if (!nsset->cred) goto out; + + /* Only create a temporary copy of fs_struct if we really need to. */ + if (flags == CLONE_NEWNS) { + nsset->fs = me->fs; + } else if (flags & CLONE_NEWNS) { + nsset->fs = copy_fs_struct(me->fs); + if (!nsset->fs) + goto out; } - err = ns->ops->install(new_nsproxy, ns); - if (err) { - free_nsproxy(new_nsproxy); - goto out; + nsset->flags = flags; + return 0; + +out: + put_nsset(nsset); + return -ENOMEM; +} + +static inline int validate_ns(struct nsset *nsset, struct ns_common *ns) +{ + return ns->ops->install(nsset, ns); +} + +/* + * This is the inverse operation to unshare(). + * Ordering is equivalent to the standard ordering used everywhere else + * during unshare and process creation. The switch to the new set of + * namespaces occurs at the point of no return after installation of + * all requested namespaces was successful in commit_nsset(). + */ +static int validate_nsset(struct nsset *nsset, struct pid *pid) +{ + int ret = 0; + unsigned flags = nsset->flags; + struct user_namespace *user_ns = NULL; + struct pid_namespace *pid_ns = NULL; + struct nsproxy *nsp; + struct task_struct *tsk; + + /* Take a "snapshot" of the target task's namespaces. */ + rcu_read_lock(); + tsk = pid_task(pid, PIDTYPE_PID); + if (!tsk) { + rcu_read_unlock(); + return -ESRCH; + } + + if (!ptrace_may_access(tsk, PTRACE_MODE_READ_REALCREDS)) { + rcu_read_unlock(); + return -EPERM; + } + + task_lock(tsk); + nsp = tsk->nsproxy; + if (nsp) + get_nsproxy(nsp); + task_unlock(tsk); + if (!nsp) { + rcu_read_unlock(); + return -ESRCH; + } + +#ifdef CONFIG_PID_NS + if (flags & CLONE_NEWPID) { + pid_ns = task_active_pid_ns(tsk); + if (unlikely(!pid_ns)) { + rcu_read_unlock(); + ret = -ESRCH; + goto out; + } + get_pid_ns(pid_ns); + } +#endif + +#ifdef CONFIG_USER_NS + if (flags & CLONE_NEWUSER) + user_ns = get_user_ns(__task_cred(tsk)->user_ns); +#endif + rcu_read_unlock(); + + /* + * Install requested namespaces. The caller will have + * verified earlier that the requested namespaces are + * supported on this kernel. We don't report errors here + * if a namespace is requested that isn't supported. + */ +#ifdef CONFIG_USER_NS + if (flags & CLONE_NEWUSER) { + ret = validate_ns(nsset, &user_ns->ns); + if (ret) + goto out; + } +#endif + + if (flags & CLONE_NEWNS) { + ret = validate_ns(nsset, from_mnt_ns(nsp->mnt_ns)); + if (ret) + goto out; + } + +#ifdef CONFIG_UTS_NS + if (flags & CLONE_NEWUTS) { + ret = validate_ns(nsset, &nsp->uts_ns->ns); + if (ret) + goto out; + } +#endif + +#ifdef CONFIG_IPC_NS + if (flags & CLONE_NEWIPC) { + ret = validate_ns(nsset, &nsp->ipc_ns->ns); + if (ret) + goto out; + } +#endif + +#ifdef CONFIG_PID_NS + if (flags & CLONE_NEWPID) { + ret = validate_ns(nsset, &pid_ns->ns); + if (ret) + goto out; + } +#endif + +#ifdef CONFIG_CGROUPS + if (flags & CLONE_NEWCGROUP) { + ret = validate_ns(nsset, &nsp->cgroup_ns->ns); + if (ret) + goto out; + } +#endif + +#ifdef CONFIG_NET_NS + if (flags & CLONE_NEWNET) { + ret = validate_ns(nsset, &nsp->net_ns->ns); + if (ret) + goto out; + } +#endif + +out: + if (pid_ns) + put_pid_ns(pid_ns); + if (nsp) + put_nsproxy(nsp); + put_user_ns(user_ns); + + return ret; +} + +/* + * This is the point of no return. There are just a few namespaces + * that do some actual work here and it's sufficiently minimal that + * a separate ns_common operation seems unnecessary for now. + * Unshare is doing the same thing. If we'll end up needing to do + * more in a given namespace or a helper here is ultimately not + * exported anymore a simple commit handler for each namespace + * should be added to ns_common. + */ +static void commit_nsset(struct nsset *nsset) +{ + unsigned flags = nsset->flags; + struct task_struct *me = current; + +#ifdef CONFIG_USER_NS + if (flags & CLONE_NEWUSER) { + /* transfer ownership */ + commit_creds(nsset_cred(nsset)); + nsset->cred = NULL; + } +#endif + + /* We only need to commit if we have used a temporary fs_struct. */ + if ((flags & CLONE_NEWNS) && (flags & ~CLONE_NEWNS)) { + set_fs_root(me->fs, &nsset->fs->root); + set_fs_pwd(me->fs, &nsset->fs->pwd); } - switch_task_namespaces(tsk, new_nsproxy); - perf_event_namespaces(tsk); +#ifdef CONFIG_IPC_NS + if (flags & CLONE_NEWIPC) + exit_sem(me); +#endif + + /* transfer ownership */ + switch_task_namespaces(me, nsset->nsproxy); + nsset->nsproxy = NULL; +} + +SYSCALL_DEFINE2(setns, int, fd, int, flags) +{ + struct file *file; + struct ns_common *ns = NULL; + struct nsset nsset = {}; + int err = 0; + + file = fget(fd); + if (!file) + return -EBADF; + + if (proc_ns_file(file)) { + ns = get_proc_ns(file_inode(file)); + if (flags && (ns->ops->type != flags)) + err = -EINVAL; + flags = ns->ops->type; + } else if (!IS_ERR(pidfd_pid(file))) { + err = check_setns_flags(flags); + } else { + err = -EBADF; + } + if (err) + goto out; + + err = prepare_nsset(flags, &nsset); + if (err) + goto out; + + if (proc_ns_file(file)) + err = validate_ns(&nsset, ns); + else + err = validate_nsset(&nsset, file->private_data); + if (!err) { + commit_nsset(&nsset); + perf_event_namespaces(current); + } + put_nsset(&nsset); out: fput(file); return err; diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 01f8ba32cc0c..11db2bdbb41e 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -378,13 +378,14 @@ static void pidns_put(struct ns_common *ns) put_pid_ns(to_pid_ns(ns)); } -static int pidns_install(struct nsproxy *nsproxy, struct ns_common *ns) +static int pidns_install(struct nsset *nsset, struct ns_common *ns) { + struct nsproxy *nsproxy = nsset->nsproxy; struct pid_namespace *active = task_active_pid_ns(current); struct pid_namespace *ancestor, *new = to_pid_ns(ns); if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) || - !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) + !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)) return -EPERM; /* diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 53bce347cd50..5d9fc22d836a 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -280,8 +280,9 @@ static void timens_put(struct ns_common *ns) put_time_ns(to_time_ns(ns)); } -static int timens_install(struct nsproxy *nsproxy, struct ns_common *new) +static int timens_install(struct nsset *nsset, struct ns_common *new) { + struct nsproxy *nsproxy = nsset->nsproxy; struct time_namespace *ns = to_time_ns(new); int err; @@ -289,7 +290,7 @@ static int timens_install(struct nsproxy *nsproxy, struct ns_common *new) return -EUSERS; if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || - !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) + !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)) return -EPERM; timens_set_vvar_page(current, ns); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 8eadadc478f9..87804e0371fe 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -1253,7 +1253,7 @@ static void userns_put(struct ns_common *ns) put_user_ns(to_user_ns(ns)); } -static int userns_install(struct nsproxy *nsproxy, struct ns_common *ns) +static int userns_install(struct nsset *nsset, struct ns_common *ns) { struct user_namespace *user_ns = to_user_ns(ns); struct cred *cred; @@ -1274,14 +1274,14 @@ static int userns_install(struct nsproxy *nsproxy, struct ns_common *ns) if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return -EPERM; - cred = prepare_creds(); + cred = nsset_cred(nsset); if (!cred) - return -ENOMEM; + return -EINVAL; put_user_ns(cred->user_ns); set_cred_user_ns(cred, get_user_ns(user_ns)); - return commit_creds(cred); + return 0; } struct ns_common *ns_get_owner(struct ns_common *ns) diff --git a/kernel/utsname.c b/kernel/utsname.c index f0e491193009..e488d0e2ab45 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -140,12 +140,13 @@ static void utsns_put(struct ns_common *ns) put_uts_ns(to_uts_ns(ns)); } -static int utsns_install(struct nsproxy *nsproxy, struct ns_common *new) +static int utsns_install(struct nsset *nsset, struct ns_common *new) { + struct nsproxy *nsproxy = nsset->nsproxy; struct uts_namespace *ns = to_uts_ns(new); if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || - !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) + !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)) return -EPERM; get_uts_ns(ns); diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 190ca66a383b..dcd61aca343e 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -1353,12 +1353,13 @@ static void netns_put(struct ns_common *ns) put_net(to_net_ns(ns)); } -static int netns_install(struct nsproxy *nsproxy, struct ns_common *ns) +static int netns_install(struct nsset *nsset, struct ns_common *ns) { + struct nsproxy *nsproxy = nsset->nsproxy; struct net *net = to_net_ns(ns); if (!ns_capable(net->user_ns, CAP_SYS_ADMIN) || - !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) + !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)) return -EPERM; put_net(nsproxy->net_ns); diff --git a/tools/testing/selftests/pidfd/.gitignore b/tools/testing/selftests/pidfd/.gitignore index 2d4db5afb142..973198a3ec3d 100644 --- a/tools/testing/selftests/pidfd/.gitignore +++ b/tools/testing/selftests/pidfd/.gitignore @@ -5,3 +5,4 @@ pidfd_test pidfd_wait pidfd_fdinfo_test pidfd_getfd_test +pidfd_setns_test diff --git a/tools/testing/selftests/pidfd/Makefile b/tools/testing/selftests/pidfd/Makefile index 75a545861375..f4a2f28f926b 100644 --- a/tools/testing/selftests/pidfd/Makefile +++ b/tools/testing/selftests/pidfd/Makefile @@ -1,7 +1,8 @@ # SPDX-License-Identifier: GPL-2.0-only CFLAGS += -g -I../../../../usr/include/ -pthread -TEST_GEN_PROGS := pidfd_test pidfd_fdinfo_test pidfd_open_test pidfd_poll_test pidfd_wait pidfd_getfd_test +TEST_GEN_PROGS := pidfd_test pidfd_fdinfo_test pidfd_open_test \ + pidfd_poll_test pidfd_wait pidfd_getfd_test pidfd_setns_test include ../lib.mk diff --git a/tools/testing/selftests/pidfd/config b/tools/testing/selftests/pidfd/config new file mode 100644 index 000000000000..bb11de90c0c9 --- /dev/null +++ b/tools/testing/selftests/pidfd/config @@ -0,0 +1,6 @@ +CONFIG_UTS_NS=y +CONFIG_IPC_NS=y +CONFIG_USER_NS=y +CONFIG_PID_NS=y +CONFIG_NET_NS=y +CONFIG_CGROUPS=y diff --git a/tools/testing/selftests/pidfd/pidfd_setns_test.c b/tools/testing/selftests/pidfd/pidfd_setns_test.c new file mode 100644 index 000000000000..133ec5b6cda8 --- /dev/null +++ b/tools/testing/selftests/pidfd/pidfd_setns_test.c @@ -0,0 +1,473 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define _GNU_SOURCE +#include <errno.h> +#include <fcntl.h> +#include <limits.h> +#include <linux/types.h> +#include <sched.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <syscall.h> +#include <sys/prctl.h> +#include <sys/wait.h> +#include <unistd.h> +#include <sys/socket.h> +#include <sys/stat.h> +#include <linux/kcmp.h> + +#include "pidfd.h" +#include "../clone3/clone3_selftests.h" +#include "../kselftest.h" +#include "../kselftest_harness.h" + +enum { + PIDFD_NS_USER, + PIDFD_NS_MNT, + PIDFD_NS_PID, + PIDFD_NS_UTS, + PIDFD_NS_IPC, + PIDFD_NS_NET, + PIDFD_NS_CGROUP, + PIDFD_NS_PIDCLD, + PIDFD_NS_MAX +}; + +const struct ns_info { + const char *name; + int flag; +} ns_info[] = { + [PIDFD_NS_USER] = { "user", CLONE_NEWUSER, }, + [PIDFD_NS_MNT] = { "mnt", CLONE_NEWNS, }, + [PIDFD_NS_PID] = { "pid", CLONE_NEWPID, }, + [PIDFD_NS_UTS] = { "uts", CLONE_NEWUTS, }, + [PIDFD_NS_IPC] = { "ipc", CLONE_NEWIPC, }, + [PIDFD_NS_NET] = { "net", CLONE_NEWNET, }, + [PIDFD_NS_CGROUP] = { "cgroup", CLONE_NEWCGROUP, }, + [PIDFD_NS_PIDCLD] = { "pid_for_children", 0, }, +}; + +FIXTURE(current_nsset) +{ + pid_t pid; + int pidfd; + int nsfds[PIDFD_NS_MAX]; + + pid_t child_pid_exited; + int child_pidfd_exited; + + pid_t child_pid1; + int child_pidfd1; + int child_nsfds1[PIDFD_NS_MAX]; + + pid_t child_pid2; + int child_pidfd2; + int child_nsfds2[PIDFD_NS_MAX]; +}; + +static int sys_waitid(int which, pid_t pid, int options) +{ + return syscall(__NR_waitid, which, pid, NULL, options, NULL); +} + +pid_t create_child(int *pidfd, unsigned flags) +{ + struct clone_args args = { + .flags = CLONE_PIDFD | flags, + .exit_signal = SIGCHLD, + .pidfd = ptr_to_u64(pidfd), + }; + + return sys_clone3(&args, sizeof(struct clone_args)); +} + +FIXTURE_SETUP(current_nsset) +{ + int i, proc_fd, ret; + + for (i = 0; i < PIDFD_NS_MAX; i++) { + self->nsfds[i] = -EBADF; + self->child_nsfds1[i] = -EBADF; + self->child_nsfds2[i] = -EBADF; + } + + proc_fd = open("/proc/self/ns", O_DIRECTORY | O_CLOEXEC); + ASSERT_GE(proc_fd, 0) { + TH_LOG("%m - Failed to open /proc/self/ns"); + } + + self->pid = getpid(); + for (i = 0; i < PIDFD_NS_MAX; i++) { + const struct ns_info *info = &ns_info[i]; + self->nsfds[i] = openat(proc_fd, info->name, O_RDONLY | O_CLOEXEC); + if (self->nsfds[i] < 0) { + EXPECT_EQ(errno, ENOENT) { + TH_LOG("%m - Failed to open %s namespace for process %d", + info->name, self->pid); + } + } + } + + self->pidfd = sys_pidfd_open(self->pid, 0); + EXPECT_GT(self->pidfd, 0) { + TH_LOG("%m - Failed to open pidfd for process %d", self->pid); + } + + /* Create task that exits right away. */ + self->child_pid_exited = create_child(&self->child_pidfd_exited, + CLONE_NEWUSER | CLONE_NEWNET); + EXPECT_GT(self->child_pid_exited, 0); + + if (self->child_pid_exited == 0) + _exit(EXIT_SUCCESS); + + ASSERT_EQ(sys_waitid(P_PID, self->child_pid_exited, WEXITED | WNOWAIT), 0); + + self->pidfd = sys_pidfd_open(self->pid, 0); + EXPECT_GE(self->pidfd, 0) { + TH_LOG("%m - Failed to open pidfd for process %d", self->pid); + } + + /* Create tasks that will be stopped. */ + self->child_pid1 = create_child(&self->child_pidfd1, + CLONE_NEWUSER | CLONE_NEWNS | + CLONE_NEWCGROUP | CLONE_NEWIPC | + CLONE_NEWUTS | CLONE_NEWPID | + CLONE_NEWNET); + EXPECT_GE(self->child_pid1, 0); + + if (self->child_pid1 == 0) { + pause(); + _exit(EXIT_SUCCESS); + } + + self->child_pid2 = create_child(&self->child_pidfd2, + CLONE_NEWUSER | CLONE_NEWNS | + CLONE_NEWCGROUP | CLONE_NEWIPC | + CLONE_NEWUTS | CLONE_NEWPID | + CLONE_NEWNET); + EXPECT_GE(self->child_pid2, 0); + + if (self->child_pid2 == 0) { + pause(); + _exit(EXIT_SUCCESS); + } + + for (i = 0; i < PIDFD_NS_MAX; i++) { + char p[100]; + + const struct ns_info *info = &ns_info[i]; + + self->nsfds[i] = openat(proc_fd, info->name, O_RDONLY | O_CLOEXEC); + if (self->nsfds[i] < 0) { + EXPECT_EQ(errno, ENOENT) { + TH_LOG("%m - Failed to open %s namespace for process %d", + info->name, self->pid); + } + } + + ret = snprintf(p, sizeof(p), "/proc/%d/ns/%s", + self->child_pid1, info->name); + EXPECT_GT(ret, 0); + EXPECT_LT(ret, sizeof(p)); + + self->child_nsfds1[i] = open(p, O_RDONLY | O_CLOEXEC); + if (self->child_nsfds1[i] < 0) { + EXPECT_EQ(errno, ENOENT) { + TH_LOG("%m - Failed to open %s namespace for process %d", + info->name, self->child_pid1); + } + } + + ret = snprintf(p, sizeof(p), "/proc/%d/ns/%s", + self->child_pid2, info->name); + EXPECT_GT(ret, 0); + EXPECT_LT(ret, sizeof(p)); + + self->child_nsfds2[i] = open(p, O_RDONLY | O_CLOEXEC); + if (self->child_nsfds2[i] < 0) { + EXPECT_EQ(errno, ENOENT) { + TH_LOG("%m - Failed to open %s namespace for process %d", + info->name, self->child_pid1); + } + } + } + + close(proc_fd); +} + +FIXTURE_TEARDOWN(current_nsset) +{ + int i; + + ASSERT_EQ(sys_pidfd_send_signal(self->child_pidfd1, + SIGKILL, NULL, 0), 0); + ASSERT_EQ(sys_pidfd_send_signal(self->child_pidfd2, + SIGKILL, NULL, 0), 0); + + for (i = 0; i < PIDFD_NS_MAX; i++) { + if (self->nsfds[i] >= 0) + close(self->nsfds[i]); + if (self->child_nsfds1[i] >= 0) + close(self->child_nsfds1[i]); + if (self->child_nsfds2[i] >= 0) + close(self->child_nsfds2[i]); + } + + if (self->child_pidfd1 >= 0) + EXPECT_EQ(0, close(self->child_pidfd1)); + if (self->child_pidfd2 >= 0) + EXPECT_EQ(0, close(self->child_pidfd2)); + ASSERT_EQ(sys_waitid(P_PID, self->child_pid_exited, WEXITED), 0); + ASSERT_EQ(sys_waitid(P_PID, self->child_pid1, WEXITED), 0); + ASSERT_EQ(sys_waitid(P_PID, self->child_pid2, WEXITED), 0); +} + +static int preserve_ns(const int pid, const char *ns) +{ + int ret; + char path[50]; + + ret = snprintf(path, sizeof(path), "/proc/%d/ns/%s", pid, ns); + if (ret < 0 || (size_t)ret >= sizeof(path)) + return -EIO; + + return open(path, O_RDONLY | O_CLOEXEC); +} + +static int in_same_namespace(int ns_fd1, pid_t pid2, const char *ns) +{ + int ns_fd2 = -EBADF; + int ret = -1; + struct stat ns_st1, ns_st2; + + ret = fstat(ns_fd1, &ns_st1); + if (ret < 0) + return -1; + + ns_fd2 = preserve_ns(pid2, ns); + if (ns_fd2 < 0) + return -1; + + ret = fstat(ns_fd2, &ns_st2); + close(ns_fd2); + if (ret < 0) + return -1; + + /* processes are in the same namespace */ + if ((ns_st1.st_dev == ns_st2.st_dev) && + (ns_st1.st_ino == ns_st2.st_ino)) + return 1; + + /* processes are in different namespaces */ + return 0; +} + +/* Test that we can't pass garbage to the kernel. */ +TEST_F(current_nsset, invalid_flags) +{ + ASSERT_NE(setns(self->pidfd, 0), 0); + EXPECT_EQ(errno, EINVAL); + + ASSERT_NE(setns(self->pidfd, -1), 0); + EXPECT_EQ(errno, EINVAL); + + ASSERT_NE(setns(self->pidfd, CLONE_VM), 0); + EXPECT_EQ(errno, EINVAL); + + ASSERT_NE(setns(self->pidfd, CLONE_NEWUSER | CLONE_VM), 0); + EXPECT_EQ(errno, EINVAL); +} + +/* Test that we can't attach to a task that has already exited. */ +TEST_F(current_nsset, pidfd_exited_child) +{ + int i; + pid_t pid; + + ASSERT_NE(setns(self->child_pidfd_exited, CLONE_NEWUSER | CLONE_NEWNET), + 0); + EXPECT_EQ(errno, ESRCH); + + pid = getpid(); + for (i = 0; i < PIDFD_NS_MAX; i++) { + const struct ns_info *info = &ns_info[i]; + /* Verify that we haven't changed any namespaces. */ + if (self->nsfds[i] >= 0) + ASSERT_EQ(in_same_namespace(self->nsfds[i], pid, info->name), 1); + } +} + +TEST_F(current_nsset, pidfd_incremental_setns) +{ + int i; + pid_t pid; + + pid = getpid(); + for (i = 0; i < PIDFD_NS_MAX; i++) { + const struct ns_info *info = &ns_info[i]; + int nsfd; + + if (self->child_nsfds1[i] < 0) + continue; + + if (info->flag) { + ASSERT_EQ(setns(self->child_pidfd1, info->flag), 0) { + TH_LOG("%m - Failed to setns to %s namespace of %d via pidfd %d", + info->name, self->child_pid1, + self->child_pidfd1); + } + } + + /* Verify that we have changed to the correct namespaces. */ + if (info->flag == CLONE_NEWPID) + nsfd = self->nsfds[i]; + else + nsfd = self->child_nsfds1[i]; + ASSERT_EQ(in_same_namespace(nsfd, pid, info->name), 1) { + TH_LOG("setns failed to place us correctly into %s namespace of %d via pidfd %d", + info->name, self->child_pid1, + self->child_pidfd1); + } + TH_LOG("Managed to correctly setns to %s namespace of %d via pidfd %d", + info->name, self->child_pid1, self->child_pidfd1); + } +} + +TEST_F(current_nsset, nsfd_incremental_setns) +{ + int i; + pid_t pid; + + pid = getpid(); + for (i = 0; i < PIDFD_NS_MAX; i++) { + const struct ns_info *info = &ns_info[i]; + int nsfd; + + if (self->child_nsfds1[i] < 0) + continue; + + if (info->flag) { + ASSERT_EQ(setns(self->child_nsfds1[i], info->flag), 0) { + TH_LOG("%m - Failed to setns to %s namespace of %d via nsfd %d", + info->name, self->child_pid1, + self->child_nsfds1[i]); + } + } + + /* Verify that we have changed to the correct namespaces. */ + if (info->flag == CLONE_NEWPID) + nsfd = self->nsfds[i]; + else + nsfd = self->child_nsfds1[i]; + ASSERT_EQ(in_same_namespace(nsfd, pid, info->name), 1) { + TH_LOG("setns failed to place us correctly into %s namespace of %d via nsfd %d", + info->name, self->child_pid1, + self->child_nsfds1[i]); + } + TH_LOG("Managed to correctly setns to %s namespace of %d via nsfd %d", + info->name, self->child_pid1, self->child_nsfds1[i]); + } +} + +TEST_F(current_nsset, pidfd_one_shot_setns) +{ + unsigned flags = 0; + int i; + pid_t pid; + + for (i = 0; i < PIDFD_NS_MAX; i++) { + const struct ns_info *info = &ns_info[i]; + + if (self->child_nsfds1[i] < 0) + continue; + + flags |= info->flag; + TH_LOG("Adding %s namespace of %d to list of namespaces to attach to", + info->name, self->child_pid1); + } + + ASSERT_EQ(setns(self->child_pidfd1, flags), 0) { + TH_LOG("%m - Failed to setns to namespaces of %d", + self->child_pid1); + } + + pid = getpid(); + for (i = 0; i < PIDFD_NS_MAX; i++) { + const struct ns_info *info = &ns_info[i]; + int nsfd; + + if (self->child_nsfds1[i] < 0) + continue; + + /* Verify that we have changed to the correct namespaces. */ + if (info->flag == CLONE_NEWPID) + nsfd = self->nsfds[i]; + else + nsfd = self->child_nsfds1[i]; + ASSERT_EQ(in_same_namespace(nsfd, pid, info->name), 1) { + TH_LOG("setns failed to place us correctly into %s namespace of %d", + info->name, self->child_pid1); + } + TH_LOG("Managed to correctly setns to %s namespace of %d", + info->name, self->child_pid1); + } +} + +TEST_F(current_nsset, no_foul_play) +{ + unsigned flags = 0; + int i; + + for (i = 0; i < PIDFD_NS_MAX; i++) { + const struct ns_info *info = &ns_info[i]; + + if (self->child_nsfds1[i] < 0) + continue; + + flags |= info->flag; + if (info->flag) /* No use logging pid_for_children. */ + TH_LOG("Adding %s namespace of %d to list of namespaces to attach to", + info->name, self->child_pid1); + } + + ASSERT_EQ(setns(self->child_pidfd1, flags), 0) { + TH_LOG("%m - Failed to setns to namespaces of %d vid pidfd %d", + self->child_pid1, self->child_pidfd1); + } + + /* + * Can't setns to a user namespace outside of our hierarchy since we + * don't have caps in there and didn't create it. That means that under + * no circumstances should we be able to setns to any of the other + * ones since they aren't owned by our user namespace. + */ + for (i = 0; i < PIDFD_NS_MAX; i++) { + const struct ns_info *info = &ns_info[i]; + + if (self->child_nsfds2[i] < 0 || !info->flag) + continue; + + ASSERT_NE(setns(self->child_pidfd2, info->flag), 0) { + TH_LOG("Managed to setns to %s namespace of %d via pidfd %d", + info->name, self->child_pid2, + self->child_pidfd2); + } + TH_LOG("%m - Correctly failed to setns to %s namespace of %d via pidfd %d", + info->name, self->child_pid2, + self->child_pidfd2); + + ASSERT_NE(setns(self->child_nsfds2[i], info->flag), 0) { + TH_LOG("Managed to setns to %s namespace of %d via nsfd %d", + info->name, self->child_pid2, + self->child_nsfds2[i]); + } + TH_LOG("%m - Correctly failed to setns to %s namespace of %d via nsfd %d", + info->name, self->child_pid2, + self->child_nsfds2[i]); + } +} + +TEST_HARNESS_MAIN |