From 15d42eb26bdee47c0278fbdab4198577bc6a97b5 Mon Sep 17 00:00:00 2001 From: Christian Kellner Date: Mon, 14 Oct 2019 18:20:32 +0200 Subject: pidfd: add NSpid entries to fdinfo Currently, the fdinfo file contains the Pid field which shows the pid a given pidfd refers to in the pid namespace of the procfs instance. If pid namespaces are configured, also show an NSpid field for easy retrieval of the pid in all descendant pid namespaces. If the pid namespace of the process is not a descendant of the pid namespace of the procfs instance 0 will be shown as its first NSpid entry and no other entries will be shown. Add a block comment to pidfd_show_fdinfo with a detailed explanation of Pid and NSpid fields. Co-developed-by: Christian Brauner Signed-off-by: Christian Brauner Signed-off-by: Christian Kellner Reviewed-by: Christian Brauner Link: https://lore.kernel.org/r/20191014162034.2185-1-ckellner@redhat.com Signed-off-by: Christian Brauner --- kernel/fork.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 52 insertions(+), 1 deletion(-) diff --git a/kernel/fork.c b/kernel/fork.c index bcdf53125210..782986962d47 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1695,12 +1695,63 @@ static int pidfd_release(struct inode *inode, struct file *file) } #ifdef CONFIG_PROC_FS +/** + * pidfd_show_fdinfo - print information about a pidfd + * @m: proc fdinfo file + * @f: file referencing a pidfd + * + * Pid: + * This function will print the pid that a given pidfd refers to in the + * pid namespace of the procfs instance. + * If the pid namespace of the process is not a descendant of the pid + * namespace of the procfs instance 0 will be shown as its pid. This is + * similar to calling getppid() on a process whose parent is outside of + * its pid namespace. 
+ * + * NSpid: + * If pid namespaces are supported then this function will also print + * the pid that a given pidfd refers to for all descendant pid namespaces + * starting from the current pid namespace of the instance, i.e. the + * Pid field and the first entry in the NSpid field will be identical. + * If the pid namespace of the process is not a descendant of the pid + * namespace of the procfs instance 0 will be shown as its first NSpid + * entry and no others will be shown. + * Note that this differs from the Pid and NSpid fields in + * /proc/<pid>/status where Pid and NSpid are always shown relative to + * the pid namespace of the procfs instance. The difference becomes + * obvious when sending around a pidfd between pid namespaces from a + * different branch of the tree, i.e. where no ancestral relation is + * present between the pid namespaces: + * - create two new pid namespaces ns1 and ns2 in the initial pid + * namespace (also take care to create new mount namespaces in the + * new pid namespace and mount procfs) + * - create a process with a pidfd in ns1 + * - send pidfd from ns1 to ns2 + * - read /proc/self/fdinfo/<pidfd> and observe that both Pid and NSpid + * have exactly one entry, which is 0 + */ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) { struct pid_namespace *ns = proc_pid_ns(file_inode(m->file)); struct pid *pid = f->private_data; + pid_t nr = pid_nr_ns(pid, ns); + + seq_put_decimal_ull(m, "Pid:\t", nr); - seq_put_decimal_ull(m, "Pid:\t", pid_nr_ns(pid, ns)); +#ifdef CONFIG_PID_NS + seq_put_decimal_ull(m, "\nNSpid:\t", nr); + if (nr) { + int i; + + /* If nr is non-zero it means that 'pid' is valid and that + * ns, i.e. the pid namespace associated with the procfs + * instance, is in the pid namespace hierarchy of pid. + * Start at one below the already printed level. 
+ */ + for (i = ns->level + 1; i <= pid->level; i++) + seq_put_decimal_ull(m, "\t", pid->numbers[i].nr); + } +#endif seq_putc(m, '\n'); } #endif -- cgit v1.2.3 From 2def297ec7fbf68cedc48f69e1f600fef13f2e96 Mon Sep 17 00:00:00 2001 From: Christian Kellner Date: Mon, 14 Oct 2019 18:20:33 +0200 Subject: pidfd: add tests for NSpid info in fdinfo Add a test that checks that if pid namespaces are configured the fdinfo file of a pidfd contains an NSpid: entry containing the process id in the current and additionally all nested namespaces. In the case that a pidfd is from a pid namespace not in the same namespace hierarchy as the process accessing the fdinfo file, ensure the 'NSpid' shows 0 for that pidfd, analogous to the 'Pid' entry. Signed-off-by: Christian Kellner Acked-by: Christian Brauner Link: https://lore.kernel.org/r/20191014162034.2185-2-ckellner@redhat.com Signed-off-by: Christian Brauner --- tools/testing/selftests/pidfd/Makefile | 2 +- tools/testing/selftests/pidfd/pidfd_fdinfo_test.c | 265 ++++++++++++++++++++++ 2 files changed, 266 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/pidfd/pidfd_fdinfo_test.c diff --git a/tools/testing/selftests/pidfd/Makefile b/tools/testing/selftests/pidfd/Makefile index 7550f08822a3..43db1b98e845 100644 --- a/tools/testing/selftests/pidfd/Makefile +++ b/tools/testing/selftests/pidfd/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only CFLAGS += -g -I../../../../usr/include/ -pthread -TEST_GEN_PROGS := pidfd_test pidfd_open_test pidfd_poll_test pidfd_wait +TEST_GEN_PROGS := pidfd_test pidfd_fdinfo_test pidfd_open_test pidfd_poll_test pidfd_wait include ../lib.mk diff --git a/tools/testing/selftests/pidfd/pidfd_fdinfo_test.c b/tools/testing/selftests/pidfd/pidfd_fdinfo_test.c new file mode 100644 index 000000000000..3721be994abd --- /dev/null +++ b/tools/testing/selftests/pidfd/pidfd_fdinfo_test.c @@ -0,0 +1,265 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define _GNU_SOURCE +#include 
+#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "pidfd.h" +#include "../kselftest.h" + +struct error { + int code; + char msg[512]; +}; + +static int error_set(struct error *err, int code, const char *fmt, ...) +{ + va_list args; + int r; + + if (code == PIDFD_PASS || !err || err->code != PIDFD_PASS) + return code; + + err->code = code; + va_start(args, fmt); + r = vsnprintf(err->msg, sizeof(err->msg), fmt, args); + assert((size_t)r < sizeof(err->msg)); + va_end(args); + + return code; +} + +static void error_report(struct error *err, const char *test_name) +{ + switch (err->code) { + case PIDFD_ERROR: + ksft_exit_fail_msg("%s test: Fatal: %s\n", test_name, err->msg); + break; + + case PIDFD_FAIL: + /* will be: not ok %d # error %s test: %s */ + ksft_test_result_error("%s test: %s\n", test_name, err->msg); + break; + + case PIDFD_SKIP: + /* will be: not ok %d # SKIP %s test: %s */ + ksft_test_result_skip("%s test: %s\n", test_name, err->msg); + break; + + case PIDFD_XFAIL: + ksft_test_result_pass("%s test: Expected failure: %s\n", + test_name, err->msg); + break; + + case PIDFD_PASS: + ksft_test_result_pass("%s test: Passed\n"); + break; + + default: + ksft_exit_fail_msg("%s test: Unknown code: %d %s\n", + test_name, err->code, err->msg); + break; + } +} + +static inline int error_check(struct error *err, const char *test_name) +{ + /* In case of error we bail out and terminate the test program */ + if (err->code == PIDFD_ERROR) + error_report(err, test_name); + + return err->code; +} + +struct child { + pid_t pid; + int fd; +}; + +static struct child clone_newns(int (*fn)(void *), void *args, + struct error *err) +{ + static int flags = CLONE_PIDFD | CLONE_NEWPID | CLONE_NEWNS | SIGCHLD; + size_t stack_size = 1024; + char *stack[1024] = { 0 }; + struct child ret; + + if (!(flags & CLONE_NEWUSER) && geteuid() != 0) + flags |= CLONE_NEWUSER; + +#ifdef __ia64__ + ret.pid = __clone2(fn, stack, stack_size, 
flags, args, &ret.fd); +#else + ret.pid = clone(fn, stack + stack_size, flags, args, &ret.fd); +#endif + + if (ret.pid < 0) { + error_set(err, PIDFD_ERROR, "clone failed (ret %d, errno %d)", + ret.fd, errno); + return ret; + } + + ksft_print_msg("New child: %d, fd: %d\n", ret.pid, ret.fd); + + return ret; +} + +static inline int child_join(struct child *child, struct error *err) +{ + int r; + + (void)close(child->fd); + r = wait_for_pid(child->pid); + if (r < 0) + error_set(err, PIDFD_ERROR, "waitpid failed (ret %d, errno %d)", + r, errno); + else if (r > 0) + error_set(err, r, "child %d reported: %d", child->pid, r); + + return r; +} + +static inline void trim_newline(char *str) +{ + char *pos = strrchr(str, '\n'); + + if (pos) + *pos = '\0'; +} + +static int verify_fdinfo_nspid(int pidfd, struct error *err, + const char *expect, ...) +{ + char buffer[512] = {0, }; + char path[512] = {0, }; + va_list args; + FILE *f; + char *line = NULL; + size_t n = 0; + int found = 0; + int r; + + va_start(args, expect); + r = vsnprintf(buffer, sizeof(buffer), expect, args); + assert((size_t)r < sizeof(buffer)); + va_end(args); + + snprintf(path, sizeof(path), "/proc/self/fdinfo/%d", pidfd); + f = fopen(path, "re"); + if (!f) + return error_set(err, PIDFD_ERROR, "fdinfo open failed for %d", + pidfd); + + while (getline(&line, &n, f) != -1) { + if (strncmp(line, "NSpid:", 6)) + continue; + + found = 1; + + r = strcmp(line + 6, buffer); + if (r != 0) { + trim_newline(line); + trim_newline(buffer); + error_set(err, PIDFD_FAIL, "NSpid: '%s' != '%s'", + line + 6, buffer); + } + break; + } + + free(line); + fclose(f); + + if (found == 0) + return error_set(err, PIDFD_FAIL, "NSpid not found for fd %d", + pidfd); + + return PIDFD_PASS; +} + +static int child_fdinfo_nspid_test(void *args) +{ + struct error err; + int pidfd; + int r; + + /* if we got no fd for the sibling, we are done */ + if (!args) + return PIDFD_PASS; + + /* verify that we can not resolve the pidfd for a process + * in 
a sibling pid namespace, i.e. a pid namespace it is + * not in our or a descended namespace + */ + r = mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0); + if (r < 0) { + ksft_print_msg("Failed to remount / private\n"); + return PIDFD_ERROR; + } + + (void)umount2("/proc", MNT_DETACH); + r = mount("proc", "/proc", "proc", 0, NULL); + if (r < 0) { + ksft_print_msg("Failed to remount /proc\n"); + return PIDFD_ERROR; + } + + pidfd = *(int *)args; + r = verify_fdinfo_nspid(pidfd, &err, "\t0\n"); + + if (r != PIDFD_PASS) + ksft_print_msg("NSpid fdinfo check failed: %s\n", err.msg); + + return r; +} + +static void test_pidfd_fdinfo_nspid(void) +{ + struct child a, b; + struct error err = {0, }; + const char *test_name = "pidfd check for NSpid in fdinfo"; + + /* Create a new child in a new pid and mount namespace */ + a = clone_newns(child_fdinfo_nspid_test, NULL, &err); + error_check(&err, test_name); + + /* Pass the pidfd representing the first child to the + * second child, which will be in a sibling pid namespace, + * which means that the fdinfo NSpid entry for the pidfd + * should only contain '0'. + */ + b = clone_newns(child_fdinfo_nspid_test, &a.fd, &err); + error_check(&err, test_name); + + /* The children will have pid 1 in the new pid namespace, + * so the line must be 'NSPid:\t\t1'. + */ + verify_fdinfo_nspid(a.fd, &err, "\t%d\t%d\n", a.pid, 1); + verify_fdinfo_nspid(b.fd, &err, "\t%d\t%d\n", b.pid, 1); + + /* wait for the process, check the exit status and set + * 'err' accordingly, if it is not already set. 
+ */ + child_join(&a, &err); + child_join(&b, &err); + + error_report(&err, test_name); +} + +int main(int argc, char **argv) +{ + ksft_print_header(); + ksft_set_plan(1); + + test_pidfd_fdinfo_nspid(); + + return ksft_exit_pass(); +} -- cgit v1.2.3 From 3d6d8da48d0b214d65ea0227d47228abc75d7c88 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 17 Oct 2019 12:18:28 +0200 Subject: pidfd: check pid has attached task in fdinfo Currently, when a task is dead we still print the pid it used to use in the fdinfo files of its pidfds. This doesn't make much sense since the pid may have already been reused. So verify that the task is still alive by introducing the pid_has_task() helper which will be used by other callers in follow-up patches. If the task is not alive anymore, we will print -1. This allows us to differentiate between a task not being present in a given pid namespace - in which case we already print 0 - and a task having been reaped. Note that this uses PIDTYPE_PID for the check. Technically, we could've checked PIDTYPE_TGID since pidfds currently only refer to thread-group leaders but if they won't anymore in the future then this check becomes problematic without it being immediately obvious to non-experts imho. If a thread is created via clone(CLONE_THREAD) then struct pid has a single non-empty list pid->tasks[PIDTYPE_PID] and this pid can't be used as a PIDTYPE_TGID meaning pid->tasks[PIDTYPE_TGID] will return NULL even though the thread-group leader might still be very much alive. So checking PIDTYPE_PID is fine and is easier to maintain should we ever allow pidfds to refer to threads. 
Cc: Jann Horn Cc: Christian Kellner Cc: linux-api@vger.kernel.org Signed-off-by: Christian Brauner Reviewed-by: Oleg Nesterov Link: https://lore.kernel.org/r/20191017101832.5985-1-christian.brauner@ubuntu.com --- include/linux/pid.h | 4 ++++ kernel/fork.c | 17 +++++++++++------ 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/include/linux/pid.h b/include/linux/pid.h index 9645b1194c98..034e3cd60dc0 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -85,6 +85,10 @@ static inline struct pid *get_pid(struct pid *pid) extern void put_pid(struct pid *pid); extern struct task_struct *pid_task(struct pid *pid, enum pid_type); +static inline bool pid_has_task(struct pid *pid, enum pid_type type) +{ + return !hlist_empty(&pid->tasks[type]); +} extern struct task_struct *get_pid_task(struct pid *pid, enum pid_type); extern struct pid *get_task_pid(struct task_struct *task, enum pid_type type); diff --git a/kernel/fork.c b/kernel/fork.c index 782986962d47..ffa314838b43 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1732,15 +1732,20 @@ static int pidfd_release(struct inode *inode, struct file *file) */ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) { - struct pid_namespace *ns = proc_pid_ns(file_inode(m->file)); struct pid *pid = f->private_data; - pid_t nr = pid_nr_ns(pid, ns); + struct pid_namespace *ns; + pid_t nr = -1; - seq_put_decimal_ull(m, "Pid:\t", nr); + if (likely(pid_has_task(pid, PIDTYPE_PID))) { + ns = proc_pid_ns(file_inode(m->file)); + nr = pid_nr_ns(pid, ns); + } + + seq_put_decimal_ll(m, "Pid:\t", nr); #ifdef CONFIG_PID_NS - seq_put_decimal_ull(m, "\nNSpid:\t", nr); - if (nr) { + seq_put_decimal_ll(m, "\nNSpid:\t", nr); + if (nr > 0) { int i; /* If nr is non-zero it means that 'pid' is valid and that @@ -1749,7 +1754,7 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) * Start at one below the already printed level. 
*/ for (i = ns->level + 1; i <= pid->level; i++) - seq_put_decimal_ull(m, "\t", pid->numbers[i].nr); + seq_put_decimal_ll(m, "\t", pid->numbers[i].nr); } #endif seq_putc(m, '\n'); -- cgit v1.2.3 From 67fc700016b75d6306b65d68edd2e5d76b2b1160 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 17 Oct 2019 12:18:29 +0200 Subject: test: verify fdinfo for pidfd of reaped process Test that the fdinfo field of a pidfd referring to a dead process correctly shows Pid: -1 and NSpid: -1. Cc: Christian Kellner Cc: linux-kselftest@vger.kernel.org Reviewed-by: Christian Kellner Signed-off-by: Christian Brauner Link: https://lore.kernel.org/r/20191017101832.5985-2-christian.brauner@ubuntu.com --- tools/testing/selftests/pidfd/pidfd_fdinfo_test.c | 59 +++++++++++++++++------ 1 file changed, 45 insertions(+), 14 deletions(-) diff --git a/tools/testing/selftests/pidfd/pidfd_fdinfo_test.c b/tools/testing/selftests/pidfd/pidfd_fdinfo_test.c index 3721be994abd..22558524f71c 100644 --- a/tools/testing/selftests/pidfd/pidfd_fdinfo_test.c +++ b/tools/testing/selftests/pidfd/pidfd_fdinfo_test.c @@ -113,11 +113,15 @@ static struct child clone_newns(int (*fn)(void *), void *args, return ret; } +static inline void child_close(struct child *child) +{ + close(child->fd); +} + static inline int child_join(struct child *child, struct error *err) { int r; - (void)close(child->fd); r = wait_for_pid(child->pid); if (r < 0) error_set(err, PIDFD_ERROR, "waitpid failed (ret %d, errno %d)", @@ -128,6 +132,12 @@ static inline int child_join(struct child *child, struct error *err) return r; } +static inline int child_join_close(struct child *child, struct error *err) +{ + child_close(child); + return child_join(child, err); +} + static inline void trim_newline(char *str) { char *pos = strrchr(str, '\n'); @@ -136,8 +146,8 @@ static inline void trim_newline(char *str) *pos = '\0'; } -static int verify_fdinfo_nspid(int pidfd, struct error *err, - const char *expect, ...) 
+static int verify_fdinfo(int pidfd, struct error *err, const char *prefix, + size_t prefix_len, const char *expect, ...) { char buffer[512] = {0, }; char path[512] = {0, }; @@ -160,17 +170,20 @@ static int verify_fdinfo_nspid(int pidfd, struct error *err, pidfd); while (getline(&line, &n, f) != -1) { - if (strncmp(line, "NSpid:", 6)) + char *val; + + if (strncmp(line, prefix, prefix_len)) continue; found = 1; - r = strcmp(line + 6, buffer); + val = line + prefix_len; + r = strcmp(val, buffer); if (r != 0) { trim_newline(line); trim_newline(buffer); - error_set(err, PIDFD_FAIL, "NSpid: '%s' != '%s'", - line + 6, buffer); + error_set(err, PIDFD_FAIL, "%s '%s' != '%s'", + prefix, val, buffer); } break; } @@ -179,8 +192,8 @@ static int verify_fdinfo_nspid(int pidfd, struct error *err, fclose(f); if (found == 0) - return error_set(err, PIDFD_FAIL, "NSpid not found for fd %d", - pidfd); + return error_set(err, PIDFD_FAIL, "%s not found for fd %d", + prefix, pidfd); return PIDFD_PASS; } @@ -213,7 +226,7 @@ static int child_fdinfo_nspid_test(void *args) } pidfd = *(int *)args; - r = verify_fdinfo_nspid(pidfd, &err, "\t0\n"); + r = verify_fdinfo(pidfd, &err, "NSpid:", 6, "\t0\n"); if (r != PIDFD_PASS) ksft_print_msg("NSpid fdinfo check failed: %s\n", err.msg); @@ -242,24 +255,42 @@ static void test_pidfd_fdinfo_nspid(void) /* The children will have pid 1 in the new pid namespace, * so the line must be 'NSPid:\t\t1'. */ - verify_fdinfo_nspid(a.fd, &err, "\t%d\t%d\n", a.pid, 1); - verify_fdinfo_nspid(b.fd, &err, "\t%d\t%d\n", b.pid, 1); + verify_fdinfo(a.fd, &err, "NSpid:", 6, "\t%d\t%d\n", a.pid, 1); + verify_fdinfo(b.fd, &err, "NSpid:", 6, "\t%d\t%d\n", b.pid, 1); /* wait for the process, check the exit status and set * 'err' accordingly, if it is not already set. 
*/ + child_join_close(&a, &err); + child_join_close(&b, &err); + + error_report(&err, test_name); +} + +static void test_pidfd_dead_fdinfo(void) +{ + struct child a; + struct error err = {0, }; + const char *test_name = "pidfd check fdinfo for dead process"; + + /* Create a new child in a new pid and mount namespace */ + a = clone_newns(child_fdinfo_nspid_test, NULL, &err); + error_check(&err, test_name); child_join(&a, &err); - child_join(&b, &err); + verify_fdinfo(a.fd, &err, "Pid:", 4, "\t-1\n"); + verify_fdinfo(a.fd, &err, "NSpid:", 6, "\t-1\n"); + child_close(&a); error_report(&err, test_name); } int main(int argc, char **argv) { ksft_print_header(); - ksft_set_plan(1); + ksft_set_plan(2); test_pidfd_fdinfo_nspid(); + test_pidfd_dead_fdinfo(); return ksft_exit_pass(); } -- cgit v1.2.3 From 1d416a113f0c0da7a3608ffe4d44bd8e9c4859fa Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 17 Oct 2019 12:18:30 +0200 Subject: pid: use pid_has_task() in __change_pid() Replace hlist_empty() with the new pid_has_task() helper which is more idiomatic, easier to grep for, and unifies how callers perform this check. 
Signed-off-by: Christian Brauner Reviewed-by: Oleg Nesterov Link: https://lore.kernel.org/r/20191017101832.5985-3-christian.brauner@ubuntu.com --- kernel/pid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/pid.c b/kernel/pid.c index 0a9f2e437217..124d40b239b1 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -299,7 +299,7 @@ static void __change_pid(struct task_struct *task, enum pid_type type, *pid_ptr = new; for (tmp = PIDTYPE_MAX; --tmp >= 0; ) - if (!hlist_empty(&pid->tasks[tmp])) + if (pid_has_task(pid, tmp)) return; free_pid(pid); -- cgit v1.2.3 From 1722c14a2097634a7ba37000c0ec7d9409918b64 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 17 Oct 2019 12:18:31 +0200 Subject: exit: use pid_has_task() in do_wait() Replace hlist_empty() with the new pid_has_task() helper which is more idiomatic, easier to grep for, and unifies how callers perform this check. Signed-off-by: Christian Brauner Reviewed-by: Oleg Nesterov Link: https://lore.kernel.org/r/20191017101832.5985-4-christian.brauner@ubuntu.com --- kernel/exit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/exit.c b/kernel/exit.c index a46a50d67002..f2d20ab74422 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1457,7 +1457,7 @@ repeat: */ wo->notask_error = -ECHILD; if ((wo->wo_type < PIDTYPE_MAX) && - (!wo->wo_pid || hlist_empty(&wo->wo_pid->tasks[wo->wo_type]))) + (!wo->wo_pid || !pid_has_task(wo->wo_pid, wo->wo_type))) goto notask; set_current_state(TASK_INTERRUPTIBLE); -- cgit v1.2.3 From 1e1d0f0b1a3e3533ea4cd4021eb251e53827c70b Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Thu, 17 Oct 2019 12:18:32 +0200 Subject: pid: use pid_has_task() in pidfd_open() Use the new pid_has_task() helper in pidfd_open(). This simplifies the code and avoids taking rcu_read_{lock,unlock}() and leads to overall nicer code. 
Signed-off-by: Christian Brauner Reviewed-by: Oleg Nesterov Link: https://lore.kernel.org/r/20191017101832.5985-5-christian.brauner@ubuntu.com --- kernel/pid.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/kernel/pid.c b/kernel/pid.c index 124d40b239b1..7b5f6c963d72 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -497,7 +497,7 @@ static int pidfd_create(struct pid *pid) */ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags) { - int fd, ret; + int fd; struct pid *p; if (flags) @@ -510,13 +510,11 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags) if (!p) return -ESRCH; - ret = 0; - rcu_read_lock(); - if (!pid_task(p, PIDTYPE_TGID)) - ret = -EINVAL; - rcu_read_unlock(); + if (pid_has_task(p, PIDTYPE_TGID)) + fd = pidfd_create(p); + else + fd = -EINVAL; - fd = ret ?: pidfd_create(p); put_pid(p); return fd; } -- cgit v1.2.3 From b612e5df4587c934bd056bf05f4a1deca4de4f75 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 14 Oct 2019 12:45:37 +0200 Subject: clone3: add CLONE_CLEAR_SIGHAND Reset all signal handlers of the child not set to SIG_IGN to SIG_DFL. Mutually exclusive with CLONE_SIGHAND to not disturb other threads' signal handlers. In the spirit of closer cooperation between glibc developers and kernel developers (cf. [2]) this patchset came out of a discussion on the glibc mailing list for improving posix_spawn() (cf. [1], [3], [4]). Kernel support for this feature has been explicitly requested by glibc and I see no reason not to help them with this. The child helper process on Linux posix_spawn must ensure that no signal handlers are enabled, so the signal disposition must be either SIG_DFL or SIG_IGN. However, it requires a sigprocmask to obtain the current signal mask and at least _NSIG sigaction calls to reset the signal handlers for each posix_spawn call or complex state tracking that might lead to data corruption in glibc. Adding this flag lets glibc avoid these problems. 
[1]: https://www.sourceware.org/ml/libc-alpha/2019-10/msg00149.html [3]: https://www.sourceware.org/ml/libc-alpha/2019-10/msg00158.html [4]: https://www.sourceware.org/ml/libc-alpha/2019-10/msg00160.html [2]: https://lwn.net/Articles/799331/ '[...] by asking for better cooperation with the C-library projects in general. They should be copied on patches containing ABI changes, for example. I noted that there are often times where C-library developers wish the kernel community had done things differently; how could those be avoided in the future? Members of the audience suggested that more glibc developers should perhaps join the linux-api list. The other suggestion was to "copy Florian on everything".' Cc: Florian Weimer Cc: libc-alpha@sourceware.org Cc: linux-api@vger.kernel.org Signed-off-by: Christian Brauner Reviewed-by: Oleg Nesterov Link: https://lore.kernel.org/r/20191014104538.3096-1-christian.brauner@ubuntu.com --- include/uapi/linux/sched.h | 3 +++ kernel/fork.c | 16 +++++++++++----- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index 99335e1f4a27..1d500ed03c63 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -33,6 +33,9 @@ #define CLONE_NEWNET 0x40000000 /* New network namespace */ #define CLONE_IO 0x80000000 /* Clone io context */ +/* Flags for the clone3() syscall. */ +#define CLONE_CLEAR_SIGHAND 0x100000000ULL /* Clear any signal handler and reset to SIG_DFL. 
*/ + #ifndef __ASSEMBLY__ /** * struct clone_args - arguments for the clone3 syscall diff --git a/kernel/fork.c b/kernel/fork.c index ffa314838b43..954e875e72b1 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1517,6 +1517,11 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) spin_lock_irq(&current->sighand->siglock); memcpy(sig->action, current->sighand->action, sizeof(sig->action)); spin_unlock_irq(&current->sighand->siglock); + + /* Reset all signal handlers not set to SIG_IGN to SIG_DFL. */ + if (clone_flags & CLONE_CLEAR_SIGHAND) + flush_signal_handlers(tsk, 0); + return 0; } @@ -2619,11 +2624,8 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, static bool clone3_args_valid(const struct kernel_clone_args *kargs) { - /* - * All lower bits of the flag word are taken. - * Verify that no other unknown flags are passed along. - */ - if (kargs->flags & ~CLONE_LEGACY_FLAGS) + /* Verify that no unknown flags are passed along. */ + if (kargs->flags & ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND)) return false; /* @@ -2633,6 +2635,10 @@ static bool clone3_args_valid(const struct kernel_clone_args *kargs) if (kargs->flags & (CLONE_DETACHED | CSIGNAL)) return false; + if ((kargs->flags & (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND)) == + (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND)) + return false; + if ((kargs->flags & (CLONE_THREAD | CLONE_PARENT)) && kargs->exit_signal) return false; -- cgit v1.2.3 From de5287235631cc561716d85f984614ef9598a5cc Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 14 Oct 2019 12:45:38 +0200 Subject: tests: test CLONE_CLEAR_SIGHAND Test that CLONE_CLEAR_SIGHAND resets signal handlers to SIG_DFL for the child process and that CLONE_CLEAR_SIGHAND and CLONE_SIGHAND are mutually exclusive. 
Cc: Florian Weimer Cc: libc-alpha@sourceware.org Cc: linux-api@vger.kernel.org Signed-off-by: Christian Brauner Link: https://lore.kernel.org/r/20191014104538.3096-2-christian.brauner@ubuntu.com --- MAINTAINERS | 1 + tools/testing/selftests/Makefile | 1 + tools/testing/selftests/clone3/.gitignore | 1 + tools/testing/selftests/clone3/Makefile | 6 + .../selftests/clone3/clone3_clear_sighand.c | 172 +++++++++++++++++++++ 5 files changed, 181 insertions(+) create mode 100644 tools/testing/selftests/clone3/.gitignore create mode 100644 tools/testing/selftests/clone3/Makefile create mode 100644 tools/testing/selftests/clone3/clone3_clear_sighand.c diff --git a/MAINTAINERS b/MAINTAINERS index a69e6db80c79..ab09c34b8a1e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12826,6 +12826,7 @@ S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux.git F: samples/pidfd/ F: tools/testing/selftests/pidfd/ +F: tools/testing/selftests/clone3/ K: (?i)pidfd K: (?i)clone3 K: \b(clone_args|kernel_clone_args)\b diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 4cdbae6f4e61..ad442364218a 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -4,6 +4,7 @@ TARGETS += bpf TARGETS += breakpoints TARGETS += capabilities TARGETS += cgroup +TARGETS += clone3 TARGETS += cpufreq TARGETS += cpu-hotplug TARGETS += drivers/dma-buf diff --git a/tools/testing/selftests/clone3/.gitignore b/tools/testing/selftests/clone3/.gitignore new file mode 100644 index 000000000000..6c9f98097774 --- /dev/null +++ b/tools/testing/selftests/clone3/.gitignore @@ -0,0 +1 @@ +clone3_clear_sighand diff --git a/tools/testing/selftests/clone3/Makefile b/tools/testing/selftests/clone3/Makefile new file mode 100644 index 000000000000..e6f259321e16 --- /dev/null +++ b/tools/testing/selftests/clone3/Makefile @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: GPL-2.0-only +CFLAGS += -g -I../../../../usr/include/ + +TEST_GEN_PROGS := 
clone3_clear_sighand + +include ../lib.mk diff --git a/tools/testing/selftests/clone3/clone3_clear_sighand.c b/tools/testing/selftests/clone3/clone3_clear_sighand.c new file mode 100644 index 000000000000..0d957be1bdc5 --- /dev/null +++ b/tools/testing/selftests/clone3/clone3_clear_sighand.c @@ -0,0 +1,172 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../kselftest.h" + +#ifndef CLONE_CLEAR_SIGHAND +#define CLONE_CLEAR_SIGHAND 0x100000000ULL +#endif + +#ifndef __NR_clone3 +#define __NR_clone3 -1 +struct clone_args { + __aligned_u64 flags; + __aligned_u64 pidfd; + __aligned_u64 child_tid; + __aligned_u64 parent_tid; + __aligned_u64 exit_signal; + __aligned_u64 stack; + __aligned_u64 stack_size; + __aligned_u64 tls; +}; +#endif + +static pid_t sys_clone3(struct clone_args *args, size_t size) +{ + return syscall(__NR_clone3, args, size); +} + +static void test_clone3_supported(void) +{ + pid_t pid; + struct clone_args args = {}; + + if (__NR_clone3 < 0) + ksft_exit_skip("clone3() syscall is not supported\n"); + + /* Set to something that will always cause EINVAL. 
*/ + args.exit_signal = -1; + pid = sys_clone3(&args, sizeof(args)); + if (!pid) + exit(EXIT_SUCCESS); + + if (pid > 0) { + wait(NULL); + ksft_exit_fail_msg( + "Managed to create child process with invalid exit_signal\n"); + } + + if (errno == ENOSYS) + ksft_exit_skip("clone3() syscall is not supported\n"); + + ksft_print_msg("clone3() syscall supported\n"); +} + +static void nop_handler(int signo) +{ +} + +static int wait_for_pid(pid_t pid) +{ + int status, ret; + +again: + ret = waitpid(pid, &status, 0); + if (ret == -1) { + if (errno == EINTR) + goto again; + + return -1; + } + + if (!WIFEXITED(status)) + return -1; + + return WEXITSTATUS(status); +} + +static void test_clone3_clear_sighand(void) +{ + int ret; + pid_t pid; + struct clone_args args = {}; + struct sigaction act; + + /* + * Check that CLONE_CLEAR_SIGHAND and CLONE_SIGHAND are mutually + * exclusive. + */ + args.flags |= CLONE_CLEAR_SIGHAND | CLONE_SIGHAND; + args.exit_signal = SIGCHLD; + pid = sys_clone3(&args, sizeof(args)); + if (pid > 0) + ksft_exit_fail_msg( + "clone3(CLONE_CLEAR_SIGHAND | CLONE_SIGHAND) succeeded\n"); + + act.sa_handler = nop_handler; + ret = sigemptyset(&act.sa_mask); + if (ret < 0) + ksft_exit_fail_msg("%s - sigemptyset() failed\n", + strerror(errno)); + + act.sa_flags = 0; + + /* Register signal handler for SIGUSR1 */ + ret = sigaction(SIGUSR1, &act, NULL); + if (ret < 0) + ksft_exit_fail_msg( + "%s - sigaction(SIGUSR1, &act, NULL) failed\n", + strerror(errno)); + + /* Register signal handler for SIGUSR2 */ + ret = sigaction(SIGUSR2, &act, NULL); + if (ret < 0) + ksft_exit_fail_msg( + "%s - sigaction(SIGUSR2, &act, NULL) failed\n", + strerror(errno)); + + /* Check that CLONE_CLEAR_SIGHAND works. 
+ */ + args.flags = CLONE_CLEAR_SIGHAND; + pid = sys_clone3(&args, sizeof(args)); + if (pid < 0) + ksft_exit_fail_msg("%s - clone3(CLONE_CLEAR_SIGHAND) failed\n", + strerror(errno)); + + if (pid == 0) { + ret = sigaction(SIGUSR1, NULL, &act); + if (ret < 0) + exit(EXIT_FAILURE); + + if (act.sa_handler != SIG_DFL) + exit(EXIT_FAILURE); + + ret = sigaction(SIGUSR2, NULL, &act); + if (ret < 0) + exit(EXIT_FAILURE); + + if (act.sa_handler != SIG_DFL) + exit(EXIT_FAILURE); + + exit(EXIT_SUCCESS); + } + + ret = wait_for_pid(pid); + if (ret) + ksft_exit_fail_msg( + "Failed to clear signal handler for child process\n"); + + ksft_test_result_pass("Cleared signal handlers for child process\n"); +} + +int main(int argc, char **argv) +{ + ksft_print_header(); + ksft_set_plan(1); + + test_clone3_supported(); + test_clone3_clear_sighand(); + + return ksft_exit_pass(); +} -- cgit v1.2.3 From 17a810699c189cb8f2f0ba21c7f83396599bea26 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Tue, 12 Nov 2019 10:58:51 +0100 Subject: selftests: add tests for clone3() This adds tests for clone3() with different values and sizes of struct clone_args. This selftest was initially part of the clone3() with PID selftest. After that patch was almost merged Eugene sent out a couple of patches to fix problems with these tests. This commit now only contains the clone3() selftest after the LPC decision to rework clone3() with PID to allow setting the PID in multiple PID namespaces including all of Eugene's patches. 
Signed-off-by: Eugene Syromiatnikov Signed-off-by: Adrian Reber Reviewed-by: Christian Brauner Link: https://lore.kernel.org/r/20191112095851.811884-1-areber@redhat.com Signed-off-by: Christian Brauner --- tools/testing/selftests/clone3/.gitignore | 1 + tools/testing/selftests/clone3/Makefile | 4 +- tools/testing/selftests/clone3/clone3.c | 205 ++++++++++++++++++++++++++++++ 3 files changed, 208 insertions(+), 2 deletions(-) create mode 100644 tools/testing/selftests/clone3/clone3.c diff --git a/tools/testing/selftests/clone3/.gitignore b/tools/testing/selftests/clone3/.gitignore index 6c9f98097774..2a30ae18b06e 100644 --- a/tools/testing/selftests/clone3/.gitignore +++ b/tools/testing/selftests/clone3/.gitignore @@ -1 +1,2 @@ +clone3 clone3_clear_sighand diff --git a/tools/testing/selftests/clone3/Makefile b/tools/testing/selftests/clone3/Makefile index e6f259321e16..eb26eb793c80 100644 --- a/tools/testing/selftests/clone3/Makefile +++ b/tools/testing/selftests/clone3/Makefile @@ -1,6 +1,6 @@ -# SPDX-License-Identifier: GPL-2.0-only +# SPDX-License-Identifier: GPL-2.0 CFLAGS += -g -I../../../../usr/include/ -TEST_GEN_PROGS := clone3_clear_sighand +TEST_GEN_PROGS := clone3 clone3_clear_sighand include ../lib.mk diff --git a/tools/testing/selftests/clone3/clone3.c b/tools/testing/selftests/clone3/clone3.c new file mode 100644 index 000000000000..0f8a9ef40117 --- /dev/null +++ b/tools/testing/selftests/clone3/clone3.c @@ -0,0 +1,205 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* Based on Christian Brauner's clone3() example */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../kselftest.h" + +/* + * Different sizes of struct clone_args + */ +#ifndef CLONE3_ARGS_SIZE_V0 +#define CLONE3_ARGS_SIZE_V0 64 +#endif + +enum test_mode { + CLONE3_ARGS_NO_TEST, + CLONE3_ARGS_ALL_0, + CLONE3_ARGS_INVAL_EXIT_SIGNAL_BIG, + CLONE3_ARGS_INVAL_EXIT_SIGNAL_NEG, + 
CLONE3_ARGS_INVAL_EXIT_SIGNAL_CSIG, + CLONE3_ARGS_INVAL_EXIT_SIGNAL_NSIG, +}; + +static pid_t raw_clone(struct clone_args *args, size_t size) +{ + return syscall(__NR_clone3, args, size); +} + +static int call_clone3(uint64_t flags, size_t size, enum test_mode test_mode) +{ + struct clone_args args = { + .flags = flags, + .exit_signal = SIGCHLD, + }; + + struct clone_args_extended { + struct clone_args args; + __aligned_u64 excess_space[2]; + } args_ext; + + pid_t pid = -1; + int status; + + memset(&args_ext, 0, sizeof(args_ext)); + if (size > sizeof(struct clone_args)) + args_ext.excess_space[1] = 1; + + if (size == 0) + size = sizeof(struct clone_args); + + switch (test_mode) { + case CLONE3_ARGS_ALL_0: + args.flags = 0; + args.exit_signal = 0; + break; + case CLONE3_ARGS_INVAL_EXIT_SIGNAL_BIG: + args.exit_signal = 0xbadc0ded00000000ULL; + break; + case CLONE3_ARGS_INVAL_EXIT_SIGNAL_NEG: + args.exit_signal = 0x0000000080000000ULL; + break; + case CLONE3_ARGS_INVAL_EXIT_SIGNAL_CSIG: + args.exit_signal = 0x0000000000000100ULL; + break; + case CLONE3_ARGS_INVAL_EXIT_SIGNAL_NSIG: + args.exit_signal = 0x00000000000000f0ULL; + break; + } + + memcpy(&args_ext.args, &args, sizeof(struct clone_args)); + + pid = raw_clone((struct clone_args *)&args_ext, size); + if (pid < 0) { + ksft_print_msg("%s - Failed to create new process\n", + strerror(errno)); + return -errno; + } + + if (pid == 0) { + ksft_print_msg("I am the child, my PID is %d\n", getpid()); + _exit(EXIT_SUCCESS); + } + + ksft_print_msg("I am the parent (%d). 
My child's pid is %d\n", + getpid(), pid); + + if (waitpid(-1, &status, __WALL) < 0) { + ksft_print_msg("Child returned %s\n", strerror(errno)); + return -errno; + } + if (WEXITSTATUS(status)) + return WEXITSTATUS(status); + + return 0; +} + +static void test_clone3(uint64_t flags, size_t size, int expected, + enum test_mode test_mode) +{ + int ret; + + ksft_print_msg( + "[%d] Trying clone3() with flags %#" PRIx64 " (size %zu)\n", + getpid(), flags, size); + ret = call_clone3(flags, size, test_mode); + ksft_print_msg("[%d] clone3() with flags says: %d expected %d\n", + getpid(), ret, expected); + if (ret != expected) + ksft_test_result_fail( + "[%d] Result (%d) is different than expected (%d)\n", + getpid(), ret, expected); + else + ksft_test_result_pass( + "[%d] Result (%d) matches expectation (%d)\n", + getpid(), ret, expected); +} + +int main(int argc, char *argv[]) +{ + pid_t pid; + + uid_t uid = getuid(); + + ksft_print_header(); + ksft_set_plan(17); + + /* Just a simple clone3() should return 0.*/ + test_clone3(0, 0, 0, CLONE3_ARGS_NO_TEST); + + /* Do a clone3() in a new PID NS.*/ + if (uid == 0) + test_clone3(CLONE_NEWPID, 0, 0, CLONE3_ARGS_NO_TEST); + else + ksft_test_result_skip("Skipping clone3() with CLONE_NEWPID\n"); + + /* Do a clone3() with CLONE3_ARGS_SIZE_V0. 
*/ + test_clone3(0, CLONE3_ARGS_SIZE_V0, 0, CLONE3_ARGS_NO_TEST); + + /* Do a clone3() with CLONE3_ARGS_SIZE_V0 - 8 */ + test_clone3(0, CLONE3_ARGS_SIZE_V0 - 8, -EINVAL, CLONE3_ARGS_NO_TEST); + + /* Do a clone3() with sizeof(struct clone_args) + 8 */ + test_clone3(0, sizeof(struct clone_args) + 8, 0, CLONE3_ARGS_NO_TEST); + + /* Do a clone3() with exit_signal having highest 32 bits non-zero */ + test_clone3(0, 0, -EINVAL, CLONE3_ARGS_INVAL_EXIT_SIGNAL_BIG); + + /* Do a clone3() with negative 32-bit exit_signal */ + test_clone3(0, 0, -EINVAL, CLONE3_ARGS_INVAL_EXIT_SIGNAL_NEG); + + /* Do a clone3() with exit_signal not fitting into CSIGNAL mask */ + test_clone3(0, 0, -EINVAL, CLONE3_ARGS_INVAL_EXIT_SIGNAL_CSIG); + + /* Do a clone3() with NSIG < exit_signal < CSIG */ + test_clone3(0, 0, -EINVAL, CLONE3_ARGS_INVAL_EXIT_SIGNAL_NSIG); + + test_clone3(0, sizeof(struct clone_args) + 8, 0, CLONE3_ARGS_ALL_0); + + test_clone3(0, sizeof(struct clone_args) + 16, -E2BIG, + CLONE3_ARGS_ALL_0); + + test_clone3(0, sizeof(struct clone_args) * 2, -E2BIG, + CLONE3_ARGS_ALL_0); + + /* Do a clone3() with > page size */ + test_clone3(0, getpagesize() + 8, -E2BIG, CLONE3_ARGS_NO_TEST); + + /* Do a clone3() with CLONE3_ARGS_SIZE_V0 in a new PID NS. */ + if (uid == 0) + test_clone3(CLONE_NEWPID, CLONE3_ARGS_SIZE_V0, 0, + CLONE3_ARGS_NO_TEST); + else + ksft_test_result_skip("Skipping clone3() with CLONE_NEWPID\n"); + + /* Do a clone3() with CLONE3_ARGS_SIZE_V0 - 8 in a new PID NS */ + test_clone3(CLONE_NEWPID, CLONE3_ARGS_SIZE_V0 - 8, -EINVAL, + CLONE3_ARGS_NO_TEST); + + /* Do a clone3() with sizeof(struct clone_args) + 8 in a new PID NS */ + if (uid == 0) + test_clone3(CLONE_NEWPID, sizeof(struct clone_args) + 8, 0, + CLONE3_ARGS_NO_TEST); + else + ksft_test_result_skip("Skipping clone3() with CLONE_NEWPID\n"); + + /* Do a clone3() with > page size in a new PID NS */ + test_clone3(CLONE_NEWPID, getpagesize() + 8, -E2BIG, + CLONE3_ARGS_NO_TEST); + + return !ksft_get_fail_cnt() ? 
ksft_exit_pass() : ksft_exit_fail(); +} -- cgit v1.2.3 From 49cb2fc42ce4b7a656ee605e30c302efaa39c1a7 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Fri, 15 Nov 2019 13:36:20 +0100 Subject: fork: extend clone3() to support setting a PID The main motivation to add set_tid to clone3() is CRIU. To restore a process with the same PID/TID CRIU currently uses /proc/sys/kernel/ns_last_pid. It writes the desired (PID - 1) to ns_last_pid and then (quickly) does a clone(). This works most of the time, but it is racy. It is also slow as it requires multiple syscalls. Extending clone3() to support *set_tid makes it possible to restore a process using CRIU without accessing /proc/sys/kernel/ns_last_pid and race free (as long as the desired PID/TID is available). This clone3() extension places the same restrictions (CAP_SYS_ADMIN) on clone3() with *set_tid as they are currently in place for ns_last_pid. The original version of this change was using a single value for set_tid. At the 2019 LPC, after presenting set_tid, it was, however, decided to change set_tid to an array to enable setting the PID of a process in multiple PID namespaces at the same time. If a process is created in a PID namespace it is possible to influence the PID inside and outside of the PID namespace. Details also in the corresponding selftest. To create a process with the following PIDs: PID NS level Requested PID 0 (host) 31496 1 42 2 1 For that example the two newly introduced parameters to struct clone_args (set_tid and set_tid_size) would need to be: set_tid[0] = 1; set_tid[1] = 42; set_tid[2] = 31496; set_tid_size = 3; If only the PIDs of the two innermost nested PID namespaces should be defined it would look like this: set_tid[0] = 1; set_tid[1] = 42; set_tid_size = 2; The PID of the newly created process would then be the next available free PID in the PID namespace level 0 (host) and 42 in the PID namespace at level 1 and the PID of the process in the innermost PID namespace would be 1.
The set_tid array is used to specify the PID of a process starting from the innermost nested PID namespaces up to set_tid_size PID namespaces. set_tid_size cannot be larger than the current PID namespace level. Signed-off-by: Adrian Reber Reviewed-by: Christian Brauner Reviewed-by: Oleg Nesterov Reviewed-by: Dmitry Safonov <0x7f454c46@gmail.com> Acked-by: Andrei Vagin Link: https://lore.kernel.org/r/20191115123621.142252-1-areber@redhat.com Signed-off-by: Christian Brauner --- include/linux/pid.h | 3 +- include/linux/pid_namespace.h | 2 ++ include/linux/sched/task.h | 3 ++ include/uapi/linux/sched.h | 53 ++++++++++++++++++++----------- kernel/fork.c | 24 ++++++++++++++- kernel/pid.c | 72 ++++++++++++++++++++++++++++++++++--------- kernel/pid_namespace.c | 2 -- 7 files changed, 122 insertions(+), 37 deletions(-) diff --git a/include/linux/pid.h b/include/linux/pid.h index 034e3cd60dc0..998ae7d24450 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -124,7 +124,8 @@ extern struct pid *find_vpid(int nr); extern struct pid *find_get_pid(int nr); extern struct pid *find_ge_pid(int nr, struct pid_namespace *); -extern struct pid *alloc_pid(struct pid_namespace *ns); +extern struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, + size_t set_tid_size); extern void free_pid(struct pid *pid); extern void disable_pid_allocation(struct pid_namespace *ns); diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index 49538b172483..2ed6af88794b 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -12,6 +12,8 @@ #include #include +/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */ +#define MAX_PID_NS_LEVEL 32 struct fs_pin; diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 4b1c3b664f51..f1879884238e 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -26,6 +26,9 @@ struct kernel_clone_args { unsigned long stack; unsigned long stack_size; unsigned
long tls; + pid_t *set_tid; + /* Number of elements in *set_tid */ + size_t set_tid_size; }; /* diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index 1d500ed03c63..a0b1c224c72b 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -39,24 +39,38 @@ #ifndef __ASSEMBLY__ /** * struct clone_args - arguments for the clone3 syscall - * @flags: Flags for the new process as listed above. - * All flags are valid except for CSIGNAL and - * CLONE_DETACHED. - * @pidfd: If CLONE_PIDFD is set, a pidfd will be - * returned in this argument. - * @child_tid: If CLONE_CHILD_SETTID is set, the TID of the - * child process will be returned in the child's - * memory. - * @parent_tid: If CLONE_PARENT_SETTID is set, the TID of - * the child process will be returned in the - * parent's memory. - * @exit_signal: The exit_signal the parent process will be - * sent when the child exits. - * @stack: Specify the location of the stack for the - * child process. - * @stack_size: The size of the stack for the child process. - * @tls: If CLONE_SETTLS is set, the tls descriptor - * is set to tls. + * @flags: Flags for the new process as listed above. + * All flags are valid except for CSIGNAL and + * CLONE_DETACHED. + * @pidfd: If CLONE_PIDFD is set, a pidfd will be + * returned in this argument. + * @child_tid: If CLONE_CHILD_SETTID is set, the TID of the + * child process will be returned in the child's + * memory. + * @parent_tid: If CLONE_PARENT_SETTID is set, the TID of + * the child process will be returned in the + * parent's memory. + * @exit_signal: The exit_signal the parent process will be + * sent when the child exits. + * @stack: Specify the location of the stack for the + * child process. + * @stack_size: The size of the stack for the child process. + * @tls: If CLONE_SETTLS is set, the tls descriptor + * is set to tls. + * @set_tid: Pointer to an array of type *pid_t. The size + * of the array is defined using @set_tid_size. 
+ * This array is used to select PIDs/TIDs for + * newly created processes. The first element in + * this defines the PID in the most nested PID + * namespace. Each additional element in the array + * defines the PID in the parent PID namespace of + * the original PID namespace. If the array has + * less entries than the number of currently + * nested PID namespaces only the PIDs in the + * corresponding namespaces are set. + * @set_tid_size: This defines the size of the array referenced + * in @set_tid. This cannot be larger than the + * kernel's limit of nested PID namespaces. * * The structure is versioned by size and thus extensible. * New struct members must go at the end of the struct and @@ -71,10 +85,13 @@ struct clone_args { __aligned_u64 stack; __aligned_u64 stack_size; __aligned_u64 tls; + __aligned_u64 set_tid; + __aligned_u64 set_tid_size; }; #endif #define CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */ +#define CLONE_ARGS_SIZE_VER1 80 /* sizeof second published struct */ /* * Scheduling policies diff --git a/kernel/fork.c b/kernel/fork.c index 954e875e72b1..417570263f1f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2087,7 +2087,8 @@ static __latent_entropy struct task_struct *copy_process( stackleak_task_init(p); if (pid != &init_struct_pid) { - pid = alloc_pid(p->nsproxy->pid_ns_for_children); + pid = alloc_pid(p->nsproxy->pid_ns_for_children, args->set_tid, + args->set_tid_size); if (IS_ERR(pid)) { retval = PTR_ERR(pid); goto bad_fork_cleanup_thread; @@ -2590,6 +2591,7 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, { int err; struct clone_args args; + pid_t *kset_tid = kargs->set_tid; if (unlikely(usize > PAGE_SIZE)) return -E2BIG; @@ -2600,6 +2602,15 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, if (err) return err; + if (unlikely(args.set_tid_size > MAX_PID_NS_LEVEL)) + return -EINVAL; + + if (unlikely(!args.set_tid && args.set_tid_size > 0)) + return -EINVAL; 
+ + if (unlikely(args.set_tid && args.set_tid_size == 0)) + return -EINVAL; + /* * Verify that higher 32bits of exit_signal are unset and that * it is a valid signal @@ -2617,8 +2628,16 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, .stack = args.stack, .stack_size = args.stack_size, .tls = args.tls, + .set_tid_size = args.set_tid_size, }; + if (args.set_tid && + copy_from_user(kset_tid, u64_to_user_ptr(args.set_tid), + (kargs->set_tid_size * sizeof(pid_t)))) + return -EFAULT; + + kargs->set_tid = kset_tid; + return 0; } @@ -2662,6 +2681,9 @@ SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size) int err; struct kernel_clone_args kargs; + pid_t set_tid[MAX_PID_NS_LEVEL]; + + kargs.set_tid = set_tid; err = copy_clone_args_from_user(&kargs, uargs, size); if (err) diff --git a/kernel/pid.c b/kernel/pid.c index 7b5f6c963d72..2278e249141d 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -157,7 +157,8 @@ void free_pid(struct pid *pid) call_rcu(&pid->rcu, delayed_put_pid); } -struct pid *alloc_pid(struct pid_namespace *ns) +struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, + size_t set_tid_size) { struct pid *pid; enum pid_type type; @@ -166,6 +167,17 @@ struct pid *alloc_pid(struct pid_namespace *ns) struct upid *upid; int retval = -ENOMEM; + /* + * set_tid_size contains the size of the set_tid array. Starting at + * the most nested currently active PID namespace it tells alloc_pid() + * which PID to set for a process in that most nested PID namespace + * up to set_tid_size PID namespaces. It does not have to set the PID + * for a process in all nested PID namespaces but set_tid_size must + * never be greater than the current ns->level + 1. 
+ */ + if (set_tid_size > ns->level + 1) + return ERR_PTR(-EINVAL); + pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL); if (!pid) return ERR_PTR(retval); @@ -174,24 +186,54 @@ struct pid *alloc_pid(struct pid_namespace *ns) pid->level = ns->level; for (i = ns->level; i >= 0; i--) { - int pid_min = 1; + int tid = 0; + + if (set_tid_size) { + tid = set_tid[ns->level - i]; + + retval = -EINVAL; + if (tid < 1 || tid >= pid_max) + goto out_free; + /* + * Also fail if a PID != 1 is requested and + * no PID 1 exists. + */ + if (tid != 1 && !tmp->child_reaper) + goto out_free; + retval = -EPERM; + if (!ns_capable(tmp->user_ns, CAP_SYS_ADMIN)) + goto out_free; + set_tid_size--; + } idr_preload(GFP_KERNEL); spin_lock_irq(&pidmap_lock); - /* - * init really needs pid 1, but after reaching the maximum - * wrap back to RESERVED_PIDS - */ - if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS) - pid_min = RESERVED_PIDS; - - /* - * Store a null pointer so find_pid_ns does not find - * a partially initialized PID (see below). - */ - nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min, - pid_max, GFP_ATOMIC); + if (tid) { + nr = idr_alloc(&tmp->idr, NULL, tid, + tid + 1, GFP_ATOMIC); + /* + * If ENOSPC is returned it means that the PID is + * alreay in use. Return EEXIST in that case. + */ + if (nr == -ENOSPC) + nr = -EEXIST; + } else { + int pid_min = 1; + /* + * init really needs pid 1, but after reaching the + * maximum wrap back to RESERVED_PIDS + */ + if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS) + pid_min = RESERVED_PIDS; + + /* + * Store a null pointer so find_pid_ns does not find + * a partially initialized PID (see below). 
+ */ + nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min, + pid_max, GFP_ATOMIC); + } spin_unlock_irq(&pidmap_lock); idr_preload_end(); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index a6a79f85c81a..d40017e79ebe 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -26,8 +26,6 @@ static DEFINE_MUTEX(pid_caches_mutex); static struct kmem_cache *pid_ns_cachep; -/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */ -#define MAX_PID_NS_LEVEL 32 /* Write once array, filled from the beginning. */ static struct kmem_cache *pid_cache[MAX_PID_NS_LEVEL]; -- cgit v1.2.3 From 41585bbeeef9402d5d65687747e04246ef4a3a41 Mon Sep 17 00:00:00 2001 From: Adrian Reber Date: Fri, 15 Nov 2019 13:36:21 +0100 Subject: selftests: add tests for clone3() with *set_tid This tests clone3() with *set_tid to see if all desired PIDs are working as expected. The tests are trying multiple invalid input parameters as well as creating processes while specifying a certain PID in multiple PID namespaces at the same time. Additionally this moves common clone3() test code into clone3_selftests.h. 
Signed-off-by: Adrian Reber Acked-by: Christian Brauner Link: https://lore.kernel.org/r/20191115123621.142252-2-areber@redhat.com Signed-off-by: Christian Brauner --- tools/testing/selftests/clone3/.gitignore | 1 + tools/testing/selftests/clone3/Makefile | 2 +- tools/testing/selftests/clone3/clone3.c | 8 +- .../selftests/clone3/clone3_clear_sighand.c | 20 +- tools/testing/selftests/clone3/clone3_selftests.h | 35 ++ tools/testing/selftests/clone3/clone3_set_tid.c | 381 +++++++++++++++++++++ 6 files changed, 421 insertions(+), 26 deletions(-) create mode 100644 tools/testing/selftests/clone3/clone3_selftests.h create mode 100644 tools/testing/selftests/clone3/clone3_set_tid.c diff --git a/tools/testing/selftests/clone3/.gitignore b/tools/testing/selftests/clone3/.gitignore index 2a30ae18b06e..0dc4f32c6cb8 100644 --- a/tools/testing/selftests/clone3/.gitignore +++ b/tools/testing/selftests/clone3/.gitignore @@ -1,2 +1,3 @@ clone3 clone3_clear_sighand +clone3_set_tid diff --git a/tools/testing/selftests/clone3/Makefile b/tools/testing/selftests/clone3/Makefile index eb26eb793c80..cf976c732906 100644 --- a/tools/testing/selftests/clone3/Makefile +++ b/tools/testing/selftests/clone3/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 CFLAGS += -g -I../../../../usr/include/ -TEST_GEN_PROGS := clone3 clone3_clear_sighand +TEST_GEN_PROGS := clone3 clone3_clear_sighand clone3_set_tid include ../lib.mk diff --git a/tools/testing/selftests/clone3/clone3.c b/tools/testing/selftests/clone3/clone3.c index 0f8a9ef40117..4669b3d418e7 100644 --- a/tools/testing/selftests/clone3/clone3.c +++ b/tools/testing/selftests/clone3/clone3.c @@ -18,6 +18,7 @@ #include #include "../kselftest.h" +#include "clone3_selftests.h" /* * Different sizes of struct clone_args @@ -35,11 +36,6 @@ enum test_mode { CLONE3_ARGS_INVAL_EXIT_SIGNAL_NSIG, }; -static pid_t raw_clone(struct clone_args *args, size_t size) -{ - return syscall(__NR_clone3, args, size); -} - static int call_clone3(uint64_t 
flags, size_t size, enum test_mode test_mode) { struct clone_args args = { @@ -83,7 +79,7 @@ static int call_clone3(uint64_t flags, size_t size, enum test_mode test_mode) memcpy(&args_ext.args, &args, sizeof(struct clone_args)); - pid = raw_clone((struct clone_args *)&args_ext, size); + pid = sys_clone3((struct clone_args *)&args_ext, size); if (pid < 0) { ksft_print_msg("%s - Failed to create new process\n", strerror(errno)); diff --git a/tools/testing/selftests/clone3/clone3_clear_sighand.c b/tools/testing/selftests/clone3/clone3_clear_sighand.c index 0d957be1bdc5..456783ad19d6 100644 --- a/tools/testing/selftests/clone3/clone3_clear_sighand.c +++ b/tools/testing/selftests/clone3/clone3_clear_sighand.c @@ -14,30 +14,12 @@ #include #include "../kselftest.h" +#include "clone3_selftests.h" #ifndef CLONE_CLEAR_SIGHAND #define CLONE_CLEAR_SIGHAND 0x100000000ULL #endif -#ifndef __NR_clone3 -#define __NR_clone3 -1 -struct clone_args { - __aligned_u64 flags; - __aligned_u64 pidfd; - __aligned_u64 child_tid; - __aligned_u64 parent_tid; - __aligned_u64 exit_signal; - __aligned_u64 stack; - __aligned_u64 stack_size; - __aligned_u64 tls; -}; -#endif - -static pid_t sys_clone3(struct clone_args *args, size_t size) -{ - return syscall(__NR_clone3, args, size); -} - static void test_clone3_supported(void) { pid_t pid; diff --git a/tools/testing/selftests/clone3/clone3_selftests.h b/tools/testing/selftests/clone3/clone3_selftests.h new file mode 100644 index 000000000000..1a270390766a --- /dev/null +++ b/tools/testing/selftests/clone3/clone3_selftests.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _CLONE3_SELFTESTS_H +#define _CLONE3_SELFTESTS_H + +#define _GNU_SOURCE +#include +#include +#include +#include + +#define ptr_to_u64(ptr) ((__u64)((uintptr_t)(ptr))) + +#ifndef __NR_clone3 +#define __NR_clone3 -1 +struct clone_args { + __aligned_u64 flags; + __aligned_u64 pidfd; + __aligned_u64 child_tid; + __aligned_u64 parent_tid; + __aligned_u64 exit_signal; 
+ __aligned_u64 stack; + __aligned_u64 stack_size; + __aligned_u64 tls; + __aligned_u64 set_tid; + __aligned_u64 set_tid_size; +}; +#endif + +static pid_t sys_clone3(struct clone_args *args, size_t size) +{ + return syscall(__NR_clone3, args, size); +} + +#endif /* _CLONE3_SELFTESTS_H */ diff --git a/tools/testing/selftests/clone3/clone3_set_tid.c b/tools/testing/selftests/clone3/clone3_set_tid.c new file mode 100644 index 000000000000..3480e1c46983 --- /dev/null +++ b/tools/testing/selftests/clone3/clone3_set_tid.c @@ -0,0 +1,381 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Based on Christian Brauner's clone3() example. + * These tests are assuming to be running in the host's + * PID namespace. + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../kselftest.h" +#include "clone3_selftests.h" + +#ifndef MAX_PID_NS_LEVEL +#define MAX_PID_NS_LEVEL 32 +#endif + +static int pipe_1[2]; +static int pipe_2[2]; + +static int call_clone3_set_tid(pid_t *set_tid, + size_t set_tid_size, + int flags, + int expected_pid, + bool wait_for_it) +{ + int status; + pid_t pid = -1; + + struct clone_args args = { + .flags = flags, + .exit_signal = SIGCHLD, + .set_tid = ptr_to_u64(set_tid), + .set_tid_size = set_tid_size, + }; + + pid = sys_clone3(&args, sizeof(struct clone_args)); + if (pid < 0) { + ksft_print_msg("%s - Failed to create new process\n", + strerror(errno)); + return -errno; + } + + if (pid == 0) { + int ret; + char tmp = 0; + int exit_code = EXIT_SUCCESS; + + ksft_print_msg("I am the child, my PID is %d (expected %d)\n", + getpid(), set_tid[0]); + if (wait_for_it) { + ksft_print_msg("[%d] Child is ready and waiting\n", + getpid()); + + /* Signal the parent that the child is ready */ + close(pipe_1[0]); + ret = write(pipe_1[1], &tmp, 1); + if (ret != 1) { + ksft_print_msg( + "Writing to pipe returned %d", ret); + exit_code = EXIT_FAILURE; + } + 
close(pipe_1[1]); + close(pipe_2[1]); + ret = read(pipe_2[0], &tmp, 1); + if (ret != 1) { + ksft_print_msg( + "Reading from pipe returned %d", ret); + exit_code = EXIT_FAILURE; + } + close(pipe_2[0]); + } + + if (set_tid[0] != getpid()) + _exit(EXIT_FAILURE); + _exit(exit_code); + } + + if (expected_pid == 0 || expected_pid == pid) { + ksft_print_msg("I am the parent (%d). My child's pid is %d\n", + getpid(), pid); + } else { + ksft_print_msg( + "Expected child pid %d does not match actual pid %d\n", + expected_pid, pid); + return -1; + } + + if (waitpid(pid, &status, 0) < 0) { + ksft_print_msg("Child returned %s\n", strerror(errno)); + return -errno; + } + + if (!WIFEXITED(status)) + return -1; + + return WEXITSTATUS(status); +} + +static void test_clone3_set_tid(pid_t *set_tid, + size_t set_tid_size, + int flags, + int expected, + int expected_pid, + bool wait_for_it) +{ + int ret; + + ksft_print_msg( + "[%d] Trying clone3() with CLONE_SET_TID to %d and 0x%x\n", + getpid(), set_tid[0], flags); + ret = call_clone3_set_tid(set_tid, set_tid_size, flags, expected_pid, + wait_for_it); + ksft_print_msg( + "[%d] clone3() with CLONE_SET_TID %d says :%d - expected %d\n", + getpid(), set_tid[0], ret, expected); + if (ret != expected) + ksft_test_result_fail( + "[%d] Result (%d) is different than expected (%d)\n", + getpid(), ret, expected); + else + ksft_test_result_pass( + "[%d] Result (%d) matches expectation (%d)\n", + getpid(), ret, expected); +} +int main(int argc, char *argv[]) +{ + FILE *f; + char buf; + char *line; + int status; + int ret = -1; + size_t len = 0; + int pid_max = 0; + uid_t uid = getuid(); + char proc_path[100] = {0}; + pid_t pid, ns1, ns2, ns3, ns_pid; + pid_t set_tid[MAX_PID_NS_LEVEL * 2]; + + if (pipe(pipe_1) < 0 || pipe(pipe_2) < 0) + ksft_exit_fail_msg("pipe() failed\n"); + + ksft_print_header(); + ksft_set_plan(27); + + f = fopen("/proc/sys/kernel/pid_max", "r"); + if (f == NULL) + ksft_exit_fail_msg( + "%s - Could not open 
/proc/sys/kernel/pid_max\n", + strerror(errno)); + fscanf(f, "%d", &pid_max); + fclose(f); + ksft_print_msg("/proc/sys/kernel/pid_max %d\n", pid_max); + + /* Try invalid settings */ + memset(&set_tid, 0, sizeof(set_tid)); + test_clone3_set_tid(set_tid, MAX_PID_NS_LEVEL + 1, 0, -EINVAL, 0, 0); + + test_clone3_set_tid(set_tid, MAX_PID_NS_LEVEL * 2, 0, -EINVAL, 0, 0); + + test_clone3_set_tid(set_tid, MAX_PID_NS_LEVEL * 2 + 1, 0, + -EINVAL, 0, 0); + + test_clone3_set_tid(set_tid, MAX_PID_NS_LEVEL * 42, 0, -EINVAL, 0, 0); + + /* + * This can actually work if this test running in a MAX_PID_NS_LEVEL - 1 + * nested PID namespace. + */ + test_clone3_set_tid(set_tid, MAX_PID_NS_LEVEL - 1, 0, -EINVAL, 0, 0); + + memset(&set_tid, 0xff, sizeof(set_tid)); + test_clone3_set_tid(set_tid, MAX_PID_NS_LEVEL + 1, 0, -EINVAL, 0, 0); + + test_clone3_set_tid(set_tid, MAX_PID_NS_LEVEL * 2, 0, -EINVAL, 0, 0); + + test_clone3_set_tid(set_tid, MAX_PID_NS_LEVEL * 2 + 1, 0, + -EINVAL, 0, 0); + + test_clone3_set_tid(set_tid, MAX_PID_NS_LEVEL * 42, 0, -EINVAL, 0, 0); + + /* + * This can actually work if this test running in a MAX_PID_NS_LEVEL - 1 + * nested PID namespace. + */ + test_clone3_set_tid(set_tid, MAX_PID_NS_LEVEL - 1, 0, -EINVAL, 0, 0); + + memset(&set_tid, 0, sizeof(set_tid)); + /* Try with an invalid PID */ + set_tid[0] = 0; + test_clone3_set_tid(set_tid, 1, 0, -EINVAL, 0, 0); + + set_tid[0] = -1; + test_clone3_set_tid(set_tid, 1, 0, -EINVAL, 0, 0); + + /* Claim that the set_tid array actually contains 2 elements. */ + test_clone3_set_tid(set_tid, 2, 0, -EINVAL, 0, 0); + + /* Try it in a new PID namespace */ + if (uid == 0) + test_clone3_set_tid(set_tid, 1, CLONE_NEWPID, -EINVAL, 0, 0); + else + ksft_test_result_skip("Clone3() with set_tid requires root\n"); + + /* Try with a valid PID (1) this should return -EEXIST. 
*/ + set_tid[0] = 1; + if (uid == 0) + test_clone3_set_tid(set_tid, 1, 0, -EEXIST, 0, 0); + else + ksft_test_result_skip("Clone3() with set_tid requires root\n"); + + /* Try it in a new PID namespace */ + if (uid == 0) + test_clone3_set_tid(set_tid, 1, CLONE_NEWPID, 0, 0, 0); + else + ksft_test_result_skip("Clone3() with set_tid requires root\n"); + + /* pid_max should fail everywhere */ + set_tid[0] = pid_max; + test_clone3_set_tid(set_tid, 1, 0, -EINVAL, 0, 0); + + if (uid == 0) + test_clone3_set_tid(set_tid, 1, CLONE_NEWPID, -EINVAL, 0, 0); + else + ksft_test_result_skip("Clone3() with set_tid requires root\n"); + + if (uid != 0) { + /* + * All remaining tests require root. Tell the framework + * that all those tests are skipped as non-root. + */ + ksft_cnt.ksft_xskip += ksft_plan - ksft_test_num(); + goto out; + } + + /* Find the current active PID */ + pid = fork(); + if (pid == 0) { + ksft_print_msg("Child has PID %d\n", getpid()); + _exit(EXIT_SUCCESS); + } + if (waitpid(pid, &status, 0) < 0) + ksft_exit_fail_msg("Waiting for child %d failed", pid); + + /* After the child has finished, its PID should be free. */ + set_tid[0] = pid; + test_clone3_set_tid(set_tid, 1, 0, 0, 0, 0); + + /* This should fail as there is no PID 1 in that namespace */ + test_clone3_set_tid(set_tid, 1, CLONE_NEWPID, -EINVAL, 0, 0); + + /* + * Creating a process with PID 1 in the newly created most nested + * PID namespace and PID 'pid' in the parent PID namespace. This + * needs to work. 
+ */ + set_tid[0] = 1; + set_tid[1] = pid; + test_clone3_set_tid(set_tid, 2, CLONE_NEWPID, 0, pid, 0); + + ksft_print_msg("unshare PID namespace\n"); + if (unshare(CLONE_NEWPID) == -1) + ksft_exit_fail_msg("unshare(CLONE_NEWPID) failed: %s\n", + strerror(errno)); + + set_tid[0] = pid; + + /* This should fail as there is no PID 1 in that namespace */ + test_clone3_set_tid(set_tid, 1, 0, -EINVAL, 0, 0); + + /* Let's create a PID 1 */ + ns_pid = fork(); + if (ns_pid == 0) { + ksft_print_msg("Child in PID namespace has PID %d\n", getpid()); + set_tid[0] = 2; + test_clone3_set_tid(set_tid, 1, 0, 0, 2, 0); + + set_tid[0] = 1; + set_tid[1] = -1; + set_tid[2] = pid; + /* This should fail as there is invalid PID at level '1'. */ + test_clone3_set_tid(set_tid, 3, CLONE_NEWPID, -EINVAL, 0, 0); + + set_tid[0] = 1; + set_tid[1] = 42; + set_tid[2] = pid; + /* + * This should fail as there are not enough active PID + * namespaces. Again assuming this is running in the host's + * PID namespace. Not yet nested. + */ + test_clone3_set_tid(set_tid, 4, CLONE_NEWPID, -EINVAL, 0, 0); + + /* + * This should work and from the parent we should see + * something like 'NSpid: pid 42 1'. + */ + test_clone3_set_tid(set_tid, 3, CLONE_NEWPID, 0, 42, true); + + _exit(ksft_cnt.ksft_pass); + } + + close(pipe_1[1]); + close(pipe_2[0]); + while (read(pipe_1[0], &buf, 1) > 0) { + ksft_print_msg("[%d] Child is ready and waiting\n", getpid()); + break; + } + + snprintf(proc_path, sizeof(proc_path), "/proc/%d/status", pid); + f = fopen(proc_path, "r"); + if (f == NULL) + ksft_exit_fail_msg( + "%s - Could not open %s\n", + strerror(errno), proc_path); + + while (getline(&line, &len, f) != -1) { + if (strstr(line, "NSpid")) { + int i; + + /* Verify that all generated PIDs are as expected. 
*/ + i = sscanf(line, "NSpid:\t%d\t%d\t%d", + &ns3, &ns2, &ns1); + if (i != 3) { + ksft_print_msg( + "Unexpected 'NSPid:' entry: %s", + line); + ns1 = ns2 = ns3 = 0; + } + break; + } + } + fclose(f); + free(line); + close(pipe_2[0]); + + /* Tell the clone3()'d child to finish. */ + write(pipe_2[1], &buf, 1); + close(pipe_2[1]); + + if (waitpid(ns_pid, &status, 0) < 0) { + ksft_print_msg("Child returned %s\n", strerror(errno)); + ret = -errno; + goto out; + } + + if (!WIFEXITED(status)) + ksft_test_result_fail("Child error\n"); + + if (WEXITSTATUS(status)) + /* + * Update the number of total tests with the tests from the + * child processes. + */ + ksft_cnt.ksft_pass = WEXITSTATUS(status); + + if (ns3 == pid && ns2 == 42 && ns1 == 1) + ksft_test_result_pass( + "PIDs in all namespaces as expected (%d,%d,%d)\n", + ns3, ns2, ns1); + else + ksft_test_result_fail( + "PIDs in all namespaces not as expected (%d,%d,%d)\n", + ns3, ns2, ns1); +out: + ret = 0; + + return !ret ? ksft_exit_pass() : ksft_exit_fail(); +} -- cgit v1.2.3 From 4f5c289ea66a33457d51b305d4d77e1ca4c8ff17 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Sun, 17 Nov 2019 22:47:48 -0800 Subject: selftests/clone3: flush stdout and stderr before clone3() and _exit() Buffers have to be flushed before clone3() to avoid double messages in the log. 
Fixes: 41585bbeeef9 ("selftests: add tests for clone3() with *set_tid") Signed-off-by: Andrei Vagin Link: https://lore.kernel.org/r/20191118064750.408003-1-avagin@gmail.com Signed-off-by: Christian Brauner --- tools/testing/selftests/clone3/clone3_selftests.h | 2 ++ tools/testing/selftests/clone3/clone3_set_tid.c | 15 +++++++++++---- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/clone3/clone3_selftests.h b/tools/testing/selftests/clone3/clone3_selftests.h index 1a270390766a..0e3dea58855f 100644 --- a/tools/testing/selftests/clone3/clone3_selftests.h +++ b/tools/testing/selftests/clone3/clone3_selftests.h @@ -29,6 +29,8 @@ struct clone_args { static pid_t sys_clone3(struct clone_args *args, size_t size) { + fflush(stdout); + fflush(stderr); return syscall(__NR_clone3, args, size); } diff --git a/tools/testing/selftests/clone3/clone3_set_tid.c b/tools/testing/selftests/clone3/clone3_set_tid.c index 3480e1c46983..e93369dcfe3b 100644 --- a/tools/testing/selftests/clone3/clone3_set_tid.c +++ b/tools/testing/selftests/clone3/clone3_set_tid.c @@ -30,6 +30,13 @@ static int pipe_1[2]; static int pipe_2[2]; +static void child_exit(int ret) +{ + fflush(stdout); + fflush(stderr); + _exit(ret); +} + static int call_clone3_set_tid(pid_t *set_tid, size_t set_tid_size, int flags, @@ -84,8 +91,8 @@ static int call_clone3_set_tid(pid_t *set_tid, } if (set_tid[0] != getpid()) - _exit(EXIT_FAILURE); - _exit(exit_code); + child_exit(EXIT_FAILURE); + child_exit(exit_code); } if (expected_pid == 0 || expected_pid == pid) { @@ -249,7 +256,7 @@ int main(int argc, char *argv[]) pid = fork(); if (pid == 0) { ksft_print_msg("Child has PID %d\n", getpid()); - _exit(EXIT_SUCCESS); + child_exit(EXIT_SUCCESS); } if (waitpid(pid, &status, 0) < 0) ksft_exit_fail_msg("Waiting for child %d failed", pid); @@ -309,7 +316,7 @@ int main(int argc, char *argv[]) */ test_clone3_set_tid(set_tid, 3, CLONE_NEWPID, 0, 42, true); - _exit(ksft_cnt.ksft_pass); + 
child_exit(ksft_cnt.ksft_pass); } close(pipe_1[1]); -- cgit v1.2.3 From 28df751539e8e3ba71c5b0d13647d1fdc7c1d287 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Sun, 17 Nov 2019 22:47:49 -0800 Subject: selftests/clone3: report a correct number of fails In clone3_set_tid, a few test cases are running in a child process. And right now, if one of these test cases fails, the whole test will exit with the success status. Fixes: 41585bbeeef9 ("selftests: add tests for clone3() with *set_tid") Signed-off-by: Andrei Vagin Link: https://lore.kernel.org/r/20191118064750.408003-2-avagin@gmail.com Signed-off-by: Christian Brauner --- tools/testing/selftests/clone3/clone3_set_tid.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/clone3/clone3_set_tid.c b/tools/testing/selftests/clone3/clone3_set_tid.c index e93369dcfe3b..9c19bae03661 100644 --- a/tools/testing/selftests/clone3/clone3_set_tid.c +++ b/tools/testing/selftests/clone3/clone3_set_tid.c @@ -316,7 +316,7 @@ int main(int argc, char *argv[]) */ test_clone3_set_tid(set_tid, 3, CLONE_NEWPID, 0, 42, true); - child_exit(ksft_cnt.ksft_pass); + child_exit(ksft_cnt.ksft_fail); } close(pipe_1[1]); @@ -366,12 +366,8 @@ int main(int argc, char *argv[]) if (!WIFEXITED(status)) ksft_test_result_fail("Child error\n"); - if (WEXITSTATUS(status)) - /* - * Update the number of total tests with the tests from the - * child processes. - */ - ksft_cnt.ksft_pass = WEXITSTATUS(status); + ksft_cnt.ksft_pass += 4 - (ksft_cnt.ksft_fail - WEXITSTATUS(status)); + ksft_cnt.ksft_fail = WEXITSTATUS(status); if (ns3 == pid && ns2 == 42 && ns1 == 1) ksft_test_result_pass( -- cgit v1.2.3 From a019ff3b8b10d1b7f5cd37edb9f4fbef3e031edf Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Sun, 17 Nov 2019 22:47:50 -0800 Subject: selftests/clone3: check that all pids are released on error paths This is a regression test case for an issue when pids have not been released on error paths. 
Signed-off-by: Andrei Vagin Link: https://lore.kernel.org/r/20191118064750.408003-3-avagin@gmail.com Signed-off-by: Christian Brauner --- tools/testing/selftests/clone3/clone3_set_tid.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/clone3/clone3_set_tid.c b/tools/testing/selftests/clone3/clone3_set_tid.c index 9c19bae03661..c6309f5d7d88 100644 --- a/tools/testing/selftests/clone3/clone3_set_tid.c +++ b/tools/testing/selftests/clone3/clone3_set_tid.c @@ -160,7 +160,7 @@ int main(int argc, char *argv[]) ksft_exit_fail_msg("pipe() failed\n"); ksft_print_header(); - ksft_set_plan(27); + ksft_set_plan(29); f = fopen("/proc/sys/kernel/pid_max", "r"); if (f == NULL) @@ -290,6 +290,18 @@ int main(int argc, char *argv[]) /* Let's create a PID 1 */ ns_pid = fork(); if (ns_pid == 0) { + /* + * This and the next test cases check that all pid-s are + * released on error paths. + */ + set_tid[0] = 43; + set_tid[1] = -1; + test_clone3_set_tid(set_tid, 2, 0, -EINVAL, 0, 0); + + set_tid[0] = 43; + set_tid[1] = pid; + test_clone3_set_tid(set_tid, 2, 0, 0, 43, 0); + ksft_print_msg("Child in PID namespace has PID %d\n", getpid()); set_tid[0] = 2; test_clone3_set_tid(set_tid, 1, 0, 0, 2, 0); @@ -366,7 +378,7 @@ int main(int argc, char *argv[]) if (!WIFEXITED(status)) ksft_test_result_fail("Child error\n"); - ksft_cnt.ksft_pass += 4 - (ksft_cnt.ksft_fail - WEXITSTATUS(status)); + ksft_cnt.ksft_pass += 6 - (ksft_cnt.ksft_fail - WEXITSTATUS(status)); ksft_cnt.ksft_fail = WEXITSTATUS(status); if (ns3 == pid && ns2 == 42 && ns1 == 1) -- cgit v1.2.3 From 11fde161ab37f2938504bf896b48afbd18ea71cd Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 18 Nov 2019 08:49:44 +0100 Subject: selftests/clone3: skip if clone3() is ENOSYS If the clone3() syscall is not implemented we should skip the tests. 
Fixes: 41585bbeeef9 ("selftests: add tests for clone3() with *set_tid") Fixes: 17a810699c18 ("selftests: add tests for clone3()") Signed-off-by: Christian Brauner --- tools/testing/selftests/clone3/clone3.c | 1 + .../selftests/clone3/clone3_clear_sighand.c | 29 ++-------------------- tools/testing/selftests/clone3/clone3_selftests.h | 26 +++++++++++++++++++ tools/testing/selftests/clone3/clone3_set_tid.c | 7 +++--- 4 files changed, 33 insertions(+), 30 deletions(-) diff --git a/tools/testing/selftests/clone3/clone3.c b/tools/testing/selftests/clone3/clone3.c index 4669b3d418e7..f14c269a5a18 100644 --- a/tools/testing/selftests/clone3/clone3.c +++ b/tools/testing/selftests/clone3/clone3.c @@ -131,6 +131,7 @@ int main(int argc, char *argv[]) uid_t uid = getuid(); + test_clone3_supported(); ksft_print_header(); ksft_set_plan(17); diff --git a/tools/testing/selftests/clone3/clone3_clear_sighand.c b/tools/testing/selftests/clone3/clone3_clear_sighand.c index 456783ad19d6..9e1af8aa7698 100644 --- a/tools/testing/selftests/clone3/clone3_clear_sighand.c +++ b/tools/testing/selftests/clone3/clone3_clear_sighand.c @@ -20,32 +20,6 @@ #define CLONE_CLEAR_SIGHAND 0x100000000ULL #endif -static void test_clone3_supported(void) -{ - pid_t pid; - struct clone_args args = {}; - - if (__NR_clone3 < 0) - ksft_exit_skip("clone3() syscall is not supported\n"); - - /* Set to something that will always cause EINVAL. 
*/ - args.exit_signal = -1; - pid = sys_clone3(&args, sizeof(args)); - if (!pid) - exit(EXIT_SUCCESS); - - if (pid > 0) { - wait(NULL); - ksft_exit_fail_msg( - "Managed to create child process with invalid exit_signal\n"); - } - - if (errno == ENOSYS) - ksft_exit_skip("clone3() syscall is not supported\n"); - - ksft_print_msg("clone3() syscall supported\n"); -} - static void nop_handler(int signo) { } @@ -145,9 +119,10 @@ static void test_clone3_clear_sighand(void) int main(int argc, char **argv) { ksft_print_header(); + test_clone3_supported(); + ksft_set_plan(1); - test_clone3_supported(); test_clone3_clear_sighand(); return ksft_exit_pass(); diff --git a/tools/testing/selftests/clone3/clone3_selftests.h b/tools/testing/selftests/clone3/clone3_selftests.h index 0e3dea58855f..a3f2c8ad8bcc 100644 --- a/tools/testing/selftests/clone3/clone3_selftests.h +++ b/tools/testing/selftests/clone3/clone3_selftests.h @@ -34,4 +34,30 @@ static pid_t sys_clone3(struct clone_args *args, size_t size) return syscall(__NR_clone3, args, size); } +static inline void test_clone3_supported(void) +{ + pid_t pid; + struct clone_args args = {}; + + if (__NR_clone3 < 0) + ksft_exit_skip("clone3() syscall is not supported\n"); + + /* Set to something that will always cause EINVAL. 
*/ + args.exit_signal = -1; + pid = sys_clone3(&args, sizeof(args)); + if (!pid) + exit(EXIT_SUCCESS); + + if (pid > 0) { + wait(NULL); + ksft_exit_fail_msg( + "Managed to create child process with invalid exit_signal\n"); + } + + if (errno == ENOSYS) + ksft_exit_skip("clone3() syscall is not supported\n"); + + ksft_print_msg("clone3() syscall supported\n"); +} + #endif /* _CLONE3_SELFTESTS_H */ diff --git a/tools/testing/selftests/clone3/clone3_set_tid.c b/tools/testing/selftests/clone3/clone3_set_tid.c index c6309f5d7d88..25beb22f35b5 100644 --- a/tools/testing/selftests/clone3/clone3_set_tid.c +++ b/tools/testing/selftests/clone3/clone3_set_tid.c @@ -156,12 +156,13 @@ int main(int argc, char *argv[]) pid_t pid, ns1, ns2, ns3, ns_pid; pid_t set_tid[MAX_PID_NS_LEVEL * 2]; - if (pipe(pipe_1) < 0 || pipe(pipe_2) < 0) - ksft_exit_fail_msg("pipe() failed\n"); - ksft_print_header(); + test_clone3_supported(); ksft_set_plan(29); + if (pipe(pipe_1) < 0 || pipe(pipe_2) < 0) + ksft_exit_fail_msg("pipe() failed\n"); + f = fopen("/proc/sys/kernel/pid_max", "r"); if (f == NULL) ksft_exit_fail_msg( -- cgit v1.2.3