summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--include/linux/pid.h3
-rw-r--r--include/linux/pid_namespace.h10
-rw-r--r--kernel/pid.c125
-rw-r--r--kernel/pid_namespace.c43
-rw-r--r--kernel/sysctl.c9
-rw-r--r--kernel/trace/pid_list.c2
-rw-r--r--kernel/trace/trace.h2
-rw-r--r--kernel/trace/trace_sched_switch.c2
-rw-r--r--tools/testing/selftests/pid_namespace/.gitignore1
-rw-r--r--tools/testing/selftests/pid_namespace/Makefile2
-rw-r--r--tools/testing/selftests/pid_namespace/pid_max.c358
11 files changed, 521 insertions, 36 deletions
diff --git a/include/linux/pid.h b/include/linux/pid.h
index fe575fcdb4af..98837a1ff0f3 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -108,9 +108,6 @@ extern void exchange_tids(struct task_struct *task, struct task_struct *old);
extern void transfer_pid(struct task_struct *old, struct task_struct *new,
enum pid_type);
-extern int pid_max;
-extern int pid_max_min, pid_max_max;
-
/*
* look up a PID in the hash table. Must be called with the tasklist_lock
* or rcu_read_lock() held.
diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index f9f9931e02d6..7c67a5811199 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -30,6 +30,7 @@ struct pid_namespace {
struct task_struct *child_reaper;
struct kmem_cache *pid_cachep;
unsigned int level;
+ int pid_max;
struct pid_namespace *parent;
#ifdef CONFIG_BSD_PROCESS_ACCT
struct fs_pin *bacct;
@@ -38,9 +39,14 @@ struct pid_namespace {
struct ucounts *ucounts;
int reboot; /* group exit code if this pidns was rebooted */
struct ns_common ns;
-#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
+ struct work_struct work;
+#ifdef CONFIG_SYSCTL
+ struct ctl_table_set set;
+ struct ctl_table_header *sysctls;
+#if defined(CONFIG_MEMFD_CREATE)
int memfd_noexec_scope;
#endif
+#endif
} __randomize_layout;
extern struct pid_namespace init_pid_ns;
@@ -117,6 +123,8 @@ static inline int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
extern struct pid_namespace *task_active_pid_ns(struct task_struct *tsk);
void pidhash_init(void);
void pid_idr_init(void);
+int register_pidns_sysctls(struct pid_namespace *pidns);
+void unregister_pidns_sysctls(struct pid_namespace *pidns);
static inline bool task_is_in_init_pid_ns(struct task_struct *tsk)
{
diff --git a/kernel/pid.c b/kernel/pid.c
index aa2a7d4da455..3a10a7b6fcf8 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -61,10 +61,8 @@ struct pid init_struct_pid = {
}, }
};
-int pid_max = PID_MAX_DEFAULT;
-
-int pid_max_min = RESERVED_PIDS + 1;
-int pid_max_max = PID_MAX_LIMIT;
+static int pid_max_min = RESERVED_PIDS + 1;
+static int pid_max_max = PID_MAX_LIMIT;
/*
* PID-map pages start out as NULL, they get allocated upon
@@ -83,6 +81,7 @@ struct pid_namespace init_pid_ns = {
#ifdef CONFIG_PID_NS
.ns.ops = &pidns_operations,
#endif
+ .pid_max = PID_MAX_DEFAULT,
#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
.memfd_noexec_scope = MEMFD_NOEXEC_SCOPE_EXEC,
#endif
@@ -191,6 +190,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
for (i = ns->level; i >= 0; i--) {
int tid = 0;
+ int pid_max = READ_ONCE(tmp->pid_max);
if (set_tid_size) {
tid = set_tid[ns->level - i];
@@ -644,17 +644,118 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
return fd;
}
+#ifdef CONFIG_SYSCTL
+static struct ctl_table_set *pid_table_root_lookup(struct ctl_table_root *root)
+{
+ return &task_active_pid_ns(current)->set;
+}
+
+static int set_is_seen(struct ctl_table_set *set)
+{
+ return &task_active_pid_ns(current)->set == set;
+}
+
+static int pid_table_root_permissions(struct ctl_table_header *head,
+ const struct ctl_table *table)
+{
+ struct pid_namespace *pidns =
+ container_of(head->set, struct pid_namespace, set);
+ int mode = table->mode;
+
+ if (ns_capable(pidns->user_ns, CAP_SYS_ADMIN) ||
+ uid_eq(current_euid(), make_kuid(pidns->user_ns, 0)))
+ mode = (mode & S_IRWXU) >> 6;
+ else if (in_egroup_p(make_kgid(pidns->user_ns, 0)))
+ mode = (mode & S_IRWXG) >> 3;
+ else
+ mode = mode & S_IROTH;
+ return (mode << 6) | (mode << 3) | mode;
+}
+
+static void pid_table_root_set_ownership(struct ctl_table_header *head,
+ kuid_t *uid, kgid_t *gid)
+{
+ struct pid_namespace *pidns =
+ container_of(head->set, struct pid_namespace, set);
+ kuid_t ns_root_uid;
+ kgid_t ns_root_gid;
+
+ ns_root_uid = make_kuid(pidns->user_ns, 0);
+ if (uid_valid(ns_root_uid))
+ *uid = ns_root_uid;
+
+ ns_root_gid = make_kgid(pidns->user_ns, 0);
+ if (gid_valid(ns_root_gid))
+ *gid = ns_root_gid;
+}
+
+static struct ctl_table_root pid_table_root = {
+ .lookup = pid_table_root_lookup,
+ .permissions = pid_table_root_permissions,
+ .set_ownership = pid_table_root_set_ownership,
+};
+
+static struct ctl_table pid_table[] = {
+ {
+ .procname = "pid_max",
+ .data = &init_pid_ns.pid_max,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &pid_max_min,
+ .extra2 = &pid_max_max,
+ },
+};
+#endif
+
+int register_pidns_sysctls(struct pid_namespace *pidns)
+{
+#ifdef CONFIG_SYSCTL
+ struct ctl_table *tbl;
+
+ setup_sysctl_set(&pidns->set, &pid_table_root, set_is_seen);
+
+ tbl = kmemdup(pid_table, sizeof(pid_table), GFP_KERNEL);
+ if (!tbl)
+ return -ENOMEM;
+ tbl->data = &pidns->pid_max;
+ pidns->pid_max = min(pid_max_max, max_t(int, pidns->pid_max,
+ PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
+
+ pidns->sysctls = __register_sysctl_table(&pidns->set, "kernel", tbl,
+ ARRAY_SIZE(pid_table));
+ if (!pidns->sysctls) {
+ kfree(tbl);
+ retire_sysctl_set(&pidns->set);
+ return -ENOMEM;
+ }
+#endif
+ return 0;
+}
+
+void unregister_pidns_sysctls(struct pid_namespace *pidns)
+{
+#ifdef CONFIG_SYSCTL
+ const struct ctl_table *tbl;
+
+ tbl = pidns->sysctls->ctl_table_arg;
+ unregister_sysctl_table(pidns->sysctls);
+ retire_sysctl_set(&pidns->set);
+ kfree(tbl);
+#endif
+}
+
void __init pid_idr_init(void)
{
/* Verify no one has done anything silly: */
BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_ADDING);
/* bump default and minimum pid_max based on number of cpus */
- pid_max = min(pid_max_max, max_t(int, pid_max,
- PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
+ init_pid_ns.pid_max = min(pid_max_max, max_t(int, init_pid_ns.pid_max,
+ PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
pid_max_min = max_t(int, pid_max_min,
PIDS_PER_CPU_MIN * num_possible_cpus());
- pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min);
+ pr_info("pid_max: default: %u minimum: %u\n", init_pid_ns.pid_max, pid_max_min);
idr_init(&init_pid_ns.idr);
@@ -665,6 +766,16 @@ void __init pid_idr_init(void)
NULL);
}
+static __init int pid_namespace_sysctl_init(void)
+{
+#ifdef CONFIG_SYSCTL
+ /* "kernel" directory will have already been initialized. */
+ BUG_ON(register_pidns_sysctls(&init_pid_ns));
+#endif
+ return 0;
+}
+subsys_initcall(pid_namespace_sysctl_init);
+
static struct file *__pidfd_fget(struct task_struct *task, int fd)
{
struct file *file;
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index d70ab49d5b4a..f1ffa032fc32 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -70,6 +70,8 @@ static void dec_pid_namespaces(struct ucounts *ucounts)
dec_ucount(ucounts, UCOUNT_PID_NAMESPACES);
}
+static void destroy_pid_namespace_work(struct work_struct *work);
+
static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns,
struct pid_namespace *parent_pid_ns)
{
@@ -105,17 +107,27 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
goto out_free_idr;
ns->ns.ops = &pidns_operations;
+ ns->pid_max = parent_pid_ns->pid_max;
+ err = register_pidns_sysctls(ns);
+ if (err)
+ goto out_free_inum;
+
refcount_set(&ns->ns.count, 1);
ns->level = level;
ns->parent = get_pid_ns(parent_pid_ns);
ns->user_ns = get_user_ns(user_ns);
ns->ucounts = ucounts;
ns->pid_allocated = PIDNS_ADDING;
+ INIT_WORK(&ns->work, destroy_pid_namespace_work);
+
#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
ns->memfd_noexec_scope = pidns_memfd_noexec_scope(parent_pid_ns);
#endif
+
return ns;
+out_free_inum:
+ ns_free_inum(&ns->ns);
out_free_idr:
idr_destroy(&ns->idr);
kmem_cache_free(pid_ns_cachep, ns);
@@ -137,12 +149,28 @@ static void delayed_free_pidns(struct rcu_head *p)
static void destroy_pid_namespace(struct pid_namespace *ns)
{
+ unregister_pidns_sysctls(ns);
+
ns_free_inum(&ns->ns);
idr_destroy(&ns->idr);
call_rcu(&ns->rcu, delayed_free_pidns);
}
+static void destroy_pid_namespace_work(struct work_struct *work)
+{
+ struct pid_namespace *ns =
+ container_of(work, struct pid_namespace, work);
+
+ do {
+ struct pid_namespace *parent;
+
+ parent = ns->parent;
+ destroy_pid_namespace(ns);
+ ns = parent;
+ } while (ns != &init_pid_ns && refcount_dec_and_test(&ns->ns.count));
+}
+
struct pid_namespace *copy_pid_ns(unsigned long flags,
struct user_namespace *user_ns, struct pid_namespace *old_ns)
{
@@ -155,15 +183,8 @@ struct pid_namespace *copy_pid_ns(unsigned long flags,
void put_pid_ns(struct pid_namespace *ns)
{
- struct pid_namespace *parent;
-
- while (ns != &init_pid_ns) {
- parent = ns->parent;
- if (!refcount_dec_and_test(&ns->ns.count))
- break;
- destroy_pid_namespace(ns);
- ns = parent;
- }
+ if (ns && ns != &init_pid_ns && refcount_dec_and_test(&ns->ns.count))
+ schedule_work(&ns->work);
}
EXPORT_SYMBOL_GPL(put_pid_ns);
@@ -274,6 +295,7 @@ static int pid_ns_ctl_handler(const struct ctl_table *table, int write,
next = idr_get_cursor(&pid_ns->idr) - 1;
tmp.data = &next;
+ tmp.extra2 = &pid_ns->pid_max;
ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
if (!ret && write)
idr_set_cursor(&pid_ns->idr, next + 1);
@@ -281,7 +303,6 @@ static int pid_ns_ctl_handler(const struct ctl_table *table, int write,
return ret;
}
-extern int pid_max;
static struct ctl_table pid_ns_ctl_table[] = {
{
.procname = "ns_last_pid",
@@ -289,7 +310,7 @@ static struct ctl_table pid_ns_ctl_table[] = {
.mode = 0666, /* permissions are checked in the handler */
.proc_handler = pid_ns_ctl_handler,
.extra1 = SYSCTL_ZERO,
- .extra2 = &pid_max,
+ .extra2 = &init_pid_ns.pid_max,
},
};
#endif /* CONFIG_CHECKPOINT_RESTORE */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 5c9202cb8f59..7ae7a4136855 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1804,15 +1804,6 @@ static struct ctl_table kern_table[] = {
},
#endif
{
- .procname = "pid_max",
- .data = &pid_max,
- .maxlen = sizeof (int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &pid_max_min,
- .extra2 = &pid_max_max,
- },
- {
.procname = "panic_on_oops",
.data = &panic_on_oops,
.maxlen = sizeof(int),
diff --git a/kernel/trace/pid_list.c b/kernel/trace/pid_list.c
index 4966e6bbdf6f..c62b9b3cfb3d 100644
--- a/kernel/trace/pid_list.c
+++ b/kernel/trace/pid_list.c
@@ -414,7 +414,7 @@ struct trace_pid_list *trace_pid_list_alloc(void)
int i;
/* According to linux/thread.h, pids can be no bigger that 30 bits */
- WARN_ON_ONCE(pid_max > (1 << 30));
+ WARN_ON_ONCE(init_pid_ns.pid_max > (1 << 30));
pid_list = kzalloc(sizeof(*pid_list), GFP_KERNEL);
if (!pid_list)
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 9691b47b5f3d..179676db622e 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -717,8 +717,6 @@ extern unsigned long tracing_thresh;
/* PID filtering */
-extern int pid_max;
-
bool trace_find_filtered_pid(struct trace_pid_list *filtered_pids,
pid_t search_pid);
bool trace_ignore_this_task(struct trace_pid_list *filtered_pids,
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 573b5d8e8a28..cb49f7279dc8 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -442,7 +442,7 @@ int trace_alloc_tgid_map(void)
if (tgid_map)
return 0;
- tgid_map_max = pid_max;
+ tgid_map_max = init_pid_ns.pid_max;
map = kvcalloc(tgid_map_max + 1, sizeof(*tgid_map),
GFP_KERNEL);
if (!map)
diff --git a/tools/testing/selftests/pid_namespace/.gitignore b/tools/testing/selftests/pid_namespace/.gitignore
index 93ab9d7e5b7e..5118f0f3edf4 100644
--- a/tools/testing/selftests/pid_namespace/.gitignore
+++ b/tools/testing/selftests/pid_namespace/.gitignore
@@ -1 +1,2 @@
+pid_max
regression_enomem
diff --git a/tools/testing/selftests/pid_namespace/Makefile b/tools/testing/selftests/pid_namespace/Makefile
index 9286a1d22cd3..b972f55d07ae 100644
--- a/tools/testing/selftests/pid_namespace/Makefile
+++ b/tools/testing/selftests/pid_namespace/Makefile
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
CFLAGS += -g $(KHDR_INCLUDES)
-TEST_GEN_PROGS = regression_enomem
+TEST_GEN_PROGS = regression_enomem pid_max
LOCAL_HDRS += $(selfdir)/pidfd/pidfd.h
diff --git a/tools/testing/selftests/pid_namespace/pid_max.c b/tools/testing/selftests/pid_namespace/pid_max.c
new file mode 100644
index 000000000000..51c414faabb0
--- /dev/null
+++ b/tools/testing/selftests/pid_namespace/pid_max.c
@@ -0,0 +1,358 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#define _GNU_SOURCE
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/types.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <syscall.h>
+#include <sys/wait.h>
+
+#include "../kselftest_harness.h"
+#include "../pidfd/pidfd.h"
+
+#define __STACK_SIZE (8 * 1024 * 1024)
+static pid_t do_clone(int (*fn)(void *), void *arg, int flags)
+{
+ char *stack;
+ pid_t ret;
+
+ stack = malloc(__STACK_SIZE);
+ if (!stack)
+ return -ENOMEM;
+
+#ifdef __ia64__
+ ret = __clone2(fn, stack, __STACK_SIZE, flags | SIGCHLD, arg);
+#else
+ ret = clone(fn, stack + __STACK_SIZE, flags | SIGCHLD, arg);
+#endif
+ free(stack);
+ return ret;
+}
+
+static int pid_max_cb(void *data)
+{
+ int fd, ret;
+ pid_t pid;
+
+ ret = mount("", "/", NULL, MS_PRIVATE | MS_REC, 0);
+ if (ret) {
+ fprintf(stderr, "%m - Failed to make rootfs private mount\n");
+ return -1;
+ }
+
+ umount2("/proc", MNT_DETACH);
+
+ ret = mount("proc", "/proc", "proc", 0, NULL);
+ if (ret) {
+ fprintf(stderr, "%m - Failed to mount proc\n");
+ return -1;
+ }
+
+ fd = open("/proc/sys/kernel/pid_max", O_RDWR | O_CLOEXEC | O_NOCTTY);
+ if (fd < 0) {
+ fprintf(stderr, "%m - Failed to open pid_max\n");
+ return -1;
+ }
+
+ ret = write(fd, "500", sizeof("500") - 1);
+ if (ret < 0) {
+ fprintf(stderr, "%m - Failed to write pid_max\n");
+ return -1;
+ }
+
+ for (int i = 0; i < 501; i++) {
+ pid = fork();
+ if (pid == 0)
+ exit(EXIT_SUCCESS);
+ wait_for_pid(pid);
+ if (pid > 500) {
+ fprintf(stderr, "Managed to create pid number beyond limit\n");
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
+static int pid_max_nested_inner(void *data)
+{
+ int fret = -1;
+ pid_t pids[2];
+ int fd, i, ret;
+
+ ret = mount("", "/", NULL, MS_PRIVATE | MS_REC, 0);
+ if (ret) {
+ fprintf(stderr, "%m - Failed to make rootfs private mount\n");
+ return fret;
+ }
+
+ umount2("/proc", MNT_DETACH);
+
+ ret = mount("proc", "/proc", "proc", 0, NULL);
+ if (ret) {
+ fprintf(stderr, "%m - Failed to mount proc\n");
+ return fret;
+ }
+
+ fd = open("/proc/sys/kernel/pid_max", O_RDWR | O_CLOEXEC | O_NOCTTY);
+ if (fd < 0) {
+ fprintf(stderr, "%m - Failed to open pid_max\n");
+ return fret;
+ }
+
+ ret = write(fd, "500", sizeof("500") - 1);
+ close(fd);
+ if (ret < 0) {
+ fprintf(stderr, "%m - Failed to write pid_max\n");
+ return fret;
+ }
+
+ pids[0] = fork();
+ if (pids[0] < 0) {
+ fprintf(stderr, "Failed to create first new process\n");
+ return fret;
+ }
+
+ if (pids[0] == 0)
+ exit(EXIT_SUCCESS);
+
+ pids[1] = fork();
+ wait_for_pid(pids[0]);
+ if (pids[1] >= 0) {
+ if (pids[1] == 0)
+ exit(EXIT_SUCCESS);
+ wait_for_pid(pids[1]);
+
+ fprintf(stderr, "Managed to create process even though ancestor pid namespace had a limit\n");
+ return fret;
+ }
+
+ /* Now make sure that we wrap pids at 400. */
+ for (i = 0; i < 510; i++) {
+ pid_t pid;
+
+ pid = fork();
+ if (pid < 0)
+ return fret;
+
+ if (pid == 0)
+ exit(EXIT_SUCCESS);
+
+ wait_for_pid(pid);
+ if (pid >= 500) {
+ fprintf(stderr, "Managed to create process with pid %d beyond configured limit\n", pid);
+ return fret;
+ }
+ }
+
+ return 0;
+}
+
+static int pid_max_nested_outer(void *data)
+{
+ int fret = -1, nr_procs = 400;
+ pid_t pids[1000];
+ int fd, i, ret;
+ pid_t pid;
+
+ ret = mount("", "/", NULL, MS_PRIVATE | MS_REC, 0);
+ if (ret) {
+ fprintf(stderr, "%m - Failed to make rootfs private mount\n");
+ return fret;
+ }
+
+ umount2("/proc", MNT_DETACH);
+
+ ret = mount("proc", "/proc", "proc", 0, NULL);
+ if (ret) {
+ fprintf(stderr, "%m - Failed to mount proc\n");
+ return fret;
+ }
+
+ fd = open("/proc/sys/kernel/pid_max", O_RDWR | O_CLOEXEC | O_NOCTTY);
+ if (fd < 0) {
+ fprintf(stderr, "%m - Failed to open pid_max\n");
+ return fret;
+ }
+
+ ret = write(fd, "400", sizeof("400") - 1);
+ close(fd);
+ if (ret < 0) {
+ fprintf(stderr, "%m - Failed to write pid_max\n");
+ return fret;
+ }
+
+ /*
+ * Create 397 processes. This leaves room for do_clone() (398) and
+ * one more 399. So creating another process needs to fail.
+ */
+ for (nr_procs = 0; nr_procs < 396; nr_procs++) {
+ pid = fork();
+ if (pid < 0)
+ goto reap;
+
+ if (pid == 0)
+ exit(EXIT_SUCCESS);
+
+ pids[nr_procs] = pid;
+ }
+
+ pid = do_clone(pid_max_nested_inner, NULL, CLONE_NEWPID | CLONE_NEWNS);
+ if (pid < 0) {
+ fprintf(stderr, "%m - Failed to clone nested pidns\n");
+ goto reap;
+ }
+
+ if (wait_for_pid(pid)) {
+ fprintf(stderr, "%m - Nested pid_max failed\n");
+ goto reap;
+ }
+
+ fret = 0;
+
+reap:
+ for (int i = 0; i < nr_procs; i++)
+ wait_for_pid(pids[i]);
+
+ return fret;
+}
+
+static int pid_max_nested_limit_inner(void *data)
+{
+ int fret = -1, nr_procs = 400;
+ int fd, ret;
+ pid_t pid;
+ pid_t pids[1000];
+
+ ret = mount("", "/", NULL, MS_PRIVATE | MS_REC, 0);
+ if (ret) {
+ fprintf(stderr, "%m - Failed to make rootfs private mount\n");
+ return fret;
+ }
+
+ umount2("/proc", MNT_DETACH);
+
+ ret = mount("proc", "/proc", "proc", 0, NULL);
+ if (ret) {
+ fprintf(stderr, "%m - Failed to mount proc\n");
+ return fret;
+ }
+
+ fd = open("/proc/sys/kernel/pid_max", O_RDWR | O_CLOEXEC | O_NOCTTY);
+ if (fd < 0) {
+ fprintf(stderr, "%m - Failed to open pid_max\n");
+ return fret;
+ }
+
+ ret = write(fd, "500", sizeof("500") - 1);
+ close(fd);
+ if (ret < 0) {
+ fprintf(stderr, "%m - Failed to write pid_max\n");
+ return fret;
+ }
+
+ for (nr_procs = 0; nr_procs < 500; nr_procs++) {
+ pid = fork();
+ if (pid < 0)
+ break;
+
+ if (pid == 0)
+ exit(EXIT_SUCCESS);
+
+ pids[nr_procs] = pid;
+ }
+
+ if (nr_procs >= 400) {
+ fprintf(stderr, "Managed to create processes beyond the configured outer limit\n");
+ goto reap;
+ }
+
+ fret = 0;
+
+reap:
+ for (int i = 0; i < nr_procs; i++)
+ wait_for_pid(pids[i]);
+
+ return fret;
+}
+
+static int pid_max_nested_limit_outer(void *data)
+{
+ int fd, ret;
+ pid_t pid;
+
+ ret = mount("", "/", NULL, MS_PRIVATE | MS_REC, 0);
+ if (ret) {
+ fprintf(stderr, "%m - Failed to make rootfs private mount\n");
+ return -1;
+ }
+
+ umount2("/proc", MNT_DETACH);
+
+ ret = mount("proc", "/proc", "proc", 0, NULL);
+ if (ret) {
+ fprintf(stderr, "%m - Failed to mount proc\n");
+ return -1;
+ }
+
+ fd = open("/proc/sys/kernel/pid_max", O_RDWR | O_CLOEXEC | O_NOCTTY);
+ if (fd < 0) {
+ fprintf(stderr, "%m - Failed to open pid_max\n");
+ return -1;
+ }
+
+ ret = write(fd, "400", sizeof("400") - 1);
+ close(fd);
+ if (ret < 0) {
+ fprintf(stderr, "%m - Failed to write pid_max\n");
+ return -1;
+ }
+
+ pid = do_clone(pid_max_nested_limit_inner, NULL, CLONE_NEWPID | CLONE_NEWNS);
+ if (pid < 0) {
+ fprintf(stderr, "%m - Failed to clone nested pidns\n");
+ return -1;
+ }
+
+ if (wait_for_pid(pid)) {
+ fprintf(stderr, "%m - Nested pid_max failed\n");
+ return -1;
+ }
+
+ return 0;
+}
+
+TEST(pid_max_simple)
+{
+ pid_t pid;
+
+
+ pid = do_clone(pid_max_cb, NULL, CLONE_NEWPID | CLONE_NEWNS);
+ ASSERT_GT(pid, 0);
+ ASSERT_EQ(0, wait_for_pid(pid));
+}
+
+TEST(pid_max_nested_limit)
+{
+ pid_t pid;
+
+ pid = do_clone(pid_max_nested_limit_outer, NULL, CLONE_NEWPID | CLONE_NEWNS);
+ ASSERT_GT(pid, 0);
+ ASSERT_EQ(0, wait_for_pid(pid));
+}
+
+TEST(pid_max_nested)
+{
+ pid_t pid;
+
+ pid = do_clone(pid_max_nested_outer, NULL, CLONE_NEWPID | CLONE_NEWNS);
+ ASSERT_GT(pid, 0);
+ ASSERT_EQ(0, wait_for_pid(pid));
+}
+
+TEST_HARNESS_MAIN