author     Linus Torvalds <torvalds@linux-foundation.org>  2023-04-24 22:52:35 +0300
committer  Linus Torvalds <torvalds@linux-foundation.org>  2023-04-24 22:52:35 +0300
commit     3323ddce085cdb33331c2c1bb7a88233023566a9 (patch)
tree       8954fe69a603a4980b3cf8b4a6a2a4012f3fe49d /kernel
parent     a632b76b427d886911221331f4bfcd44a3e58197 (diff)
parent     6e890c5d5021ca7e69bbe203fde42447874d9a82 (diff)
download   linux-3323ddce085cdb33331c2c1bb7a88233023566a9.tar.xz
Merge tag 'v6.4/kernel.user_worker' of git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux
Pull user work thread updates from Christian Brauner:
 "This contains the work generalizing the ability to create a kernel
  worker from a userspace process.

  Such user workers will run with the same credentials as the userspace
  process they were created from, providing stronger security and
  accounting guarantees than the traditional override_creds() approach
  ever could've hoped for.

  The original work was heavily based on and optimized for the needs of
  io_uring, which was the first user. However, as it quickly turned out,
  the ability to create user workers inheriting properties from a
  userspace process is generally useful.

  The vhost subsystem currently creates workers using the kthread api.
  The consequence of using the kthread api is that RLIMITs don't work
  correctly, as they are inherited from kthreadd. This leads to bugs
  where more workers are created than would be allowed by the RLIMITs
  of the userspace process on whose behalf the workers are created.

  Problems like this disappear with user workers created from the
  userspace processes for which they perform the work.

  In addition, providing this api allows vhost to remove additional
  complexity. For example, cgroup and mm sharing will just work out of
  the box with user workers based on the relevant userspace process,
  instead of manually ensuring the correct cgroup and mm contexts are
  used.

  So the vhost subsystem should simply be made to use the same mechanism
  as io_uring. To this end, the original mechanism used for
  create_io_thread() is generalized into user workers:

   - Introduce PF_USER_WORKER as a generic indicator that a given task
     is a user worker, i.e., a kernel task that was created from a
     userspace process. A PF_IO_WORKER thread is now just a specialized
     version of PF_USER_WORKER, so io_uring io workers raise both flags.

   - Make copy_process() available to core kernel code.

   - Extend struct kernel_clone_args with bitfields that tell
     copy_process():

      - to create a user worker (raise PF_USER_WORKER)
      - to not inherit any files from the userspace process
      - to ignore signals

  After all generic changes are in place, the vhost subsystem implements
  a new dedicated vhost api based on user workers. Finally, vhost is
  switched to rely on the new api, moving it off of kthreads.

  Thanks to Mike for sticking it out and making it through this rather
  arduous journey"

* tag 'v6.4/kernel.user_worker' of git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux:
  vhost: use vhost_tasks for worker threads
  vhost: move worker thread fields to new struct
  vhost_task: Allow vhost layer to use copy_process
  fork: allow kernel code to call copy_process
  fork: Add kernel_clone_args flag to ignore signals
  fork: add kernel_clone_args flag to not dup/clone files
  fork/vm: Move common PF_IO_WORKER behavior to new flag
  kernel: Make io_thread and kthread bit fields
  kthread: Pass in the thread's name during creation
  kernel: Allow a kernel thread's name to be set in copy_process
  csky: Remove kernel_thread declaration
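For orientation, a minimal caller-side sketch of the generalized interface (it mirrors vhost_task_create()/vhost_task_start() in the diff below; worker_fn, worker_arg and the error handling are illustrative placeholders, not part of this commit):

	struct kernel_clone_args args = {
		.flags          = CLONE_FS | CLONE_UNTRACED | CLONE_VM,
		.fn             = worker_fn,	/* hypothetical worker function */
		.fn_arg         = worker_arg,	/* hypothetical argument passed to fn */
		.name           = "my-worker",	/* becomes the task's comm */
		.user_worker    = 1,		/* raise PF_USER_WORKER */
		.no_files       = 1,		/* do not inherit the caller's files */
		.ignore_signals = 1,		/* copy_process() calls ignore_signals() */
	};
	struct task_struct *tsk;

	tsk = copy_process(NULL, 0, NUMA_NO_NODE, &args);
	if (IS_ERR(tsk))
		return PTR_ERR(tsk);
	wake_up_new_task(tsk);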
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile       1
-rw-r--r--  kernel/fork.c        25
-rw-r--r--  kernel/kthread.c     33
-rw-r--r--  kernel/vhost_task.c 117
4 files changed, 151 insertions, 25 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 10ef068f598d..6fc72b3afbde 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -15,6 +15,7 @@ obj-y = fork.o exec_domain.o panic.o \
obj-$(CONFIG_USERMODE_DRIVER) += usermode_driver.o
obj-$(CONFIG_MODULES) += kmod.o
obj-$(CONFIG_MULTIUSER) += groups.o
+obj-$(CONFIG_VHOST_TASK) += vhost_task.o
ifdef CONFIG_FUNCTION_TRACER
# Do not trace internal ftrace files
diff --git a/kernel/fork.c b/kernel/fork.c
index ea332319dffe..d6cd5849eb51 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1626,7 +1626,8 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
return 0;
}
-static int copy_files(unsigned long clone_flags, struct task_struct *tsk)
+static int copy_files(unsigned long clone_flags, struct task_struct *tsk,
+ int no_files)
{
struct files_struct *oldf, *newf;
int error = 0;
@@ -1638,6 +1639,11 @@ static int copy_files(unsigned long clone_flags, struct task_struct *tsk)
if (!oldf)
goto out;
+ if (no_files) {
+ tsk->files = NULL;
+ goto out;
+ }
+
if (clone_flags & CLONE_FILES) {
atomic_inc(&oldf->count);
goto out;
@@ -2009,7 +2015,7 @@ static void rv_task_fork(struct task_struct *p)
* parts of the process environment (as per the clone
* flags). The actual kick-off is left to the caller.
*/
-static __latent_entropy struct task_struct *copy_process(
+__latent_entropy struct task_struct *copy_process(
struct pid *pid,
int trace,
int node,
@@ -2102,6 +2108,8 @@ static __latent_entropy struct task_struct *copy_process(
p->flags &= ~PF_KTHREAD;
if (args->kthread)
p->flags |= PF_KTHREAD;
+ if (args->user_worker)
+ p->flags |= PF_USER_WORKER;
if (args->io_thread) {
/*
* Mark us an IO worker, and block any signal that isn't
@@ -2111,6 +2119,9 @@ static __latent_entropy struct task_struct *copy_process(
siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP));
}
+ if (args->name)
+ strscpy_pad(p->comm, args->name, sizeof(p->comm));
+
p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL;
/*
* Clear TID on mm_release()?
@@ -2253,7 +2264,7 @@ static __latent_entropy struct task_struct *copy_process(
retval = copy_semundo(clone_flags, p);
if (retval)
goto bad_fork_cleanup_security;
- retval = copy_files(clone_flags, p);
+ retval = copy_files(clone_flags, p, args->no_files);
if (retval)
goto bad_fork_cleanup_semundo;
retval = copy_fs(clone_flags, p);
@@ -2278,6 +2289,9 @@ static __latent_entropy struct task_struct *copy_process(
if (retval)
goto bad_fork_cleanup_io;
+ if (args->ignore_signals)
+ ignore_signals(p);
+
stackleak_task_init(p);
if (pid != &init_struct_pid) {
@@ -2626,6 +2640,7 @@ struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node)
.fn = fn,
.fn_arg = arg,
.io_thread = 1,
+ .user_worker = 1,
};
return copy_process(NULL, 0, node, &args);
@@ -2729,7 +2744,8 @@ pid_t kernel_clone(struct kernel_clone_args *args)
/*
* Create a kernel thread.
*/
-pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
+pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name,
+ unsigned long flags)
{
struct kernel_clone_args args = {
.flags = ((lower_32_bits(flags) | CLONE_VM |
@@ -2737,6 +2753,7 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
.exit_signal = (lower_32_bits(flags) & CSIGNAL),
.fn = fn,
.fn_arg = arg,
+ .name = name,
.kthread = 1,
};
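A minimal sketch of the updated kernel_thread() call from a caller's side (my_threadfn and my_data are illustrative placeholders, not part of this commit); create_kthread() in kernel/kthread.c below is the in-tree example:

	pid = kernel_thread(my_threadfn, my_data, "my-kthread",
			    CLONE_FS | CLONE_FILES | SIGCHLD);
	if (pid < 0)
		return pid;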
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 7e6751b29101..4bc6e0971ec9 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -38,6 +38,7 @@ struct task_struct *kthreadd_task;
struct kthread_create_info
{
/* Information passed to kthread() from kthreadd. */
+ char *full_name;
int (*threadfn)(void *data);
void *data;
int node;
@@ -343,10 +344,12 @@ static int kthread(void *_create)
/* Release the structure when caller killed by a fatal signal. */
done = xchg(&create->done, NULL);
if (!done) {
+ kfree(create->full_name);
kfree(create);
kthread_exit(-EINTR);
}
+ self->full_name = create->full_name;
self->threadfn = threadfn;
self->data = data;
@@ -396,11 +399,13 @@ static void create_kthread(struct kthread_create_info *create)
current->pref_node_fork = create->node;
#endif
/* We want our own signal handler (we take no signals by default). */
- pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);
+ pid = kernel_thread(kthread, create, create->full_name,
+ CLONE_FS | CLONE_FILES | SIGCHLD);
if (pid < 0) {
/* Release the structure when caller killed by a fatal signal. */
struct completion *done = xchg(&create->done, NULL);
+ kfree(create->full_name);
if (!done) {
kfree(create);
return;
@@ -427,6 +432,11 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
create->data = data;
create->node = node;
create->done = &done;
+ create->full_name = kvasprintf(GFP_KERNEL, namefmt, args);
+ if (!create->full_name) {
+ task = ERR_PTR(-ENOMEM);
+ goto free_create;
+ }
spin_lock(&kthread_create_lock);
list_add_tail(&create->list, &kthread_create_list);
@@ -453,26 +463,7 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
wait_for_completion(&done);
}
task = create->result;
- if (!IS_ERR(task)) {
- char name[TASK_COMM_LEN];
- va_list aq;
- int len;
-
- /*
- * task is already visible to other tasks, so updating
- * COMM must be protected.
- */
- va_copy(aq, args);
- len = vsnprintf(name, sizeof(name), namefmt, aq);
- va_end(aq);
- if (len >= TASK_COMM_LEN) {
- struct kthread *kthread = to_kthread(task);
-
- /* leave it truncated when out of memory. */
- kthread->full_name = kvasprintf(GFP_KERNEL, namefmt, args);
- }
- set_task_comm(task, name);
- }
+free_create:
kfree(create);
return task;
}
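From a kthread user's point of view nothing changes here: the name format string is now resolved once with kvasprintf() and handed to copy_process() via kernel_clone_args, rather than being applied afterwards with set_task_comm(). A minimal sketch of an unchanged caller (my_threadfn, my_data and cpu are placeholders):

	struct task_struct *t;

	t = kthread_run(my_threadfn, my_data, "my-worker/%d", cpu);
	if (IS_ERR(t))
		return PTR_ERR(t);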
diff --git a/kernel/vhost_task.c b/kernel/vhost_task.c
new file mode 100644
index 000000000000..b7cbd66f889e
--- /dev/null
+++ b/kernel/vhost_task.c
@@ -0,0 +1,117 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2021 Oracle Corporation
+ */
+#include <linux/slab.h>
+#include <linux/completion.h>
+#include <linux/sched/task.h>
+#include <linux/sched/vhost_task.h>
+#include <linux/sched/signal.h>
+
+enum vhost_task_flags {
+ VHOST_TASK_FLAGS_STOP,
+};
+
+static int vhost_task_fn(void *data)
+{
+ struct vhost_task *vtsk = data;
+ int ret;
+
+ ret = vtsk->fn(vtsk->data);
+ complete(&vtsk->exited);
+ do_exit(ret);
+}
+
+/**
+ * vhost_task_stop - stop a vhost_task
+ * @vtsk: vhost_task to stop
+ *
+ * Callers must call vhost_task_should_stop and return from their worker
+ * function when it returns true;
+ */
+void vhost_task_stop(struct vhost_task *vtsk)
+{
+ pid_t pid = vtsk->task->pid;
+
+ set_bit(VHOST_TASK_FLAGS_STOP, &vtsk->flags);
+ wake_up_process(vtsk->task);
+ /*
+ * Make sure vhost_task_fn is no longer accessing the vhost_task before
+ * freeing it below. If userspace crashed or exited without closing,
+ * then the vhost_task->task could already be marked dead so
+ * kernel_wait will return early.
+ */
+ wait_for_completion(&vtsk->exited);
+ /*
+ * If we are just closing/removing a device and the parent process is
+ * not exiting then reap the task.
+ */
+ kernel_wait4(pid, NULL, __WCLONE, NULL);
+ kfree(vtsk);
+}
+EXPORT_SYMBOL_GPL(vhost_task_stop);
+
+/**
+ * vhost_task_should_stop - should the vhost task return from the work function
+ * @vtsk: vhost_task to stop
+ */
+bool vhost_task_should_stop(struct vhost_task *vtsk)
+{
+ return test_bit(VHOST_TASK_FLAGS_STOP, &vtsk->flags);
+}
+EXPORT_SYMBOL_GPL(vhost_task_should_stop);
+
+/**
+ * vhost_task_create - create a copy of a process to be used by the kernel
+ * @fn: thread stack
+ * @arg: data to be passed to fn
+ * @name: the thread's name
+ *
+ * This returns a specialized task for use by the vhost layer or NULL on
+ * failure. The returned task is inactive, and the caller must fire it up
+ * through vhost_task_start().
+ */
+struct vhost_task *vhost_task_create(int (*fn)(void *), void *arg,
+ const char *name)
+{
+ struct kernel_clone_args args = {
+ .flags = CLONE_FS | CLONE_UNTRACED | CLONE_VM,
+ .exit_signal = 0,
+ .fn = vhost_task_fn,
+ .name = name,
+ .user_worker = 1,
+ .no_files = 1,
+ .ignore_signals = 1,
+ };
+ struct vhost_task *vtsk;
+ struct task_struct *tsk;
+
+ vtsk = kzalloc(sizeof(*vtsk), GFP_KERNEL);
+ if (!vtsk)
+ return NULL;
+ init_completion(&vtsk->exited);
+ vtsk->data = arg;
+ vtsk->fn = fn;
+
+ args.fn_arg = vtsk;
+
+ tsk = copy_process(NULL, 0, NUMA_NO_NODE, &args);
+ if (IS_ERR(tsk)) {
+ kfree(vtsk);
+ return NULL;
+ }
+
+ vtsk->task = tsk;
+ return vtsk;
+}
+EXPORT_SYMBOL_GPL(vhost_task_create);
+
+/**
+ * vhost_task_start - start a vhost_task created with vhost_task_create
+ * @vtsk: vhost_task to wake up
+ */
+void vhost_task_start(struct vhost_task *vtsk)
+{
+ wake_up_new_task(vtsk->task);
+}
+EXPORT_SYMBOL_GPL(vhost_task_start);
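Putting the new API together, a minimal usage sketch (struct my_ctx and my_worker_fn are hypothetical; the in-tree consumer is the vhost worker code). Per the kerneldoc above, the worker function must check vhost_task_should_stop() and return once it reports true, and the creator must wake the inactive task with vhost_task_start():

	#include <linux/sched.h>
	#include <linux/sched/vhost_task.h>

	struct my_ctx {
		struct vhost_task *vtsk;
		/* ... queued work, wait queues, etc. ... */
	};

	static int my_worker_fn(void *data)
	{
		struct my_ctx *ctx = data;

		for (;;) {
			set_current_state(TASK_INTERRUPTIBLE);
			if (vhost_task_should_stop(ctx->vtsk)) {
				__set_current_state(TASK_RUNNING);
				return 0;
			}
			/* nothing to do in this sketch; sleep until woken */
			schedule();
		}
	}

	/* creation, e.g. while handling an ioctl on the vhost device: */
	ctx->vtsk = vhost_task_create(my_worker_fn, ctx, "vhost-worker");
	if (!ctx->vtsk)
		return -ENOMEM;
	vhost_task_start(ctx->vtsk);

	/* ... and on release: */
	vhost_task_stop(ctx->vtsk);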