summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2024-07-15 22:27:39 +0300
committerLinus Torvalds <torvalds@linux-foundation.org>2024-07-15 22:27:39 +0300
commit1b074abe885f43b2c207b5e748ffa60604dbc020 (patch)
tree5b0b973950d2a4e20d0b4c4544eed8188fe0814e /fs
parentf608cabaeda472887c008e42398e8fca14e8f411 (diff)
parentca567df74a28a9fb368c6b2d93e864113f73f5c2 (diff)
downloadlinux-1b074abe885f43b2c207b5e748ffa60604dbc020.tar.xz
Merge tag 'vfs-6.11.nsfs' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs
Pull namespace-fs updates from Christian Brauner: "This adds ioctls allowing to translate PIDs between PID namespaces. The motivating use-case comes from LXCFS which is a tiny fuse filesystem used to virtualize various aspects of procfs. LXCFS is run on the host. The files and directories it creates can be bind-mounted by e.g. a container at startup and mounted over the various procfs files the container wishes to have virtualized. When e.g. a read request for uptime is received, LXCFS will receive the pid of the reader. In order to virtualize the corresponding read, LXCFS needs to know the pid of the init process of the reader's pid namespace. In order to do this, LXCFS first needs to fork() two helper processes. The first helper process setns() to the readers pid namespace. The second helper process is needed to create a process that is a proper member of the pid namespace. The second helper process then creates a ucred message with ucred.pid set to 1 and sends it back to LXCFS. The kernel will translate the ucred.pid field to the corresponding pid number in LXCFS's pid namespace. This way LXCFS can learn the init pid number of the reader's pid namespace and can go on to virtualize. Since these two forks() are costly LXCFS maintains an init pid cache that caches a given pid for a fixed amount of time. The cache is pruned during new read requests. However, even with the cache the hit of the two forks() is singificant when a very large number of containers are running. So this adds a simple set of ioctls that let's a caller translate PIDs from and into a given PID namespace. This significantly improves performance with a very simple change. To protect against races pidfds can be used to check whether the process is still valid" * tag 'vfs-6.11.nsfs' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs: nsfs: add pid translation ioctls
Diffstat (limited to 'fs')
-rw-r--r--fs/nsfs.c53
1 files changed, 52 insertions, 1 deletions
diff --git a/fs/nsfs.c b/fs/nsfs.c
index af352dadffe1..ad6bb91a3e23 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -8,6 +8,7 @@
#include <linux/magic.h>
#include <linux/ktime.h>
#include <linux/seq_file.h>
+#include <linux/pid_namespace.h>
#include <linux/user_namespace.h>
#include <linux/nsfs.h>
#include <linux/uaccess.h>
@@ -124,9 +125,12 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl,
unsigned long arg)
{
struct user_namespace *user_ns;
+ struct pid_namespace *pid_ns;
+ struct task_struct *tsk;
struct ns_common *ns = get_proc_ns(file_inode(filp));
uid_t __user *argp;
uid_t uid;
+ int ret;
switch (ioctl) {
case NS_GET_USERNS:
@@ -157,9 +161,56 @@ static long ns_ioctl(struct file *filp, unsigned int ioctl,
id = mnt_ns->seq;
return put_user(id, idp);
}
+ case NS_GET_PID_FROM_PIDNS:
+ fallthrough;
+ case NS_GET_TGID_FROM_PIDNS:
+ fallthrough;
+ case NS_GET_PID_IN_PIDNS:
+ fallthrough;
+ case NS_GET_TGID_IN_PIDNS:
+ if (ns->ops->type != CLONE_NEWPID)
+ return -EINVAL;
+
+ ret = -ESRCH;
+ pid_ns = container_of(ns, struct pid_namespace, ns);
+
+ rcu_read_lock();
+
+ if (ioctl == NS_GET_PID_IN_PIDNS ||
+ ioctl == NS_GET_TGID_IN_PIDNS)
+ tsk = find_task_by_vpid(arg);
+ else
+ tsk = find_task_by_pid_ns(arg, pid_ns);
+ if (!tsk)
+ break;
+
+ switch (ioctl) {
+ case NS_GET_PID_FROM_PIDNS:
+ ret = task_pid_vnr(tsk);
+ break;
+ case NS_GET_TGID_FROM_PIDNS:
+ ret = task_tgid_vnr(tsk);
+ break;
+ case NS_GET_PID_IN_PIDNS:
+ ret = task_pid_nr_ns(tsk, pid_ns);
+ break;
+ case NS_GET_TGID_IN_PIDNS:
+ ret = task_tgid_nr_ns(tsk, pid_ns);
+ break;
+ default:
+ ret = 0;
+ break;
+ }
+ rcu_read_unlock();
+
+ if (!ret)
+ ret = -ESRCH;
+ break;
default:
- return -ENOTTY;
+ ret = -ENOTTY;
}
+
+ return ret;
}
int ns_get_name(char *buf, size_t size, struct task_struct *task,