summaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2020-01-30 06:38:34 +0300
committerLinus Torvalds <torvalds@linux-foundation.org>2020-01-30 06:38:34 +0300
commit83fa805bcbfc53ae82eedd65132794ae324798e5 (patch)
treeff4b2ba048bb5f14194110aedb09c85aab159d4a /include
parent896f8d23d0cb5889021d66eab6107e97109c5459 (diff)
parent8d19f1c8e1937baf74e1962aae9f90fa3aeab463 (diff)
downloadlinux-83fa805bcbfc53ae82eedd65132794ae324798e5.tar.xz
Merge tag 'threads-v5.6' of git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux
Pull thread management updates from Christian Brauner: "Sargun Dhillon over the last cycle has worked on the pidfd_getfd() syscall. This syscall allows for the retrieval of file descriptors of a process based on its pidfd. A task needs to have ptrace_may_access() permissions with PTRACE_MODE_ATTACH_REALCREDS (suggested by Oleg and Andy) on the target. One of the main use-cases is in combination with seccomp's user notification feature. As a reminder, seccomp's user notification feature was made available in v5.0. It allows a task to retrieve a file descriptor for its seccomp filter. The file descriptor is usually handed of to a more privileged supervising process. The supervisor can then listen for syscall events caught by the seccomp filter of the supervisee and perform actions in lieu of the supervisee, usually emulating syscalls. pidfd_getfd() is needed to expand its uses. There are currently two major users that wait on pidfd_getfd() and one future user: - Netflix, Sargun said, is working on a service mesh where users should be able to connect to a dns-based VIP. When a user connects to e.g. 1.2.3.4:80 that runs e.g. service "foo" they will be redirected to an envoy process. This service mesh uses seccomp user notifications and pidfd to intercept all connect calls and instead of connecting them to 1.2.3.4:80 connects them to e.g. 127.0.0.1:8080. - LXD uses the seccomp notifier heavily to intercept and emulate mknod() and mount() syscalls for unprivileged containers/processes. With pidfd_getfd() more uses-cases e.g. bridging socket connections will be possible. - The patchset has also seen some interest from the browser corner. Right now, Firefox is using a SECCOMP_RET_TRAP sandbox managed by a broker process. In the future glibc will start blocking all signals during dlopen() rendering this type of sandbox impossible. Hence, in the future Firefox will switch to a seccomp-user-nofication based sandbox which also makes use of file descriptor retrieval. The thread for this can be found at https://sourceware.org/ml/libc-alpha/2019-12/msg00079.html With pidfd_getfd() it is e.g. possible to bridge socket connections for the supervisee (binding to a privileged port) and taking actions on file descriptors on behalf of the supervisee in general. Sargun's first version was using an ioctl on pidfds but various people pushed for it to be a proper syscall which he duely implemented as well over various review cycles. Selftests are of course included. I've also added instructions how to deal with merge conflicts below. There's also a small fix coming from the kernel mentee project to correctly annotate struct sighand_struct with __rcu to fix various sparse warnings. We've received a few more such fixes and even though they are mostly trivial I've decided to postpone them until after -rc1 since they came in rather late and I don't want to risk introducing build warnings. Finally, there's a new prctl() command PR_{G,S}ET_IO_FLUSHER which is needed to avoid allocation recursions triggerable by storage drivers that have userspace parts that run in the IO path (e.g. dm-multipath, iscsi, etc). These allocation recursions deadlock the device. The new prctl() allows such privileged userspace components to avoid allocation recursions by setting the PF_MEMALLOC_NOIO and PF_LESS_THROTTLE flags. The patch carries the necessary acks from the relevant maintainers and is routed here as part of prctl() thread-management." * tag 'threads-v5.6' of git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux: prctl: PR_{G,S}ET_IO_FLUSHER to support controlling memory reclaim sched.h: Annotate sighand_struct with __rcu test: Add test for pidfd getfd arch: wire up pidfd_getfd syscall pid: Implement pidfd_getfd syscall vfs, fdtable: Add fget_task helper
Diffstat (limited to 'include')
-rw-r--r--include/linux/file.h2
-rw-r--r--include/linux/sched.h2
-rw-r--r--include/linux/syscalls.h1
-rw-r--r--include/uapi/asm-generic/unistd.h4
-rw-r--r--include/uapi/linux/capability.h1
-rw-r--r--include/uapi/linux/prctl.h4
6 files changed, 12 insertions, 2 deletions
diff --git a/include/linux/file.h b/include/linux/file.h
index 3fcddff56bc4..c6c7b24ea9f7 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -16,6 +16,7 @@ extern void fput(struct file *);
extern void fput_many(struct file *, unsigned int);
struct file_operations;
+struct task_struct;
struct vfsmount;
struct dentry;
struct inode;
@@ -47,6 +48,7 @@ static inline void fdput(struct fd fd)
extern struct file *fget(unsigned int fd);
extern struct file *fget_many(unsigned int fd, unsigned int refs);
extern struct file *fget_raw(unsigned int fd);
+extern struct file *fget_task(struct task_struct *task, unsigned int fd);
extern unsigned long __fdget(unsigned int fd);
extern unsigned long __fdget_raw(unsigned int fd);
extern unsigned long __fdget_pos(unsigned int fd);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 716ad1d8d95e..04278493bf15 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -917,7 +917,7 @@ struct task_struct {
/* Signal handlers: */
struct signal_struct *signal;
- struct sighand_struct *sighand;
+ struct sighand_struct __rcu *sighand;
sigset_t blocked;
sigset_t real_blocked;
/* Restored if set_restore_sigmask() was used: */
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index ac3a663137b6..1815065d52f3 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -1002,6 +1002,7 @@ asmlinkage long sys_fspick(int dfd, const char __user *path, unsigned int flags)
asmlinkage long sys_pidfd_send_signal(int pidfd, int sig,
siginfo_t __user *info,
unsigned int flags);
+asmlinkage long sys_pidfd_getfd(int pidfd, int fd, unsigned int flags);
/*
* Architecture-specific system calls
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index d4122c091472..3a3201e4618e 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -853,9 +853,11 @@ __SYSCALL(__NR_clone3, sys_clone3)
#define __NR_openat2 437
__SYSCALL(__NR_openat2, sys_openat2)
+#define __NR_pidfd_getfd 438
+__SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd)
#undef __NR_syscalls
-#define __NR_syscalls 438
+#define __NR_syscalls 439
/*
* 32 bit systems traditionally used different
diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h
index 240fdb9a60f6..272dc69fa080 100644
--- a/include/uapi/linux/capability.h
+++ b/include/uapi/linux/capability.h
@@ -301,6 +301,7 @@ struct vfs_ns_cap_data {
/* Allow more than 64hz interrupts from the real-time clock */
/* Override max number of consoles on console allocation */
/* Override max number of keymaps */
+/* Control memory reclaim behavior */
#define CAP_SYS_RESOURCE 24
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 7da1b37b27aa..07b4f8131e36 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -234,4 +234,8 @@ struct prctl_mm_map {
#define PR_GET_TAGGED_ADDR_CTRL 56
# define PR_TAGGED_ADDR_ENABLE (1UL << 0)
+/* Control reclaim behavior when allocating memory */
+#define PR_SET_IO_FLUSHER 57
+#define PR_GET_IO_FLUSHER 58
+
#endif /* _LINUX_PRCTL_H */