Merge tag 'io_uring-bpf-restrictions.4-20260206' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux

Pull io_uring bpf filters from Jens Axboe: "This adds support for cBPF filters for io_uring, as well as task-inherited restrictions and filters. seccomp and io_uring don't play along nicely, as most of the interesting data to filter on resides somewhat out-of-band, in the submission queue ring. As a result, things like containers and systemd that apply seccomp filters can't filter io_uring operations. That leaves them with just one choice if filtering is critical - filter the actual io_uring_setup(2) system call to simply disallow io_uring. That's rather unfortunate, and it has limited us. io_uring already has some filtering support. It requires the ring to be set up in a disabled state, and then a filter set can be applied. This filter set is completely bi-modal - an opcode is either enabled or it's not. Once a filter set is registered, the ring can be enabled. This is very restrictive, and it's not useful at all to systemd or containers, which really want both broader and more specific control. This first adds support for cBPF filters for opcodes, which enables tighter control over what exactly a specific opcode may do. As examples, specific support is added for IORING_OP_OPENAT/OPENAT2, allowing filtering on resolve flags. And another example is added for IORING_OP_SOCKET, allowing filtering on domain/type/protocol. These are both common use cases. cBPF was chosen rather than eBPF, because the latter is often restricted in containers as well. These filters are run post the init phase of the request, which allows filters to even dip into data that is being passed in a struct in user memory, as the init side of requests makes that data stable by bringing it into the kernel. This allows filtering without needing to copy this data twice, or having filters etc. know about the exact layout of the user data. The filters get the already copied and sanitized data passed.
On top of that, support is added for per-task filters, meaning that any ring created by a task that has a per-task filter will get those filters applied when it's created. These filters are inherited across fork as well. Once a filter has been registered, any further added filters may only further restrict what operations are permitted. Filters cannot change the return value of an operation, they can only permit or deny it based on the contents." * tag 'io_uring-bpf-restrictions.4-20260206' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux: io_uring: allow registration of per-task restrictions io_uring: add task fork hook io_uring/bpf_filter: add ref counts to struct io_bpf_filter io_uring/bpf_filter: cache lookup table in ctx->bpf_filters io_uring/bpf_filter: allow filtering on contents of struct open_how io_uring/net: allow filtering on IORING_OP_SOCKET data io_uring: add support for BPF filtering for opcode restrictions
author: Linus Torvalds <torvalds@linux-foundation.org> 2026-02-10 04:31:17 +0300
committer: Linus Torvalds <torvalds@linux-foundation.org> 2026-02-10 04:31:17 +0300
commit: 591beb0e3a03258ef9c01893a5209845799a7c33 (patch)
tree: 3f4289e15f07689f89e4777cd93eb1b49e289ea5 /include/linux
parent: f5d4feed174ce9fb3c42886a3c36038fd5a43e25 (diff)
parent: ed82f35b926b2e505c14b7006473614b8f58b4f4 (diff)
download: linux-591beb0e3a03258ef9c01893a5209845799a7c33.tar.xz
3 files changed, 27 insertions, 1 deletions
diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index 85fe4e6b275c..d1aa4edfc2a5 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -12,6 +12,7 @@ void __io_uring_free(struct task_struct *tsk);
 void io_uring_unreg_ringfd(void);
 const char *io_uring_get_opcode(u8 opcode);
 bool io_is_uring_fops(struct file *file);
+int __io_uring_fork(struct task_struct *tsk);
 
 static inline void io_uring_files_cancel(void)
 {
@@ -25,9 +26,16 @@ static inline void io_uring_task_cancel(void)
 }
 static inline void io_uring_free(struct task_struct *tsk)
 {
-	if (tsk->io_uring)
+	if (tsk->io_uring || tsk->io_uring_restrict)
 		__io_uring_free(tsk);
 }
+static inline int io_uring_fork(struct task_struct *tsk)
+{
+	if (tsk->io_uring_restrict)
+		return __io_uring_fork(tsk);
+
+	return 0;
+}
 #else
 static inline void io_uring_task_cancel(void)
 {
@@ -46,6 +54,10 @@ static inline bool io_is_uring_fops(struct file *file)
 {
 	return false;
 }
+static inline int io_uring_fork(struct task_struct *tsk)
+{
+	return 0;
+}
 #endif
 
 #endif
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 544f78e3ca32..3e4a82a6f817 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -219,9 +219,20 @@ struct io_rings {
 	struct io_uring_cqe	cqes[] ____cacheline_aligned_in_smp;
 };
 
+struct io_bpf_filter;
+struct io_bpf_filters {
+	refcount_t refs;	/* ref for ->bpf_filters */
+	spinlock_t lock;	/* protects ->bpf_filters modifications */
+	struct io_bpf_filter __rcu **filters;
+	struct rcu_head rcu_head;
+};
+
 struct io_restriction {
 	DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
 	DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
+	struct io_bpf_filters *bpf_filters;
+	/* ->bpf_filters needs COW on modification */
+	bool bpf_filters_cow;
 	u8 sqe_flags_allowed;
 	u8 sqe_flags_required;
 	/* IORING_OP_* restrictions exist */
@@ -278,6 +289,8 @@ struct io_ring_ctx {
 
 		struct task_struct	*submitter_task;
 		struct io_rings		*rings;
+		/* cache of ->restrictions.bpf_filters->filters */
+		struct io_bpf_filter __rcu	**bpf_filters;
 		struct percpu_ref	refs;
 
 		clockid_t		clockid;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e4d1f4481866..0ef3325a39eb 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1186,6 +1186,7 @@ struct task_struct {
 
 #ifdef CONFIG_IO_URING
 	struct io_uring_task		*io_uring;
+	struct io_restriction		*io_uring_restrict;
 #endif
 
 	/* Namespaces: */
author	Linus Torvalds <torvalds@linux-foundation.org>	2026-02-10 04:31:17 +0300
committer	Linus Torvalds <torvalds@linux-foundation.org>	2026-02-10 04:31:17 +0300
commit	591beb0e3a03258ef9c01893a5209845799a7c33 (patch)
tree	3f4289e15f07689f89e4777cd93eb1b49e289ea5 /include/linux
parent	f5d4feed174ce9fb3c42886a3c36038fd5a43e25 (diff)
parent	ed82f35b926b2e505c14b7006473614b8f58b4f4 (diff)
download	linux-591beb0e3a03258ef9c01893a5209845799a7c33.tar.xz