From f1a424e21c15993db0f9594cda17ef5d516ab3e9 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Sat, 14 Mar 2026 08:41:04 -0600
Subject: io_uring: switch struct io_ring_ctx internal bitfields to flags

Bitfields cannot be set and checked atomically, and this change makes it
clearer that these are indeed in shared storage and must be checked and
set in a sane fashion. This is in preparation for annotating a few of the
known racy, but harmless, flag checks.

No intended functional changes in this patch.

Reviewed-by: Gabriel Krisman Bertazi
Signed-off-by: Jens Axboe
---
 include/linux/io_uring_types.h | 34 ++++++++++++++++++++--------------
 1 file changed, 20 insertions(+), 14 deletions(-)

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index dd1420bfcb73..0b3f08adc217 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -268,24 +268,30 @@ struct io_alloc_cache {
 	unsigned int		init_clear;
 };
 
+enum {
+	IO_RING_F_DRAIN_NEXT		= BIT(0),
+	IO_RING_F_OP_RESTRICTED		= BIT(1),
+	IO_RING_F_REG_RESTRICTED	= BIT(2),
+	IO_RING_F_OFF_TIMEOUT_USED	= BIT(3),
+	IO_RING_F_DRAIN_ACTIVE		= BIT(4),
+	IO_RING_F_HAS_EVFD		= BIT(5),
+	/* all CQEs should be posted only by the submitter task */
+	IO_RING_F_TASK_COMPLETE		= BIT(6),
+	IO_RING_F_LOCKLESS_CQ		= BIT(7),
+	IO_RING_F_SYSCALL_IOPOLL	= BIT(8),
+	IO_RING_F_POLL_ACTIVATED	= BIT(9),
+	IO_RING_F_DRAIN_DISABLED	= BIT(10),
+	IO_RING_F_COMPAT		= BIT(11),
+	IO_RING_F_IOWQ_LIMITS_SET	= BIT(12),
+};
+
 struct io_ring_ctx {
 	/* const or read-mostly hot data */
 	struct {
+		/* ring setup flags */
 		unsigned int		flags;
-		unsigned int		drain_next: 1;
-		unsigned int		op_restricted: 1;
-		unsigned int		reg_restricted: 1;
-		unsigned int		off_timeout_used: 1;
-		unsigned int		drain_active: 1;
-		unsigned int		has_evfd: 1;
-		/* all CQEs should be posted only by the submitter task */
-		unsigned int		task_complete: 1;
-		unsigned int		lockless_cq: 1;
-		unsigned int		syscall_iopoll: 1;
-		unsigned int		poll_activated: 1;
-		unsigned int		drain_disabled: 1;
-		unsigned int		compat: 1;
-		unsigned int		iowq_limits_set : 1;
+		/* internal IO_RING_F_* state flags, mostly read-only */
+		unsigned int		int_flags;
 
 		struct task_struct	*submitter_task;
 		struct io_rings		*rings;
--
cgit v1.2.3

From 9165dc4fa969b64c2d4396ee4e1546a719978dd1 Mon Sep 17 00:00:00 2001
From: Caleb Sander Mateos
Date: Mon, 2 Mar 2026 10:29:10 -0700
Subject: io_uring: add REQ_F_IOPOLL

A subsequent commit will allow uring_cmds to files that don't implement
->uring_cmd_iopoll() to be issued to IORING_SETUP_IOPOLL io_urings. This
means the ctx's IORING_SETUP_IOPOLL flag isn't sufficient to determine
whether a given request needs to be iopolled.

Introduce a request flag REQ_F_IOPOLL, set in ->issue() if a request
needs to be iopolled to completion. Set the flag in io_rw_init_file() and
io_uring_cmd() for requests issued to IORING_SETUP_IOPOLL ctxs. Use the
request flag instead of IORING_SETUP_IOPOLL in places dealing with a
specific request.

A future possibility would be to add an option to enable/disable iopoll
in the io_uring SQE instead of determining it from IORING_SETUP_IOPOLL.
Signed-off-by: Caleb Sander Mateos
Reviewed-by: Kanchan Joshi
Reviewed-by: Anuj Gupta
Link: https://patch.msgid.link/20260302172914.2488599-2-csander@purestorage.com
Signed-off-by: Jens Axboe
---
 include/linux/io_uring_types.h |  3 +++
 io_uring/io_uring.c            |  9 ++++-----
 io_uring/rw.c                  | 11 ++++++-----
 io_uring/uring_cmd.c           |  5 +++--
 4 files changed, 16 insertions(+), 12 deletions(-)

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 0b3f08adc217..4dbd7083dd54 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -550,6 +550,7 @@ enum {
 	REQ_F_HAS_METADATA_BIT,
 	REQ_F_IMPORT_BUFFER_BIT,
 	REQ_F_SQE_COPIED_BIT,
+	REQ_F_IOPOLL_BIT,
 
 	/* not a real bit, just to check we're not overflowing the space */
 	__REQ_F_LAST_BIT,
@@ -641,6 +642,8 @@ enum {
 	REQ_F_IMPORT_BUFFER	= IO_REQ_FLAG(REQ_F_IMPORT_BUFFER_BIT),
 	/* ->sqe_copy() has been called, if necessary */
 	REQ_F_SQE_COPIED	= IO_REQ_FLAG(REQ_F_SQE_COPIED_BIT),
+	/* request must be iopolled to completion (set in ->issue()) */
+	REQ_F_IOPOLL		= IO_REQ_FLAG(REQ_F_IOPOLL_BIT),
 };
 
 struct io_tw_req {
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index fb5a263706be..a610eaa5fd7c 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -356,7 +356,6 @@ static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req)
 static void io_prep_async_work(struct io_kiocb *req)
 {
 	const struct io_issue_def *def = &io_issue_defs[req->opcode];
-	struct io_ring_ctx *ctx = req->ctx;
 
 	if (!(req->flags & REQ_F_CREDS)) {
 		req->flags |= REQ_F_CREDS;
@@ -378,7 +377,7 @@ static void io_prep_async_work(struct io_kiocb *req)
 		if (should_hash && (req->file->f_flags & O_DIRECT) &&
 		    (req->file->f_op->fop_flags & FOP_DIO_PARALLEL_WRITE))
 			should_hash = false;
-		if (should_hash || (ctx->flags & IORING_SETUP_IOPOLL))
+		if (should_hash || (req->flags & REQ_F_IOPOLL))
 			io_wq_hash_work(&req->work, file_inode(req->file));
 	} else if (!req->file || !S_ISBLK(file_inode(req->file)->i_mode)) {
 		if (def->unbound_nonreg_file)
@@ -1419,7 +1418,7 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
 		ret = 0;
 
 		/* If the op doesn't have a file, we're not polling for it */
-		if ((req->ctx->flags & IORING_SETUP_IOPOLL) && def->iopoll_queue)
+		if ((req->flags & REQ_F_IOPOLL) && def->iopoll_queue)
 			io_iopoll_req_issued(req, issue_flags);
 	}
 	return ret;
@@ -1435,7 +1434,7 @@ int io_poll_issue(struct io_kiocb *req, io_tw_token_t tw)
 
 	io_tw_lock(req->ctx, tw);
 	WARN_ON_ONCE(!req->file);
-	if (WARN_ON_ONCE(req->ctx->flags & IORING_SETUP_IOPOLL))
+	if (WARN_ON_ONCE(req->flags & REQ_F_IOPOLL))
 		return -EFAULT;
 
 	ret = __io_issue_sqe(req, issue_flags, &io_issue_defs[req->opcode]);
@@ -1533,7 +1532,7 @@ fail:
 		 * wait for request slots on the block side.
 		 */
 		if (!needs_poll) {
-			if (!(req->ctx->flags & IORING_SETUP_IOPOLL))
+			if (!(req->flags & REQ_F_IOPOLL))
 				break;
 			if (io_wq_worker_stopped())
 				break;
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 1a5f262734e8..3bdb9914e673 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -504,7 +504,7 @@ static bool io_rw_should_reissue(struct io_kiocb *req)
 	if (!S_ISBLK(mode) && !S_ISREG(mode))
 		return false;
 	if ((req->flags & REQ_F_NOWAIT) || (io_wq_current_is_worker() &&
-	    !(ctx->flags & IORING_SETUP_IOPOLL)))
+	    !(req->flags & REQ_F_IOPOLL)))
 		return false;
 	/*
 	 * If ref is dying, we might be running poll reap from the exit work.
@@ -640,7 +640,7 @@ static inline void io_rw_done(struct io_kiocb *req, ssize_t ret)
 		}
 	}
 
-	if (req->ctx->flags & IORING_SETUP_IOPOLL)
+	if (req->flags & REQ_F_IOPOLL)
 		io_complete_rw_iopoll(&rw->kiocb, ret);
 	else
 		io_complete_rw(&rw->kiocb, ret);
@@ -654,7 +654,7 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret,
 	if (ret >= 0 && req->flags & REQ_F_CUR_POS)
 		req->file->f_pos = rw->kiocb.ki_pos;
 
-	if (ret >= 0 && !(req->ctx->flags & IORING_SETUP_IOPOLL)) {
+	if (ret >= 0 && !(req->flags & REQ_F_IOPOLL)) {
 		u32 cflags = 0;
 
 		__io_complete_rw_common(req, ret);
@@ -876,6 +876,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type)
 	if (ctx->flags & IORING_SETUP_IOPOLL) {
 		if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll)
 			return -EOPNOTSUPP;
+		req->flags |= REQ_F_IOPOLL;
 		kiocb->private = NULL;
 		kiocb->ki_flags |= IOCB_HIPRI;
 		req->iopoll_completed = 0;
@@ -963,7 +964,7 @@ static int __io_read(struct io_kiocb *req, struct io_br_sel *sel,
 		if (io_file_can_poll(req))
 			return -EAGAIN;
 		/* IOPOLL retry should happen for io-wq threads */
-		if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL))
+		if (!force_nonblock && !(req->flags & REQ_F_IOPOLL))
 			goto done;
 		/* no retry on NONBLOCK nor RWF_NOWAIT */
 		if (req->flags & REQ_F_NOWAIT)
@@ -1188,7 +1189,7 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags)
 			goto done;
 		if (!force_nonblock || ret2 != -EAGAIN) {
 			/* IOPOLL retry should happen for io-wq threads */
-			if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL))
+			if (ret2 == -EAGAIN && (req->flags & REQ_F_IOPOLL))
 				goto ret_eagain;
 
 			if (ret2 != req->cqe.res && ret2 >= 0 && need_complete_io(req)) {
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index ee7b49f47cb5..b651c63f6e20 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -110,7 +110,7 @@ void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
 	 * because iopoll completion data overlaps with the hash_node used
 	 * for tracking.
 	 */
-	if (ctx->flags & IORING_SETUP_IOPOLL)
+	if (req->flags & REQ_F_IOPOLL)
 		return;
 
 	if (!(cmd->flags & IORING_URING_CMD_CANCELABLE)) {
@@ -167,7 +167,7 @@ void __io_uring_cmd_done(struct io_uring_cmd *ioucmd, s32 ret, u64 res2,
 		io_req_set_cqe32_extra(req, res2, 0);
 	}
 	io_req_uring_cleanup(req, issue_flags);
-	if (req->ctx->flags & IORING_SETUP_IOPOLL) {
+	if (req->flags & REQ_F_IOPOLL) {
 		/* order with io_iopoll_req_issued() checking ->iopoll_complete */
 		smp_store_release(&req->iopoll_completed, 1);
 	} else if (issue_flags & IO_URING_F_COMPLETE_DEFER) {
@@ -260,6 +260,7 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags)
 	if (ctx->flags & IORING_SETUP_IOPOLL) {
 		if (!file->f_op->uring_cmd_iopoll)
 			return -EOPNOTSUPP;
+		req->flags |= REQ_F_IOPOLL;
 		issue_flags |= IO_URING_F_IOPOLL;
 		req->iopoll_completed = 0;
 		if (ctx->flags & IORING_SETUP_HYBRID_IOPOLL) {
--
cgit v1.2.3

From 033af2b3eb19c5ed96825572105bca3611635ada Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Thu, 26 Feb 2026 12:48:38 +0000
Subject: io_uring: introduce callback driven main loop

io_uring_enter() has a fixed order of execution: it submits requests,
waits for completions, and returns to the user. Optionally allow
replacing it with a custom loop driven by a callback called loop_step.
The basic requirement for the callback is that it should be able to
submit requests, wait for completions, parse them, and repeat. Most of
the communication, including parameter passing, can be implemented via
shared memory.
The callback should return IOU_LOOP_CONTINUE to continue execution or
IOU_LOOP_STOP to return to user space. Note that the kernel may decide to
terminate it prematurely as well, e.g. if the process was signalled or
killed.

The hook takes a structure with parameters. It can be used to ask the
kernel to wait for CQEs by setting cq_wait_idx to the CQE index the
callback wants to wait for. Spurious wake ups are possible and even
likely; the callback is expected to handle them. More parameters, such as
a timeout, will be added in the future.

It can be used with kernel callbacks, for example as a slow path
deprecation mechanism overwriting SQEs and emulating the wanted
behaviour; however, it's more useful together with the BPF programs
implemented in the following patches.

Note that keeping it separate from the normal io_uring wait loop makes
things much simpler and cleaner: the logic stays in one place instead of
spreading a bunch of checks across different places, including disabling
the submission path. It holds the lock by default, which is a better fit
for BPF synchronisation and the loop execution model. It nicely avoids
existing quirks like forced wake ups on timeout request completion. And
it should make new features easier to implement.

Signed-off-by: Pavel Begunkov
Link: https://patch.msgid.link/a2d369aa1c9dd23ad7edac9220cffc563abcaed6.1772109579.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 include/linux/io_uring_types.h |  5 +++
 io_uring/Makefile              |  2 +-
 io_uring/io_uring.c            | 11 +++++
 io_uring/loop.c                | 91 ++++++++++++++++++++++++++++++++++++++++++
 io_uring/loop.h                | 27 +++++++++++++
 io_uring/wait.h                |  1 +
 6 files changed, 136 insertions(+), 1 deletion(-)
 create mode 100644 io_uring/loop.c
 create mode 100644 io_uring/loop.h

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 4dbd7083dd54..344b634b8989 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -41,6 +41,8 @@ enum io_uring_cmd_flags {
 	IO_URING_F_COMPAT		= (1 << 12),
 };
 
+struct iou_loop_params;
+
 struct io_wq_work_node {
 	struct io_wq_work_node *next;
 };
@@ -361,6 +363,9 @@ struct io_ring_ctx {
 	struct io_alloc_cache	rw_cache;
 	struct io_alloc_cache	cmd_cache;
 
+	int (*loop_step)(struct io_ring_ctx *ctx,
+			 struct iou_loop_params *);
+
 	/*
 	 * Any cancelable uring_cmd is added to this list in
 	 * ->uring_cmd() by io_uring_cmd_insert_cancelable()
diff --git a/io_uring/Makefile b/io_uring/Makefile
index 931f9156132a..1c1f47de32a4 100644
--- a/io_uring/Makefile
+++ b/io_uring/Makefile
@@ -14,7 +14,7 @@ obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \
 					advise.o openclose.o statx.o timeout.o \
 					cancel.o waitid.o register.o \
 					truncate.o memmap.o alloc_cache.o \
-					query.o
+					query.o loop.o
 
 obj-$(CONFIG_IO_URING_ZCRX) += zcrx.o
 obj-$(CONFIG_IO_WQ) += io-wq.o
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 74cd62b44d94..960d36c49ffe 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -95,6 +95,7 @@
 #include "eventfd.h"
 #include "wait.h"
 #include "bpf_filter.h"
+#include "loop.h"
 
 #define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \
 			  IOSQE_IO_HARDLINK | IOSQE_ASYNC)
@@ -588,6 +589,11 @@ void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx)
 	mutex_unlock(&ctx->uring_lock);
 }
 
+void io_cqring_overflow_flush_locked(struct io_ring_ctx *ctx)
+{
+	__io_cqring_overflow_flush(ctx, false);
+}
+
 /* must to be called somewhat shortly after putting a request */
 static inline void io_put_task(struct io_kiocb *req)
 {
@@ -2571,6 +2577,11 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 	if (unlikely(smp_load_acquire(&ctx->flags) & IORING_SETUP_R_DISABLED))
 		goto out;
 
+	if (io_has_loop_ops(ctx)) {
+		ret = io_run_loop(ctx);
+		goto out;
+	}
+
 	/*
 	 * For SQ polling, the thread will do all submissions and completions.
 	 * Just return the requested submit count, and wake the thread if
diff --git a/io_uring/loop.c b/io_uring/loop.c
new file mode 100644
index 000000000000..31843cc3e451
--- /dev/null
+++ b/io_uring/loop.c
@@ -0,0 +1,91 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include "io_uring.h"
+#include "wait.h"
+#include "loop.h"
+
+static inline int io_loop_nr_cqes(const struct io_ring_ctx *ctx,
+				  const struct iou_loop_params *lp)
+{
+	return lp->cq_wait_idx - READ_ONCE(ctx->rings->cq.tail);
+}
+
+static inline void io_loop_wait_start(struct io_ring_ctx *ctx, unsigned nr_wait)
+{
+	atomic_set(&ctx->cq_wait_nr, nr_wait);
+	set_current_state(TASK_INTERRUPTIBLE);
+}
+
+static inline void io_loop_wait_finish(struct io_ring_ctx *ctx)
+{
+	__set_current_state(TASK_RUNNING);
+	atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT);
+}
+
+static void io_loop_wait(struct io_ring_ctx *ctx, struct iou_loop_params *lp,
+			 unsigned nr_wait)
+{
+	io_loop_wait_start(ctx, nr_wait);
+
+	if (unlikely(io_local_work_pending(ctx) ||
+		     io_loop_nr_cqes(ctx, lp) <= 0) ||
+	    READ_ONCE(ctx->check_cq)) {
+		io_loop_wait_finish(ctx);
+		return;
+	}
+
+	mutex_unlock(&ctx->uring_lock);
+	schedule();
+	io_loop_wait_finish(ctx);
+	mutex_lock(&ctx->uring_lock);
+}
+
+static int __io_run_loop(struct io_ring_ctx *ctx)
+{
+	struct iou_loop_params lp = {};
+
+	while (true) {
+		int nr_wait, step_res;
+
+		if (unlikely(!ctx->loop_step))
+			return -EFAULT;
+
+		step_res = ctx->loop_step(ctx, &lp);
+		if (step_res == IOU_LOOP_STOP)
+			break;
+		if (step_res != IOU_LOOP_CONTINUE)
+			return -EINVAL;
+
+		nr_wait = io_loop_nr_cqes(ctx, &lp);
+		if (nr_wait > 0)
+			io_loop_wait(ctx, &lp, nr_wait);
+		else
+			nr_wait = 0;
+
+		if (task_work_pending(current)) {
+			mutex_unlock(&ctx->uring_lock);
+			io_run_task_work();
+			mutex_lock(&ctx->uring_lock);
+		}
+		if (unlikely(task_sigpending(current)))
+			return -EINTR;
+		io_run_local_work_locked(ctx, nr_wait);
+
+		if (READ_ONCE(ctx->check_cq) & BIT(IO_CHECK_CQ_OVERFLOW_BIT))
+			io_cqring_overflow_flush_locked(ctx);
+	}
+
+	return 0;
+}
+
+int io_run_loop(struct io_ring_ctx *ctx)
+{
+	int ret;
+
+	if (!io_allowed_run_tw(ctx))
+		return -EEXIST;
+
+	mutex_lock(&ctx->uring_lock);
+	ret = __io_run_loop(ctx);
+	mutex_unlock(&ctx->uring_lock);
+	return ret;
+}
diff --git a/io_uring/loop.h b/io_uring/loop.h
new file mode 100644
index 000000000000..d7718b9ce61e
--- /dev/null
+++ b/io_uring/loop.h
@@ -0,0 +1,27 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef IOU_LOOP_H
+#define IOU_LOOP_H
+
+#include
+
+struct iou_loop_params {
+	/*
+	 * The CQE index to wait for. Only serves as a hint and can still be
+	 * woken up earlier.
+	 */
+	__u32 cq_wait_idx;
+};
+
+enum {
+	IOU_LOOP_CONTINUE	= 0,
+	IOU_LOOP_STOP,
+};
+
+static inline bool io_has_loop_ops(struct io_ring_ctx *ctx)
+{
+	return data_race(ctx->loop_step);
+}
+
+int io_run_loop(struct io_ring_ctx *ctx);
+
+#endif
diff --git a/io_uring/wait.h b/io_uring/wait.h
index 5e236f74e1af..037e512dd80c 100644
--- a/io_uring/wait.h
+++ b/io_uring/wait.h
@@ -25,6 +25,7 @@ int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
 		   struct ext_arg *ext_arg);
 int io_run_task_work_sig(struct io_ring_ctx *ctx);
 void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx);
+void io_cqring_overflow_flush_locked(struct io_ring_ctx *ctx);
 
 static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
 {
--
cgit v1.2.3

From 98f37634b12b17ad5c56db8fb63cf9d7dc55d74c Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Thu, 26 Feb 2026 12:48:41 +0000
Subject: io_uring/bpf-ops: implement bpf ops registration

Implement BPF struct ops registration. It's registered off the BPF path,
and can be removed by BPF as well as by io_uring. To protect it,
introduce a global lock synchronising registration. ctx->uring_lock can
be nested under it. ctx->bpf_ops is write protected by both locks and so
it's safe to read it under either of them.

Signed-off-by: Pavel Begunkov
Link: https://patch.msgid.link/1f46bffd76008de49cbafa2ad77d348810a4f69e.1772109579.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 include/linux/io_uring_types.h |  5 +++
 io_uring/bpf-ops.c             | 92 +++++++++++++++++++++++++++++++++++++++++-
 io_uring/bpf-ops.h             |  8 ++++
 io_uring/io_uring.c            |  1 +
 4 files changed, 104 insertions(+), 2 deletions(-)

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 344b634b8989..28e5dbdac55b 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -8,6 +8,9 @@
 #include
 #include
 
+struct iou_loop_params;
+struct io_uring_bpf_ops;
+
 enum {
 	/*
 	 * A hint to not wake right away but delay until there are enough of
@@ -488,6 +491,8 @@ struct io_ring_ctx {
 	DECLARE_HASHTABLE(napi_ht, 4);
 #endif
 
+	struct io_uring_bpf_ops *bpf_ops;
+
 	/*
 	 * Protection for resize vs mmap races - both the mmap and resize
 	 * side will need to grab this lock, to prevent either side from
diff --git a/io_uring/bpf-ops.c b/io_uring/bpf-ops.c
index 17518f4ecca9..e4b244337aa9 100644
--- a/io_uring/bpf-ops.c
+++ b/io_uring/bpf-ops.c
@@ -5,10 +5,11 @@
 
 #include "io_uring.h"
 #include "register.h"
+#include "loop.h"
 #include "memmap.h"
 #include "bpf-ops.h"
-#include "loop.h"
 
+static DEFINE_MUTEX(io_bpf_ctrl_mutex);
 static const struct btf_type *loop_params_type;
 
 __bpf_kfunc_start_defs();
@@ -143,16 +144,103 @@ static int bpf_io_init_member(const struct btf_type *t,
 				     const struct btf_member *member,
 				     void *kdata, const void *udata)
 {
+	u32 moff = __btf_member_bit_offset(t, member) / 8;
+	const struct io_uring_bpf_ops *uops = udata;
+	struct io_uring_bpf_ops *ops = kdata;
+
+	switch (moff) {
+	case offsetof(struct io_uring_bpf_ops, ring_fd):
+		ops->ring_fd = uops->ring_fd;
+		return 1;
+	}
+	return 0;
+}
+
+static int io_install_bpf(struct io_ring_ctx *ctx, struct io_uring_bpf_ops *ops)
+{
+	if (ctx->flags & (IORING_SETUP_SQPOLL | IORING_SETUP_IOPOLL))
+		return -EOPNOTSUPP;
+	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
+		return -EOPNOTSUPP;
+
+	if (ctx->bpf_ops)
+		return -EBUSY;
+	if (WARN_ON_ONCE(!ops->loop_step))
+		return -EINVAL;
+
+	ops->priv = ctx;
+	ctx->bpf_ops = ops;
+	ctx->loop_step = ops->loop_step;
 	return 0;
 }
 
 static int bpf_io_reg(void *kdata, struct bpf_link *link)
 {
-	return -EOPNOTSUPP;
+	struct io_uring_bpf_ops *ops = kdata;
+	struct io_ring_ctx *ctx;
+	struct file *file;
+	int ret = -EBUSY;
+
+	file = io_uring_register_get_file(ops->ring_fd, false);
+	if (IS_ERR(file))
+		return PTR_ERR(file);
+	ctx = file->private_data;
+
+	scoped_guard(mutex, &io_bpf_ctrl_mutex) {
+		guard(mutex)(&ctx->uring_lock);
+		ret = io_install_bpf(ctx, ops);
+	}
+
+	fput(file);
+	return ret;
+}
+
+static void io_eject_bpf(struct io_ring_ctx *ctx)
+{
+	struct io_uring_bpf_ops *ops = ctx->bpf_ops;
+
+	if (WARN_ON_ONCE(!ops))
+		return;
+	if (WARN_ON_ONCE(ops->priv != ctx))
+		return;
+
+	ops->priv = NULL;
+	ctx->bpf_ops = NULL;
+	ctx->loop_step = NULL;
 }
 
 static void bpf_io_unreg(void *kdata, struct bpf_link *link)
 {
+	struct io_uring_bpf_ops *ops = kdata;
+	struct io_ring_ctx *ctx;
+
+	guard(mutex)(&io_bpf_ctrl_mutex);
+	ctx = ops->priv;
+	if (ctx) {
+		guard(mutex)(&ctx->uring_lock);
+		if (WARN_ON_ONCE(ctx->bpf_ops != ops))
+			return;
+
+		io_eject_bpf(ctx);
+	}
+}
+
+void io_unregister_bpf_ops(struct io_ring_ctx *ctx)
+{
+	/*
+	 * ->bpf_ops is write protected by io_bpf_ctrl_mutex and uring_lock,
+	 * and read protected by either. Try to avoid taking the global lock
+	 * for rings that never had any bpf installed.
+	 */
+	scoped_guard(mutex, &ctx->uring_lock) {
+		if (!ctx->bpf_ops)
+			return;
+	}
+
+	guard(mutex)(&io_bpf_ctrl_mutex);
+	guard(mutex)(&ctx->uring_lock);
+	if (ctx->bpf_ops)
+		io_eject_bpf(ctx);
 }
 
 static struct bpf_struct_ops bpf_ring_ops = {
diff --git a/io_uring/bpf-ops.h b/io_uring/bpf-ops.h
index b9e589ad519a..b39b3fd3acda 100644
--- a/io_uring/bpf-ops.h
+++ b/io_uring/bpf-ops.h
@@ -17,4 +17,12 @@ struct io_uring_bpf_ops {
 	void *priv;
 };
 
+#ifdef CONFIG_IO_URING_BPF_OPS
+void io_unregister_bpf_ops(struct io_ring_ctx *ctx);
+#else
+static inline void io_unregister_bpf_ops(struct io_ring_ctx *ctx)
+{
+}
+#endif
+
 #endif /* IOU_BPF_OPS_H */
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 0a80c8e6e633..d703f0a8b315 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2148,6 +2148,7 @@ static __cold void io_req_caches_free(struct io_ring_ctx *ctx)
 
 static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 {
+	io_unregister_bpf_ops(ctx);
 	io_sq_thread_finish(ctx);
 
 	mutex_lock(&ctx->uring_lock);
--
cgit v1.2.3
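
Editor's note: to make the interplay between the loop_step hook (callback driven
main loop patch) and the struct_ops registration above concrete, here is a
minimal sketch of what a BPF-side implementation might look like. Only the
io_uring_bpf_ops structure, its ring_fd/loop_step members, struct
iou_loop_params and the IOU_LOOP_* return codes come from the patches above;
the file name, section names, libbpf boilerplate and the trivial
stop-after-one-step policy are illustrative assumptions, not part of the series.

/* io_loop.bpf.c -- hypothetical example, not part of the series */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char LICENSE[] SEC("license") = "GPL";

/* mirror of the IOU_LOOP_* enum in io_uring/loop.h */
#define IOU_LOOP_CONTINUE	0
#define IOU_LOOP_STOP		1

SEC("struct_ops/loop_step")
int BPF_PROG(loop_step, struct io_ring_ctx *ctx, struct iou_loop_params *lp)
{
	/*
	 * A real program would submit SQEs, parse CQEs via shared memory and
	 * set lp->cq_wait_idx to the CQE index it wants the kernel to wait
	 * for before the next step. Returning IOU_LOOP_STOP hands control
	 * back to io_uring_enter() and thus to user space.
	 */
	return IOU_LOOP_STOP;
}

SEC(".struct_ops.link")
struct io_uring_bpf_ops ring_loop_ops = {
	.loop_step = (void *)loop_step,
	/*
	 * .ring_fd is expected to be filled in from userspace before the
	 * object is loaded; bpf_io_init_member() copies it to the kernel copy.
	 */
};

Userspace would load this with ring_fd set to an existing DEFER_TASKRUN ring
(io_install_bpf() rejects SQPOLL/IOPOLL rings and rings without
IORING_SETUP_DEFER_TASKRUN), attach it with bpf_map__attach_struct_ops(), and
then call io_uring_enter(), which runs io_run_loop() instead of the fixed
submit/wait sequence while the ops are installed.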