From 13ee854e7c04236a47a5beaacdcf51eb0bc7a8fa Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 5 Feb 2025 11:36:46 +0000 Subject: io_uring/kbuf: remove legacy kbuf caching Remove all struct io_buffer caches. It makes it a fair bit simpler. Apart from from killing a bunch of lines and juggling between lists, __io_put_kbuf_list() doesn't need ->completion_lock locking now. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/18287217466ee2576ea0b1e72daccf7b22c7e856.1738724373.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 3def525a1da3..e2fef264ff8b 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -360,7 +360,6 @@ struct io_ring_ctx { spinlock_t completion_lock; - struct list_head io_buffers_comp; struct list_head cq_overflow_list; struct hlist_head waitid_list; @@ -379,8 +378,6 @@ struct io_ring_ctx { unsigned int file_alloc_start; unsigned int file_alloc_end; - struct list_head io_buffers_cache; - /* Keep this last, we don't need it for the fast path */ struct wait_queue_head poll_wq; struct io_restriction restrictions; -- cgit v1.2.3 From bcf8a0293a019bb0c4aebafdebe9a1e7a923249a Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Sun, 16 Feb 2025 19:25:04 -0700 Subject: io_uring: introduce type alias for io_tw_state In preparation for changing how io_tw_state is passed, introduce a type alias io_tw_token_t for struct io_tw_state *. This allows for changing the representation in one place, without having to update the many functions that just forward their struct io_tw_state * argument. Also add a comment to struct io_tw_state to explain its purpose. Signed-off-by: Caleb Sander Mateos Link: https://lore.kernel.org/r/20250217022511.1150145-1-csander@purestorage.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 9 ++++++++- io_uring/futex.c | 16 ++++++++-------- io_uring/io_uring.c | 28 ++++++++++++++-------------- io_uring/io_uring.h | 8 ++++---- io_uring/msg_ring.c | 2 +- io_uring/notif.c | 4 ++-- io_uring/poll.c | 18 +++++++++--------- io_uring/poll.h | 4 +++- io_uring/rw.c | 4 ++-- io_uring/rw.h | 3 ++- io_uring/timeout.c | 16 ++++++++-------- io_uring/uring_cmd.c | 2 +- io_uring/waitid.c | 8 ++++---- 13 files changed, 66 insertions(+), 56 deletions(-) (limited to 'include') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index e2fef264ff8b..ea4694ee9d19 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -436,8 +436,15 @@ struct io_ring_ctx { struct io_mapped_region param_region; }; +/* + * Token indicating function is called in task work context: + * ctx->uring_lock is held and any completions generated will be flushed. + * ONLY core io_uring.c should instantiate this struct. + */ struct io_tw_state { }; +/* Alias to use in code that doesn't instantiate struct io_tw_state */ +typedef struct io_tw_state *io_tw_token_t; enum { REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT, @@ -563,7 +570,7 @@ enum { REQ_F_HAS_METADATA = IO_REQ_FLAG(REQ_F_HAS_METADATA_BIT), }; -typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts); +typedef void (*io_req_tw_func_t)(struct io_kiocb *req, io_tw_token_t tw); struct io_task_work { struct llist_node node; diff --git a/io_uring/futex.c b/io_uring/futex.c index ede6279cadc6..b7581766406c 100644 --- a/io_uring/futex.c +++ b/io_uring/futex.c @@ -44,30 +44,30 @@ void io_futex_cache_free(struct io_ring_ctx *ctx) io_alloc_cache_free(&ctx->futex_cache, kfree); } -static void __io_futex_complete(struct io_kiocb *req, struct io_tw_state *ts) +static void __io_futex_complete(struct io_kiocb *req, io_tw_token_t tw) { req->async_data = NULL; hlist_del_init(&req->hash_node); - io_req_task_complete(req, ts); + io_req_task_complete(req, tw); } -static void io_futex_complete(struct io_kiocb *req, struct io_tw_state *ts) +static void io_futex_complete(struct io_kiocb *req, io_tw_token_t tw) { struct io_futex_data *ifd = req->async_data; struct io_ring_ctx *ctx = req->ctx; - io_tw_lock(ctx, ts); + io_tw_lock(ctx, tw); if (!io_alloc_cache_put(&ctx->futex_cache, ifd)) kfree(ifd); - __io_futex_complete(req, ts); + __io_futex_complete(req, tw); } -static void io_futexv_complete(struct io_kiocb *req, struct io_tw_state *ts) +static void io_futexv_complete(struct io_kiocb *req, io_tw_token_t tw) { struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex); struct futex_vector *futexv = req->async_data; - io_tw_lock(req->ctx, ts); + io_tw_lock(req->ctx, tw); if (!iof->futexv_unqueued) { int res; @@ -79,7 +79,7 @@ static void io_futexv_complete(struct io_kiocb *req, struct io_tw_state *ts) kfree(req->async_data); req->flags &= ~REQ_F_ASYNC_DATA; - __io_futex_complete(req, ts); + __io_futex_complete(req, tw); } static bool io_futexv_claim(struct io_futex *iof) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 4a0944a57d96..b44ff8871725 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -543,7 +543,7 @@ static void io_queue_iowq(struct io_kiocb *req) io_queue_linked_timeout(link); } -static void io_req_queue_iowq_tw(struct io_kiocb *req, struct io_tw_state *ts) +static void io_req_queue_iowq_tw(struct io_kiocb *req, io_tw_token_t tw) { io_queue_iowq(req); } @@ -1022,7 +1022,7 @@ static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req) return nxt; } -static void ctx_flush_and_put(struct io_ring_ctx *ctx, struct io_tw_state *ts) +static void ctx_flush_and_put(struct io_ring_ctx *ctx, io_tw_token_t tw) { if (!ctx) return; @@ -1277,7 +1277,7 @@ static bool io_run_local_work_continue(struct io_ring_ctx *ctx, int events, } static int __io_run_local_work_loop(struct llist_node **node, - struct io_tw_state *ts, + io_tw_token_t tw, int events) { int ret = 0; @@ -1288,7 +1288,7 @@ static int __io_run_local_work_loop(struct llist_node **node, io_task_work.node); INDIRECT_CALL_2(req->io_task_work.func, io_poll_task_func, io_req_rw_complete, - req, ts); + req, tw); *node = next; if (++ret >= events) break; @@ -1297,7 +1297,7 @@ static int __io_run_local_work_loop(struct llist_node **node, return ret; } -static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts, +static int __io_run_local_work(struct io_ring_ctx *ctx, io_tw_token_t tw, int min_events, int max_events) { struct llist_node *node; @@ -1310,7 +1310,7 @@ static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts, atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); again: min_events -= ret; - ret = __io_run_local_work_loop(&ctx->retry_llist.first, ts, max_events); + ret = __io_run_local_work_loop(&ctx->retry_llist.first, tw, max_events); if (ctx->retry_llist.first) goto retry_done; @@ -1319,7 +1319,7 @@ again: * running the pending items. */ node = llist_reverse_order(llist_del_all(&ctx->work_llist)); - ret += __io_run_local_work_loop(&node, ts, max_events - ret); + ret += __io_run_local_work_loop(&node, tw, max_events - ret); ctx->retry_llist.first = node; loops++; @@ -1357,15 +1357,15 @@ static int io_run_local_work(struct io_ring_ctx *ctx, int min_events, return ret; } -static void io_req_task_cancel(struct io_kiocb *req, struct io_tw_state *ts) +static void io_req_task_cancel(struct io_kiocb *req, io_tw_token_t tw) { - io_tw_lock(req->ctx, ts); + io_tw_lock(req->ctx, tw); io_req_defer_failed(req, req->cqe.res); } -void io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts) +void io_req_task_submit(struct io_kiocb *req, io_tw_token_t tw) { - io_tw_lock(req->ctx, ts); + io_tw_lock(req->ctx, tw); if (unlikely(io_should_terminate_tw())) io_req_defer_failed(req, -EFAULT); else if (req->flags & REQ_F_FORCE_ASYNC) @@ -1583,7 +1583,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) return 0; } -void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts) +void io_req_task_complete(struct io_kiocb *req, io_tw_token_t tw) { io_req_complete_defer(req); } @@ -1763,9 +1763,9 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) return ret; } -int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts) +int io_poll_issue(struct io_kiocb *req, io_tw_token_t tw) { - io_tw_lock(req->ctx, ts); + io_tw_lock(req->ctx, tw); return io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_MULTISHOT| IO_URING_F_COMPLETE_DEFER); } diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 85bc8f76ca19..6c46d9cdd7aa 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -90,9 +90,9 @@ void io_req_task_work_add_remote(struct io_kiocb *req, struct io_ring_ctx *ctx, unsigned flags); bool io_alloc_async_data(struct io_kiocb *req); void io_req_task_queue(struct io_kiocb *req); -void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts); +void io_req_task_complete(struct io_kiocb *req, io_tw_token_t tw); void io_req_task_queue_fail(struct io_kiocb *req, int ret); -void io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts); +void io_req_task_submit(struct io_kiocb *req, io_tw_token_t tw); struct llist_node *io_handle_tw_list(struct llist_node *node, unsigned int *count, unsigned int max_entries); struct llist_node *tctx_task_work_run(struct io_uring_task *tctx, unsigned int max_entries, unsigned int *count); void tctx_task_work(struct callback_head *cb); @@ -104,7 +104,7 @@ int io_ring_add_registered_file(struct io_uring_task *tctx, struct file *file, int start, int end); void io_req_queue_iowq(struct io_kiocb *req); -int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts); +int io_poll_issue(struct io_kiocb *req, io_tw_token_t tw); int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr); int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin); void __io_submit_flush_completions(struct io_ring_ctx *ctx); @@ -376,7 +376,7 @@ static inline bool io_task_work_pending(struct io_ring_ctx *ctx) return task_work_pending(current) || io_local_work_pending(ctx); } -static inline void io_tw_lock(struct io_ring_ctx *ctx, struct io_tw_state *ts) +static inline void io_tw_lock(struct io_ring_ctx *ctx, io_tw_token_t tw) { lockdep_assert_held(&ctx->uring_lock); } diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index 7e6f68e911f1..0bbcbbcdebfd 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -71,7 +71,7 @@ static inline bool io_msg_need_remote(struct io_ring_ctx *target_ctx) return target_ctx->task_complete; } -static void io_msg_tw_complete(struct io_kiocb *req, struct io_tw_state *ts) +static void io_msg_tw_complete(struct io_kiocb *req, io_tw_token_t tw) { struct io_ring_ctx *ctx = req->ctx; diff --git a/io_uring/notif.c b/io_uring/notif.c index ee3a33510b3c..7bd92538dccb 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -11,7 +11,7 @@ static const struct ubuf_info_ops io_ubuf_ops; -static void io_notif_tw_complete(struct io_kiocb *notif, struct io_tw_state *ts) +static void io_notif_tw_complete(struct io_kiocb *notif, io_tw_token_t tw) { struct io_notif_data *nd = io_notif_to_data(notif); @@ -29,7 +29,7 @@ static void io_notif_tw_complete(struct io_kiocb *notif, struct io_tw_state *ts) } nd = nd->next; - io_req_task_complete(notif, ts); + io_req_task_complete(notif, tw); } while (nd); } diff --git a/io_uring/poll.c b/io_uring/poll.c index bb1c0cd4f809..176854882ba6 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -220,7 +220,7 @@ static inline void io_poll_execute(struct io_kiocb *req, int res) * req->cqe.res. IOU_POLL_REMOVE_POLL_USE_RES indicates to remove multishot * poll and that the result is stored in req->cqe. */ -static int io_poll_check_events(struct io_kiocb *req, struct io_tw_state *ts) +static int io_poll_check_events(struct io_kiocb *req, io_tw_token_t tw) { int v; @@ -288,7 +288,7 @@ static int io_poll_check_events(struct io_kiocb *req, struct io_tw_state *ts) return IOU_POLL_REMOVE_POLL_USE_RES; } } else { - int ret = io_poll_issue(req, ts); + int ret = io_poll_issue(req, tw); if (ret == IOU_STOP_MULTISHOT) return IOU_POLL_REMOVE_POLL_USE_RES; else if (ret == IOU_REQUEUE) @@ -311,11 +311,11 @@ static int io_poll_check_events(struct io_kiocb *req, struct io_tw_state *ts) return IOU_POLL_NO_ACTION; } -void io_poll_task_func(struct io_kiocb *req, struct io_tw_state *ts) +void io_poll_task_func(struct io_kiocb *req, io_tw_token_t tw) { int ret; - ret = io_poll_check_events(req, ts); + ret = io_poll_check_events(req, tw); if (ret == IOU_POLL_NO_ACTION) { io_kbuf_recycle(req, 0); return; @@ -335,7 +335,7 @@ void io_poll_task_func(struct io_kiocb *req, struct io_tw_state *ts) poll = io_kiocb_to_cmd(req, struct io_poll); req->cqe.res = mangle_poll(req->cqe.res & poll->events); } else if (ret == IOU_POLL_REISSUE) { - io_req_task_submit(req, ts); + io_req_task_submit(req, tw); return; } else if (ret != IOU_POLL_REMOVE_POLL_USE_RES) { req->cqe.res = ret; @@ -343,14 +343,14 @@ void io_poll_task_func(struct io_kiocb *req, struct io_tw_state *ts) } io_req_set_res(req, req->cqe.res, 0); - io_req_task_complete(req, ts); + io_req_task_complete(req, tw); } else { - io_tw_lock(req->ctx, ts); + io_tw_lock(req->ctx, tw); if (ret == IOU_POLL_REMOVE_POLL_USE_RES) - io_req_task_complete(req, ts); + io_req_task_complete(req, tw); else if (ret == IOU_POLL_DONE || ret == IOU_POLL_REISSUE) - io_req_task_submit(req, ts); + io_req_task_submit(req, tw); else io_req_defer_failed(req, ret); } diff --git a/io_uring/poll.h b/io_uring/poll.h index 04ede93113dc..27e2db2ed4ae 100644 --- a/io_uring/poll.h +++ b/io_uring/poll.h @@ -1,5 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 +#include + #define IO_POLL_ALLOC_CACHE_MAX 32 enum { @@ -43,4 +45,4 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags); bool io_poll_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx, bool cancel_all); -void io_poll_task_func(struct io_kiocb *req, struct io_tw_state *ts); +void io_poll_task_func(struct io_kiocb *req, io_tw_token_t tw); diff --git a/io_uring/rw.c b/io_uring/rw.c index 7aa1e4c9f64a..16f12f94943f 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -511,7 +511,7 @@ static inline int io_fixup_rw_res(struct io_kiocb *req, long res) return res; } -void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts) +void io_req_rw_complete(struct io_kiocb *req, io_tw_token_t tw) { struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); struct kiocb *kiocb = &rw->kiocb; @@ -528,7 +528,7 @@ void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts) req->cqe.flags |= io_put_kbuf(req, req->cqe.res, 0); io_req_rw_cleanup(req, 0); - io_req_task_complete(req, ts); + io_req_task_complete(req, tw); } static void io_complete_rw(struct kiocb *kiocb, long res) diff --git a/io_uring/rw.h b/io_uring/rw.h index eaa59bd64870..a45e0c71b59d 100644 --- a/io_uring/rw.h +++ b/io_uring/rw.h @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 +#include #include struct io_meta_state { @@ -39,7 +40,7 @@ int io_read(struct io_kiocb *req, unsigned int issue_flags); int io_write(struct io_kiocb *req, unsigned int issue_flags); void io_readv_writev_cleanup(struct io_kiocb *req); void io_rw_fail(struct io_kiocb *req); -void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts); +void io_req_rw_complete(struct io_kiocb *req, io_tw_token_t tw); int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags); void io_rw_cache_free(const void *entry); diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 48fc8cf70784..fec6ec7beb62 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -65,7 +65,7 @@ static inline bool io_timeout_finish(struct io_timeout *timeout, static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer); -static void io_timeout_complete(struct io_kiocb *req, struct io_tw_state *ts) +static void io_timeout_complete(struct io_kiocb *req, io_tw_token_t tw) { struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); struct io_timeout_data *data = req->async_data; @@ -82,7 +82,7 @@ static void io_timeout_complete(struct io_kiocb *req, struct io_tw_state *ts) } } - io_req_task_complete(req, ts); + io_req_task_complete(req, tw); } static __cold bool io_flush_killed_timeouts(struct list_head *list, int err) @@ -154,9 +154,9 @@ __cold void io_flush_timeouts(struct io_ring_ctx *ctx) io_flush_killed_timeouts(&list, 0); } -static void io_req_tw_fail_links(struct io_kiocb *link, struct io_tw_state *ts) +static void io_req_tw_fail_links(struct io_kiocb *link, io_tw_token_t tw) { - io_tw_lock(link->ctx, ts); + io_tw_lock(link->ctx, tw); while (link) { struct io_kiocb *nxt = link->link; long res = -ECANCELED; @@ -165,7 +165,7 @@ static void io_req_tw_fail_links(struct io_kiocb *link, struct io_tw_state *ts) res = link->cqe.res; link->link = NULL; io_req_set_res(link, res, 0); - io_req_task_complete(link, ts); + io_req_task_complete(link, tw); link = nxt; } } @@ -312,7 +312,7 @@ int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) return 0; } -static void io_req_task_link_timeout(struct io_kiocb *req, struct io_tw_state *ts) +static void io_req_task_link_timeout(struct io_kiocb *req, io_tw_token_t tw) { struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); struct io_kiocb *prev = timeout->prev; @@ -330,11 +330,11 @@ static void io_req_task_link_timeout(struct io_kiocb *req, struct io_tw_state *t ret = -ECANCELED; } io_req_set_res(req, ret ?: -ETIME, 0); - io_req_task_complete(req, ts); + io_req_task_complete(req, tw); io_put_req(prev); } else { io_req_set_res(req, -ETIME, 0); - io_req_task_complete(req, ts); + io_req_task_complete(req, tw); } } diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index e6701b7aa147..8bdf2c9b3fef 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -102,7 +102,7 @@ void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, } EXPORT_SYMBOL_GPL(io_uring_cmd_mark_cancelable); -static void io_uring_cmd_work(struct io_kiocb *req, struct io_tw_state *ts) +static void io_uring_cmd_work(struct io_kiocb *req, io_tw_token_t tw) { struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); unsigned int flags = IO_URING_F_COMPLETE_DEFER; diff --git a/io_uring/waitid.c b/io_uring/waitid.c index 5c443e5f6d92..347b8f53efa7 100644 --- a/io_uring/waitid.c +++ b/io_uring/waitid.c @@ -16,7 +16,7 @@ #include "waitid.h" #include "../kernel/exit.h" -static void io_waitid_cb(struct io_kiocb *req, struct io_tw_state *ts); +static void io_waitid_cb(struct io_kiocb *req, io_tw_token_t tw); #define IO_WAITID_CANCEL_FLAG BIT(31) #define IO_WAITID_REF_MASK GENMASK(30, 0) @@ -185,13 +185,13 @@ static inline bool io_waitid_drop_issue_ref(struct io_kiocb *req) return true; } -static void io_waitid_cb(struct io_kiocb *req, struct io_tw_state *ts) +static void io_waitid_cb(struct io_kiocb *req, io_tw_token_t tw) { struct io_waitid_async *iwa = req->async_data; struct io_ring_ctx *ctx = req->ctx; int ret; - io_tw_lock(ctx, ts); + io_tw_lock(ctx, tw); ret = __do_wait(&iwa->wo); @@ -221,7 +221,7 @@ static void io_waitid_cb(struct io_kiocb *req, struct io_tw_state *ts) } io_waitid_complete(req, ret); - io_req_task_complete(req, ts); + io_req_task_complete(req, tw); } static int io_waitid_wait(struct wait_queue_entry *wait, unsigned mode, -- cgit v1.2.3 From 94a4274bb6ebc5b4293559304d0f00928de0d8c0 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Sun, 16 Feb 2025 19:25:05 -0700 Subject: io_uring: pass struct io_tw_state by value 8e5b3b89ecaf ("io_uring: remove struct io_tw_state::locked") removed the only field of io_tw_state but kept it as a task work callback argument to "forc[e] users not to invoke them carelessly out of a wrong context". Passing the struct io_tw_state * argument adds a few instructions to all callers that can't inline the functions and see the argument is unused. So pass struct io_tw_state by value instead. Since it's a 0-sized value, it can be passed without any instructions needed to initialize it. Signed-off-by: Caleb Sander Mateos Link: https://lore.kernel.org/r/20250217022511.1150145-2-csander@purestorage.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 +- io_uring/io_uring.c | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index ea4694ee9d19..123e69368730 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -444,7 +444,7 @@ struct io_ring_ctx { struct io_tw_state { }; /* Alias to use in code that doesn't instantiate struct io_tw_state */ -typedef struct io_tw_state *io_tw_token_t; +typedef struct io_tw_state io_tw_token_t; enum { REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT, diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index b44ff8871725..b688953d1de8 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -255,7 +255,7 @@ static __cold void io_fallback_req_func(struct work_struct *work) percpu_ref_get(&ctx->refs); mutex_lock(&ctx->uring_lock); llist_for_each_entry_safe(req, tmp, node, io_task_work.node) - req->io_task_work.func(req, &ts); + req->io_task_work.func(req, ts); io_submit_flush_completions(ctx); mutex_unlock(&ctx->uring_lock); percpu_ref_put(&ctx->refs); @@ -1052,24 +1052,24 @@ struct llist_node *io_handle_tw_list(struct llist_node *node, io_task_work.node); if (req->ctx != ctx) { - ctx_flush_and_put(ctx, &ts); + ctx_flush_and_put(ctx, ts); ctx = req->ctx; mutex_lock(&ctx->uring_lock); percpu_ref_get(&ctx->refs); } INDIRECT_CALL_2(req->io_task_work.func, io_poll_task_func, io_req_rw_complete, - req, &ts); + req, ts); node = next; (*count)++; if (unlikely(need_resched())) { - ctx_flush_and_put(ctx, &ts); + ctx_flush_and_put(ctx, ts); ctx = NULL; cond_resched(); } } while (node && *count < max_entries); - ctx_flush_and_put(ctx, &ts); + ctx_flush_and_put(ctx, ts); return node; } @@ -1341,7 +1341,7 @@ static inline int io_run_local_work_locked(struct io_ring_ctx *ctx, if (!io_local_work_pending(ctx)) return 0; - return __io_run_local_work(ctx, &ts, min_events, + return __io_run_local_work(ctx, ts, min_events, max(IO_LOCAL_TW_DEFAULT_MAX, min_events)); } @@ -1352,7 +1352,7 @@ static int io_run_local_work(struct io_ring_ctx *ctx, int min_events, int ret; mutex_lock(&ctx->uring_lock); - ret = __io_run_local_work(ctx, &ts, min_events, max_events); + ret = __io_run_local_work(ctx, ts, min_events, max_events); mutex_unlock(&ctx->uring_lock); return ret; } -- cgit v1.2.3 From 69d483d5f43e7a525246090c80f978b827104ad4 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 24 Feb 2025 13:31:09 -0800 Subject: io_uring/nvme: pass issue_flags to io_uring_cmd_import_fixed() io_uring_cmd_import_fixed() will need to know the io_uring execution state in following commits, for now just pass issue_flags into it without actually using. Reviewed-by: Keith Busch Signed-off-by: Pavel Begunkov Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20250224213116.3509093-5-kbusch@meta.com Signed-off-by: Jens Axboe --- drivers/nvme/host/ioctl.c | 10 ++++++---- include/linux/io_uring/cmd.h | 6 ++++-- io_uring/uring_cmd.c | 3 ++- 3 files changed, 12 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c index e8930146847a..e0876bc9aacd 100644 --- a/drivers/nvme/host/ioctl.c +++ b/drivers/nvme/host/ioctl.c @@ -114,7 +114,8 @@ static struct request *nvme_alloc_user_request(struct request_queue *q, static int nvme_map_user_request(struct request *req, u64 ubuffer, unsigned bufflen, void __user *meta_buffer, unsigned meta_len, - struct io_uring_cmd *ioucmd, unsigned int flags) + struct io_uring_cmd *ioucmd, unsigned int flags, + unsigned int iou_issue_flags) { struct request_queue *q = req->q; struct nvme_ns *ns = q->queuedata; @@ -142,7 +143,8 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer, if (WARN_ON_ONCE(flags & NVME_IOCTL_VEC)) return -EINVAL; ret = io_uring_cmd_import_fixed(ubuffer, bufflen, - rq_data_dir(req), &iter, ioucmd); + rq_data_dir(req), &iter, ioucmd, + iou_issue_flags); if (ret < 0) goto out; ret = blk_rq_map_user_iov(q, req, NULL, &iter, GFP_KERNEL); @@ -194,7 +196,7 @@ static int nvme_submit_user_cmd(struct request_queue *q, req->timeout = timeout; if (ubuffer && bufflen) { ret = nvme_map_user_request(req, ubuffer, bufflen, meta_buffer, - meta_len, NULL, flags); + meta_len, NULL, flags, 0); if (ret) return ret; } @@ -514,7 +516,7 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns, if (d.addr && d.data_len) { ret = nvme_map_user_request(req, d.addr, d.data_len, nvme_to_user_ptr(d.metadata), - d.metadata_len, ioucmd, vec); + d.metadata_len, ioucmd, vec, issue_flags); if (ret) return ret; } diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index abd0c8bd950b..87150dc0a07c 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -39,7 +39,8 @@ static inline void io_uring_cmd_private_sz_check(size_t cmd_sz) #if defined(CONFIG_IO_URING) int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, - struct iov_iter *iter, void *ioucmd); + struct iov_iter *iter, void *ioucmd, + unsigned int issue_flags); /* * Completes the request, i.e. posts an io_uring CQE and deallocates @ioucmd @@ -67,7 +68,8 @@ void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd); #else static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, - struct iov_iter *iter, void *ioucmd) + struct iov_iter *iter, void *ioucmd, + unsigned int issue_flags) { return -EOPNOTSUPP; } diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 14086a266461..28ed69c40756 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -257,7 +257,8 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) } int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, - struct iov_iter *iter, void *ioucmd) + struct iov_iter *iter, void *ioucmd, + unsigned int issue_flags) { struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); struct io_rsrc_node *node = req->buf_node; -- cgit v1.2.3 From 27cb27b6d5ea401143ca3648983342bb820c4be9 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 27 Feb 2025 14:39:14 -0800 Subject: io_uring: add support for kernel registered bvecs Provide an interface for the kernel to leverage the existing pre-registered buffers that io_uring provides. User space can reference these later to achieve zero-copy IO. User space must register an empty fixed buffer table with io_uring in order for the kernel to make use of it. Signed-off-by: Keith Busch Link: https://lore.kernel.org/r/20250227223916.143006-5-kbusch@meta.com Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- include/linux/io_uring/cmd.h | 7 +++ io_uring/io_uring.c | 3 ++ io_uring/rsrc.c | 123 ++++++++++++++++++++++++++++++++++++++++--- io_uring/rsrc.h | 9 ++++ io_uring/rw.c | 3 ++ 5 files changed, 138 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index 87150dc0a07c..cf8d80d84734 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -4,6 +4,7 @@ #include #include +#include /* only top 8 bits of sqe->uring_cmd_flags for kernel internal use */ #define IORING_URING_CMD_CANCELABLE (1U << 30) @@ -125,4 +126,10 @@ static inline struct io_uring_cmd_data *io_uring_cmd_get_async_data(struct io_ur return cmd_to_io_kiocb(cmd)->async_data; } +int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq, + void (*release)(void *), unsigned int index, + unsigned int issue_flags); +void io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index, + unsigned int issue_flags); + #endif /* _LINUX_IO_URING_CMD_H */ diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 9f50f7e0b57e..f2bdb1eab577 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3940,6 +3940,9 @@ static int __init io_uring_init(void) io_uring_optable_init(); + /* imu->dir is u8 */ + BUILD_BUG_ON((IO_IMU_DEST | IO_IMU_SOURCE) > U8_MAX); + /* * Allow user copy in the per-command field, which starts after the * file in io_kiocb and until the opcode field. The openat2 handling diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 6cf7dba44d5b..bf4e2c7897e5 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -9,6 +9,7 @@ #include #include #include +#include #include @@ -101,17 +102,23 @@ static int io_buffer_validate(struct iovec *iov) return 0; } -static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_rsrc_node *node) +static void io_release_ubuf(void *priv) { - struct io_mapped_ubuf *imu = node->buf; + struct io_mapped_ubuf *imu = priv; unsigned int i; - if (!refcount_dec_and_test(&imu->refs)) - return; for (i = 0; i < imu->nr_bvecs; i++) unpin_user_page(imu->bvec[i].bv_page); +} + +static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu) +{ + if (!refcount_dec_and_test(&imu->refs)) + return; + if (imu->acct_pages) io_unaccount_mem(ctx, imu->acct_pages); + imu->release(imu->priv); kvfree(imu); } @@ -451,7 +458,7 @@ void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node) break; case IORING_RSRC_BUFFER: if (node->buf) - io_buffer_unmap(ctx, node); + io_buffer_unmap(ctx, node->buf); break; default: WARN_ON_ONCE(1); @@ -761,6 +768,10 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, imu->len = iov->iov_len; imu->nr_bvecs = nr_pages; imu->folio_shift = PAGE_SHIFT; + imu->release = io_release_ubuf; + imu->priv = imu; + imu->is_kbuf = false; + imu->dir = IO_IMU_DEST | IO_IMU_SOURCE; if (coalesced) imu->folio_shift = data.folio_shift; refcount_set(&imu->refs, 1); @@ -857,6 +868,95 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, return ret; } +int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq, + void (*release)(void *), unsigned int index, + unsigned int issue_flags) +{ + struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx; + struct io_rsrc_data *data = &ctx->buf_table; + struct req_iterator rq_iter; + struct io_mapped_ubuf *imu; + struct io_rsrc_node *node; + struct bio_vec bv, *bvec; + u16 nr_bvecs; + int ret = 0; + + io_ring_submit_lock(ctx, issue_flags); + if (index >= data->nr) { + ret = -EINVAL; + goto unlock; + } + index = array_index_nospec(index, data->nr); + + if (data->nodes[index]) { + ret = -EBUSY; + goto unlock; + } + + node = io_rsrc_node_alloc(IORING_RSRC_BUFFER); + if (!node) { + ret = -ENOMEM; + goto unlock; + } + + nr_bvecs = blk_rq_nr_phys_segments(rq); + imu = kvmalloc(struct_size(imu, bvec, nr_bvecs), GFP_KERNEL); + if (!imu) { + kfree(node); + ret = -ENOMEM; + goto unlock; + } + + imu->ubuf = 0; + imu->len = blk_rq_bytes(rq); + imu->acct_pages = 0; + imu->folio_shift = PAGE_SHIFT; + imu->nr_bvecs = nr_bvecs; + refcount_set(&imu->refs, 1); + imu->release = release; + imu->priv = rq; + imu->is_kbuf = true; + + if (op_is_write(req_op(rq))) + imu->dir = IO_IMU_SOURCE; + else + imu->dir = IO_IMU_DEST; + + bvec = imu->bvec; + rq_for_each_bvec(bv, rq, rq_iter) + *bvec++ = bv; + + node->buf = imu; + data->nodes[index] = node; +unlock: + io_ring_submit_unlock(ctx, issue_flags); + return ret; +} +EXPORT_SYMBOL_GPL(io_buffer_register_bvec); + +void io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index, + unsigned int issue_flags) +{ + struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx; + struct io_rsrc_data *data = &ctx->buf_table; + struct io_rsrc_node *node; + + io_ring_submit_lock(ctx, issue_flags); + if (index >= data->nr) + goto unlock; + index = array_index_nospec(index, data->nr); + + node = data->nodes[index]; + if (!node || !node->buf->is_kbuf) + goto unlock; + + io_put_rsrc_node(ctx, node); + data->nodes[index] = NULL; +unlock: + io_ring_submit_unlock(ctx, issue_flags); +} +EXPORT_SYMBOL_GPL(io_buffer_unregister_bvec); + static int io_import_fixed(int ddir, struct iov_iter *iter, struct io_mapped_ubuf *imu, u64 buf_addr, size_t len) @@ -871,6 +971,8 @@ static int io_import_fixed(int ddir, struct iov_iter *iter, /* not inside the mapped region */ if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len))) return -EFAULT; + if (!(imu->dir & (1 << ddir))) + return -EFAULT; /* * Might not be a start of buffer, set size appropriately @@ -883,8 +985,8 @@ static int io_import_fixed(int ddir, struct iov_iter *iter, /* * Don't use iov_iter_advance() here, as it's really slow for * using the latter parts of a big fixed buffer - it iterates - * over each segment manually. We can cheat a bit here, because - * we know that: + * over each segment manually. We can cheat a bit here for user + * registered nodes, because we know that: * * 1) it's a BVEC iter, we set it up * 2) all bvecs are the same in size, except potentially the @@ -898,8 +1000,15 @@ static int io_import_fixed(int ddir, struct iov_iter *iter, */ const struct bio_vec *bvec = imu->bvec; + /* + * Kernel buffer bvecs, on the other hand, don't necessarily + * have the size property of user registered ones, so we have + * to use the slow iter advance. + */ if (offset < bvec->bv_len) { iter->iov_offset = offset; + } else if (imu->is_kbuf) { + iov_iter_advance(iter, offset); } else { unsigned long seg_skip; diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 0fc07897070c..6b61f3f8cce7 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -20,6 +20,11 @@ struct io_rsrc_node { }; }; +enum { + IO_IMU_DEST = 1 << ITER_DEST, + IO_IMU_SOURCE = 1 << ITER_SOURCE, +}; + struct io_mapped_ubuf { u64 ubuf; unsigned int len; @@ -27,6 +32,10 @@ struct io_mapped_ubuf { unsigned int folio_shift; refcount_t refs; unsigned long acct_pages; + void (*release)(void *); + void *priv; + bool is_kbuf; + u8 dir; struct bio_vec bvec[] __counted_by(nr_bvecs); }; diff --git a/io_uring/rw.c b/io_uring/rw.c index 7bc23802a388..5ee9f8949e8b 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -629,6 +629,7 @@ static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb) */ static ssize_t loop_rw_iter(int ddir, struct io_rw *rw, struct iov_iter *iter) { + struct io_kiocb *req = cmd_to_io_kiocb(rw); struct kiocb *kiocb = &rw->kiocb; struct file *file = kiocb->ki_filp; ssize_t ret = 0; @@ -644,6 +645,8 @@ static ssize_t loop_rw_iter(int ddir, struct io_rw *rw, struct iov_iter *iter) if ((kiocb->ki_flags & IOCB_NOWAIT) && !(kiocb->ki_filp->f_flags & O_NONBLOCK)) return -EAGAIN; + if ((req->flags & REQ_F_BUF_NODE) && req->buf_node->buf->is_kbuf) + return -EFAULT; ppos = io_kiocb_ppos(kiocb); -- cgit v1.2.3 From 1f6540e2aabb7372e68223a3669019589c3e30ad Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 27 Feb 2025 14:39:15 -0800 Subject: ublk: zc register/unregister bvec Provide new operations for the user to request mapping an active request to an io uring instance's buf_table. The user has to provide the index it wants to install the buffer. A reference count is taken on the request to ensure it can't be completed while it is active in a ring's buf_table. Signed-off-by: Keith Busch Link: https://lore.kernel.org/r/20250227223916.143006-6-kbusch@meta.com Reviewed-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 59 ++++++++++++++++++++++++++++++++++++++----- include/uapi/linux/ublk_cmd.h | 4 +++ 2 files changed, 56 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 529085181f35..b5cf92baaf0f 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -51,6 +51,9 @@ /* private ioctl command mirror */ #define UBLK_CMD_DEL_DEV_ASYNC _IOC_NR(UBLK_U_CMD_DEL_DEV_ASYNC) +#define UBLK_IO_REGISTER_IO_BUF _IOC_NR(UBLK_U_IO_REGISTER_IO_BUF) +#define UBLK_IO_UNREGISTER_IO_BUF _IOC_NR(UBLK_U_IO_UNREGISTER_IO_BUF) + /* All UBLK_F_* have to be included into UBLK_F_ALL */ #define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY \ | UBLK_F_URING_CMD_COMP_IN_TASK \ @@ -196,12 +199,14 @@ struct ublk_params_header { static bool ublk_abort_requests(struct ublk_device *ub, struct ublk_queue *ubq); +static inline struct request *__ublk_check_and_get_req(struct ublk_device *ub, + struct ublk_queue *ubq, int tag, size_t offset); static inline unsigned int ublk_req_build_flags(struct request *req); static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq, int tag); static inline bool ublk_dev_is_user_copy(const struct ublk_device *ub) { - return ub->dev_info.flags & UBLK_F_USER_COPY; + return ub->dev_info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY); } static inline bool ublk_dev_is_zoned(const struct ublk_device *ub) @@ -581,7 +586,7 @@ static void ublk_apply_params(struct ublk_device *ub) static inline bool ublk_support_user_copy(const struct ublk_queue *ubq) { - return ubq->flags & UBLK_F_USER_COPY; + return ubq->flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY); } static inline bool ublk_need_req_ref(const struct ublk_queue *ubq) @@ -1747,6 +1752,45 @@ static inline void ublk_prep_cancel(struct io_uring_cmd *cmd, io_uring_cmd_mark_cancelable(cmd, issue_flags); } +static void ublk_io_release(void *priv) +{ + struct request *rq = priv; + struct ublk_queue *ubq = rq->mq_hctx->driver_data; + + ublk_put_req_ref(ubq, rq); +} + +static int ublk_register_io_buf(struct io_uring_cmd *cmd, + struct ublk_queue *ubq, unsigned int tag, + const struct ublksrv_io_cmd *ub_cmd, + unsigned int issue_flags) +{ + struct ublk_device *ub = cmd->file->private_data; + int index = (int)ub_cmd->addr, ret; + struct request *req; + + req = __ublk_check_and_get_req(ub, ubq, tag, 0); + if (!req) + return -EINVAL; + + ret = io_buffer_register_bvec(cmd, req, ublk_io_release, index, + issue_flags); + if (ret) { + ublk_put_req_ref(ubq, req); + return ret; + } + + return 0; +} + +static int ublk_unregister_io_buf(struct io_uring_cmd *cmd, + const struct ublksrv_io_cmd *ub_cmd, + unsigned int issue_flags) +{ + io_buffer_unregister_bvec(cmd, ub_cmd->addr, issue_flags); + return 0; +} + static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags, const struct ublksrv_io_cmd *ub_cmd) @@ -1798,6 +1842,10 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, ret = -EINVAL; switch (_IOC_NR(cmd_op)) { + case UBLK_IO_REGISTER_IO_BUF: + return ublk_register_io_buf(cmd, ubq, tag, ub_cmd, issue_flags); + case UBLK_IO_UNREGISTER_IO_BUF: + return ublk_unregister_io_buf(cmd, ub_cmd, issue_flags); case UBLK_IO_FETCH_REQ: /* UBLK_IO_FETCH_REQ is only allowed before queue is setup */ if (ublk_queue_ready(ubq)) { @@ -2459,7 +2507,7 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd) * buffer by pwrite() to ublk char device, which can't be * used for unprivileged device */ - if (info.flags & UBLK_F_USER_COPY) + if (info.flags & (UBLK_F_USER_COPY | UBLK_F_SUPPORT_ZERO_COPY)) return -EINVAL; } @@ -2527,9 +2575,6 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd) goto out_free_dev_number; } - /* We are not ready to support zero copy */ - ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY; - ub->dev_info.nr_hw_queues = min_t(unsigned int, ub->dev_info.nr_hw_queues, nr_cpu_ids); ublk_align_max_io_size(ub); @@ -2860,7 +2905,7 @@ static int ublk_ctrl_get_features(struct io_uring_cmd *cmd) { const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe); void __user *argp = (void __user *)(unsigned long)header->addr; - u64 features = UBLK_F_ALL & ~UBLK_F_SUPPORT_ZERO_COPY; + u64 features = UBLK_F_ALL; if (header->len != UBLK_FEATURES_LEN || !header->addr) return -EINVAL; diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index a8bc98bb69fc..74246c926b55 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -94,6 +94,10 @@ _IOWR('u', UBLK_IO_COMMIT_AND_FETCH_REQ, struct ublksrv_io_cmd) #define UBLK_U_IO_NEED_GET_DATA \ _IOWR('u', UBLK_IO_NEED_GET_DATA, struct ublksrv_io_cmd) +#define UBLK_U_IO_REGISTER_IO_BUF \ + _IOWR('u', 0x23, struct ublksrv_io_cmd) +#define UBLK_U_IO_UNREGISTER_IO_BUF \ + _IOWR('u', 0x24, struct ublksrv_io_cmd) /* only ABORT means that no re-fetch */ #define UBLK_IO_RES_OK 0 -- cgit v1.2.3 From ed9f3112a8a8f6e6919d3b9da2651fa302df7be3 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 27 Feb 2025 14:39:16 -0800 Subject: io_uring: cache nodes and mapped buffers Frequent alloc/free cycles on these is pretty costly. Use an io cache to more efficiently reuse these buffers. Signed-off-by: Keith Busch Link: https://lore.kernel.org/r/20250227223916.143006-7-kbusch@meta.com [axboe: fix imu leak] Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 ++ io_uring/filetable.c | 2 +- io_uring/io_uring.c | 2 ++ io_uring/rsrc.c | 71 +++++++++++++++++++++++++++++++++--------- io_uring/rsrc.h | 4 ++- 5 files changed, 65 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 123e69368730..432c98ff52ee 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -292,6 +292,8 @@ struct io_ring_ctx { struct io_file_table file_table; struct io_rsrc_data buf_table; + struct io_alloc_cache node_cache; + struct io_alloc_cache imu_cache; struct io_submit_state submit_state; diff --git a/io_uring/filetable.c b/io_uring/filetable.c index dd8eeec97acf..a21660e3145a 100644 --- a/io_uring/filetable.c +++ b/io_uring/filetable.c @@ -68,7 +68,7 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file, if (slot_index >= ctx->file_table.data.nr) return -EINVAL; - node = io_rsrc_node_alloc(IORING_RSRC_FILE); + node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE); if (!node) return -ENOMEM; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index f2bdb1eab577..ccc343f61a57 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -291,6 +291,7 @@ static void io_free_alloc_caches(struct io_ring_ctx *ctx) io_alloc_cache_free(&ctx->uring_cache, kfree); io_alloc_cache_free(&ctx->msg_cache, kfree); io_futex_cache_free(ctx); + io_rsrc_cache_free(ctx); } static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) @@ -338,6 +339,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) ret |= io_alloc_cache_init(&ctx->msg_cache, IO_ALLOC_CACHE_MAX, sizeof(struct io_kiocb), 0); ret |= io_futex_cache_init(ctx); + ret |= io_rsrc_cache_init(ctx); if (ret) goto free_ref; init_completion(&ctx->ref_comp); diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index bf4e2c7897e5..45bfb37bca1e 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -33,6 +33,8 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, #define IORING_MAX_FIXED_FILES (1U << 20) #define IORING_MAX_REG_BUFFERS (1U << 14) +#define IO_CACHED_BVECS_SEGS 32 + int __io_account_mem(struct user_struct *user, unsigned long nr_pages) { unsigned long page_limit, cur_pages, new_pages; @@ -111,6 +113,22 @@ static void io_release_ubuf(void *priv) unpin_user_page(imu->bvec[i].bv_page); } +static struct io_mapped_ubuf *io_alloc_imu(struct io_ring_ctx *ctx, + int nr_bvecs) +{ + if (nr_bvecs <= IO_CACHED_BVECS_SEGS) + return io_cache_alloc(&ctx->imu_cache, GFP_KERNEL); + return kvmalloc(struct_size_t(struct io_mapped_ubuf, bvec, nr_bvecs), + GFP_KERNEL); +} + +static void io_free_imu(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu) +{ + if (imu->nr_bvecs > IO_CACHED_BVECS_SEGS || + !io_alloc_cache_put(&ctx->imu_cache, imu)) + kvfree(imu); +} + static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu) { if (!refcount_dec_and_test(&imu->refs)) @@ -119,22 +137,45 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu) if (imu->acct_pages) io_unaccount_mem(ctx, imu->acct_pages); imu->release(imu->priv); - kvfree(imu); + io_free_imu(ctx, imu); } -struct io_rsrc_node *io_rsrc_node_alloc(int type) +struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type) { struct io_rsrc_node *node; - node = kzalloc(sizeof(*node), GFP_KERNEL); + node = io_cache_alloc(&ctx->node_cache, GFP_KERNEL); if (node) { node->type = type; node->refs = 1; + node->tag = 0; + node->file_ptr = 0; } return node; } -__cold void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data) +bool io_rsrc_cache_init(struct io_ring_ctx *ctx) +{ + const int imu_cache_size = struct_size_t(struct io_mapped_ubuf, bvec, + IO_CACHED_BVECS_SEGS); + const int node_size = sizeof(struct io_rsrc_node); + bool ret; + + ret = io_alloc_cache_init(&ctx->node_cache, IO_ALLOC_CACHE_MAX, + node_size, 0); + ret |= io_alloc_cache_init(&ctx->imu_cache, IO_ALLOC_CACHE_MAX, + imu_cache_size, 0); + return ret; +} + +void io_rsrc_cache_free(struct io_ring_ctx *ctx) +{ + io_alloc_cache_free(&ctx->node_cache, kfree); + io_alloc_cache_free(&ctx->imu_cache, kfree); +} + +__cold void io_rsrc_data_free(struct io_ring_ctx *ctx, + struct io_rsrc_data *data) { if (!data->nr) return; @@ -207,7 +248,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, err = -EBADF; break; } - node = io_rsrc_node_alloc(IORING_RSRC_FILE); + node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE); if (!node) { err = -ENOMEM; fput(file); @@ -465,7 +506,8 @@ void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node) break; } - kfree(node); + if (!io_alloc_cache_put(&ctx->node_cache, node)) + kvfree(node); } int io_sqe_files_unregister(struct io_ring_ctx *ctx) @@ -527,7 +569,7 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, goto fail; } ret = -ENOMEM; - node = io_rsrc_node_alloc(IORING_RSRC_FILE); + node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE); if (!node) { fput(file); goto fail; @@ -732,7 +774,7 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, if (!iov->iov_base) return NULL; - node = io_rsrc_node_alloc(IORING_RSRC_BUFFER); + node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER); if (!node) return ERR_PTR(-ENOMEM); node->buf = NULL; @@ -752,10 +794,11 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, coalesced = io_coalesce_buffer(&pages, &nr_pages, &data); } - imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL); + imu = io_alloc_imu(ctx, nr_pages); if (!imu) goto done; + imu->nr_bvecs = nr_pages; ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage); if (ret) { unpin_user_pages(pages, nr_pages); @@ -766,7 +809,6 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, /* store original address for later verification */ imu->ubuf = (unsigned long) iov->iov_base; imu->len = iov->iov_len; - imu->nr_bvecs = nr_pages; imu->folio_shift = PAGE_SHIFT; imu->release = io_release_ubuf; imu->priv = imu; @@ -789,7 +831,8 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, } done: if (ret) { - kvfree(imu); + if (imu) + io_free_imu(ctx, imu); if (node) io_put_rsrc_node(ctx, node); node = ERR_PTR(ret); @@ -893,14 +936,14 @@ int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq, goto unlock; } - node = io_rsrc_node_alloc(IORING_RSRC_BUFFER); + node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER); if (!node) { ret = -ENOMEM; goto unlock; } nr_bvecs = blk_rq_nr_phys_segments(rq); - imu = kvmalloc(struct_size(imu, bvec, nr_bvecs), GFP_KERNEL); + imu = io_alloc_imu(ctx, nr_bvecs); if (!imu) { kfree(node); ret = -ENOMEM; @@ -1137,7 +1180,7 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx if (!src_node) { dst_node = NULL; } else { - dst_node = io_rsrc_node_alloc(IORING_RSRC_BUFFER); + dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER); if (!dst_node) { ret = -ENOMEM; goto out_free; diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 6b61f3f8cce7..6fe7b9e615bf 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -48,7 +48,9 @@ struct io_imu_folio_data { unsigned int nr_folios; }; -struct io_rsrc_node *io_rsrc_node_alloc(int type); +bool io_rsrc_cache_init(struct io_ring_ctx *ctx); +void io_rsrc_cache_free(struct io_ring_ctx *ctx); +struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type); void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node); void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data); int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr); -- cgit v1.2.3 From 0c542a69cbcd1fefad32c59cea7a80413fe60922 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Fri, 28 Feb 2025 15:15:13 -0700 Subject: io_uring/uring_cmd: specify io_uring_cmd_import_fixed() pointer type io_uring_cmd_import_fixed() takes a struct io_uring_cmd *, but the type of the ioucmd parameter is void *. Make the pointer type explicit so the compiler can type check it. Signed-off-by: Caleb Sander Mateos Link: https://lore.kernel.org/r/20250228221514.604350-1-csander@purestorage.com Signed-off-by: Jens Axboe --- include/linux/io_uring/cmd.h | 10 ++++++---- io_uring/uring_cmd.c | 3 ++- 2 files changed, 8 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index cf8d80d84734..5bc4f0d58506 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -40,7 +40,8 @@ static inline void io_uring_cmd_private_sz_check(size_t cmd_sz) #if defined(CONFIG_IO_URING) int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, - struct iov_iter *iter, void *ioucmd, + struct iov_iter *iter, + struct io_uring_cmd *ioucmd, unsigned int issue_flags); /* @@ -68,9 +69,10 @@ void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd); #else -static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, - struct iov_iter *iter, void *ioucmd, - unsigned int issue_flags) +static inline int +io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, + struct iov_iter *iter, struct io_uring_cmd *ioucmd, + unsigned int issue_flags) { return -EOPNOTSUPP; } diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 31d5e0948af1..de39b602aa82 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -245,7 +245,8 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) } int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, - struct iov_iter *iter, void *ioucmd, + struct iov_iter *iter, + struct io_uring_cmd *ioucmd, unsigned int issue_flags) { struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); -- cgit v1.2.3 From 09fdd35162c289f354326a55d552a8858f6e8072 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Fri, 28 Feb 2025 16:03:04 -0700 Subject: io_uring: convert cmd_to_io_kiocb() macro to function The cmd_to_io_kiocb() macro applies a pointer cast to its input without parenthesizing it. Currently all inputs are variable names, so this has the intended effect. But since casts have relatively high precedence, the macro would apply the cast to the wrong value if the input was a pointer addition, for example. Turn the macro into a static inline function to ensure the pointer cast is applied to the full input value. Signed-off-by: Caleb Sander Mateos Link: https://lore.kernel.org/r/20250228230305.630885-1-csander@purestorage.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 432c98ff52ee..72aac84dca93 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -607,7 +607,11 @@ static inline void io_kiocb_cmd_sz_check(size_t cmd_sz) io_kiocb_cmd_sz_check(sizeof(cmd_type)) , \ ((cmd_type *)&(req)->cmd) \ ) -#define cmd_to_io_kiocb(ptr) ((struct io_kiocb *) ptr) + +static inline struct io_kiocb *cmd_to_io_kiocb(void *ptr) +{ + return ptr; +} struct io_kiocb { union { -- cgit v1.2.3 From e6ea7ec494881bcf61b8f0f77f7cb3542f717ff2 Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Fri, 28 Feb 2025 16:14:31 -0700 Subject: io_uring/ublk: report error when unregister operation fails Indicate to userspace applications if a UBLK_IO_UNREGISTER_IO_BUF command specifies an invalid buffer index by returning an error code. Return -EINVAL if no buffer is registered with the given index, and -EBUSY if the registered buffer is not a kernel bvec. Signed-off-by: Caleb Sander Mateos Link: https://lore.kernel.org/r/20250228231432.642417-1-csander@purestorage.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 3 +-- include/linux/io_uring/cmd.h | 4 ++-- io_uring/rsrc.c | 18 ++++++++++++++---- 3 files changed, 17 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index b5cf92baaf0f..512cbd456817 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1787,8 +1787,7 @@ static int ublk_unregister_io_buf(struct io_uring_cmd *cmd, const struct ublksrv_io_cmd *ub_cmd, unsigned int issue_flags) { - io_buffer_unregister_bvec(cmd, ub_cmd->addr, issue_flags); - return 0; + return io_buffer_unregister_bvec(cmd, ub_cmd->addr, issue_flags); } static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h index 5bc4f0d58506..598cacda4aa3 100644 --- a/include/linux/io_uring/cmd.h +++ b/include/linux/io_uring/cmd.h @@ -131,7 +131,7 @@ static inline struct io_uring_cmd_data *io_uring_cmd_get_async_data(struct io_ur int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq, void (*release)(void *), unsigned int index, unsigned int issue_flags); -void io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index, - unsigned int issue_flags); +int io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index, + unsigned int issue_flags); #endif /* _LINUX_IO_URING_CMD_H */ diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 3107a03d56b8..c9105030f0e3 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -973,26 +973,36 @@ unlock: } EXPORT_SYMBOL_GPL(io_buffer_register_bvec); -void io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index, - unsigned int issue_flags) +int io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index, + unsigned int issue_flags) { struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx; struct io_rsrc_data *data = &ctx->buf_table; struct io_rsrc_node *node; + int ret = 0; io_ring_submit_lock(ctx, issue_flags); - if (index >= data->nr) + if (index >= data->nr) { + ret = -EINVAL; goto unlock; + } index = array_index_nospec(index, data->nr); node = data->nodes[index]; - if (!node || !node->buf->is_kbuf) + if (!node) { + ret = -EINVAL; goto unlock; + } + if (!node->buf->is_kbuf) { + ret = -EBUSY; + goto unlock; + } io_put_rsrc_node(ctx, node); data->nodes[index] = NULL; unlock: io_ring_submit_unlock(ctx, issue_flags); + return ret; } EXPORT_SYMBOL_GPL(io_buffer_unregister_bvec); -- cgit v1.2.3