From d8aeb44a9ae324c4b823689fabb30b6621d93c88 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 7 Mar 2023 09:40:28 -0700 Subject: fs: add FMODE_DIO_PARALLEL_WRITE flag Some filesystems support multiple threads writing to the same file with O_DIRECT without requiring exclusive access to it. io_uring can use this hint to avoid serializing dio writes to this inode, instead allowing them to run in parallel. XFS and ext4 both fall into this category, so set the flag for both of them. Reviewed-by: Darrick J. Wong Signed-off-by: Jens Axboe --- include/linux/fs.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index c85916e9f7db..475d88640d3d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -168,6 +168,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, #define FMODE_NOREUSE ((__force fmode_t)0x800000) +/* File supports non-exclusive O_DIRECT writes from multiple threads */ +#define FMODE_DIO_PARALLEL_WRITE ((__force fmode_t)0x1000000) + /* File was opened by fanotify and shouldn't generate fanotify events */ #define FMODE_NONOTIFY ((__force fmode_t)0x4000000) -- cgit v1.2.3 From efba1a9e653e107577a48157b5424878c46f2285 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Thu, 23 Feb 2023 08:43:52 -0800 Subject: io_uring: Move from hlist to io_wq_work_node Having cache entries linked using the hlist format brings no benefit, and also requires an unnecessary extra pointer address per cache entry. Use the internal io_wq_work_node singly-linked list for the internal alloc caches (async_msghdr and async_poll). This is required to be able to use KASAN on cache entries, since we do not need to touch unused (and poisoned) cache entries when adding more entries to the list. 
Suggested-by: Pavel Begunkov Signed-off-by: Breno Leitao Link: https://lore.kernel.org/r/20230223164353.2839177-2-leitao@debian.org Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 +- io_uring/alloc_cache.h | 24 +++++++++++++----------- 2 files changed, 14 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 00689c12f6ab..757105ebd8af 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -188,7 +188,7 @@ struct io_ev_fd { }; struct io_alloc_cache { - struct hlist_head list; + struct io_wq_work_node list; unsigned int nr_cached; }; diff --git a/io_uring/alloc_cache.h b/io_uring/alloc_cache.h index c2cde88aeed5..aaa838c31d92 100644 --- a/io_uring/alloc_cache.h +++ b/io_uring/alloc_cache.h @@ -7,7 +7,7 @@ #define IO_ALLOC_CACHE_MAX 512 struct io_cache_entry { - struct hlist_node node; + struct io_wq_work_node node; }; static inline bool io_alloc_cache_put(struct io_alloc_cache *cache, @@ -15,7 +15,7 @@ static inline bool io_alloc_cache_put(struct io_alloc_cache *cache, { if (cache->nr_cached < IO_ALLOC_CACHE_MAX) { cache->nr_cached++; - hlist_add_head(&entry->node, &cache->list); + wq_stack_add_head(&entry->node, &cache->list); return true; } return false; @@ -23,12 +23,13 @@ static inline bool io_alloc_cache_put(struct io_alloc_cache *cache, static inline struct io_cache_entry *io_alloc_cache_get(struct io_alloc_cache *cache) { - if (!hlist_empty(&cache->list)) { - struct hlist_node *node = cache->list.first; + if (cache->list.next) { + struct io_cache_entry *entry; - hlist_del(node); + entry = container_of(cache->list.next, struct io_cache_entry, node); + cache->list.next = cache->list.next->next; cache->nr_cached--; - return container_of(node, struct io_cache_entry, node); + return entry; } return NULL; @@ -36,18 +37,19 @@ static inline struct io_cache_entry *io_alloc_cache_get(struct io_alloc_cache *c static inline void 
io_alloc_cache_init(struct io_alloc_cache *cache) { - INIT_HLIST_HEAD(&cache->list); + cache->list.next = NULL; cache->nr_cached = 0; } static inline void io_alloc_cache_free(struct io_alloc_cache *cache, void (*free)(struct io_cache_entry *)) { - while (!hlist_empty(&cache->list)) { - struct hlist_node *node = cache->list.first; + while (1) { + struct io_cache_entry *entry = io_alloc_cache_get(cache); - hlist_del(node); - free(container_of(node, struct io_cache_entry, node)); + if (!entry) + break; + free(entry); } cache->nr_cached = 0; } -- cgit v1.2.3 From e1fe7ee885dc0712e982ee465d9f8b96254c30c1 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Thu, 23 Feb 2023 08:43:53 -0800 Subject: io_uring: Add KASAN support for alloc_caches Add support for KASAN in the alloc_caches (apoll and netmsg_cache). Thus, if something touches the unused caches, it will raise a KASAN warning/exception. It poisons the object when the object is put to the cache, and unpoisons it when the object is gotten or freed. 
Signed-off-by: Breno Leitao Reviewed-by: Gabriel Krisman Bertazi Link: https://lore.kernel.org/r/20230223164353.2839177-2-leitao@debian.org Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 1 + io_uring/alloc_cache.h | 6 +++++- io_uring/io_uring.c | 4 ++-- io_uring/net.h | 5 ++++- 4 files changed, 12 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 757105ebd8af..3d152bdcd30a 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -190,6 +190,7 @@ struct io_ev_fd { struct io_alloc_cache { struct io_wq_work_node list; unsigned int nr_cached; + size_t elem_size; }; struct io_ring_ctx { diff --git a/io_uring/alloc_cache.h b/io_uring/alloc_cache.h index aaa838c31d92..2fbecaa3a1ba 100644 --- a/io_uring/alloc_cache.h +++ b/io_uring/alloc_cache.h @@ -16,6 +16,8 @@ static inline bool io_alloc_cache_put(struct io_alloc_cache *cache, if (cache->nr_cached < IO_ALLOC_CACHE_MAX) { cache->nr_cached++; wq_stack_add_head(&entry->node, &cache->list); + /* KASAN poisons object */ + kasan_slab_free_mempool(entry); return true; } return false; @@ -27,6 +29,7 @@ static inline struct io_cache_entry *io_alloc_cache_get(struct io_alloc_cache *c struct io_cache_entry *entry; entry = container_of(cache->list.next, struct io_cache_entry, node); + kasan_unpoison_range(entry, cache->elem_size); cache->list.next = cache->list.next->next; cache->nr_cached--; return entry; @@ -35,10 +38,11 @@ static inline struct io_cache_entry *io_alloc_cache_get(struct io_alloc_cache *c return NULL; } -static inline void io_alloc_cache_init(struct io_alloc_cache *cache) +static inline void io_alloc_cache_init(struct io_alloc_cache *cache, size_t size) { cache->list.next = NULL; cache->nr_cached = 0; + cache->elem_size = size; } static inline void io_alloc_cache_free(struct io_alloc_cache *cache, diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 
d72aa92ce2d6..24be4992821b 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -310,8 +310,8 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->sqd_list); INIT_LIST_HEAD(&ctx->cq_overflow_list); INIT_LIST_HEAD(&ctx->io_buffers_cache); - io_alloc_cache_init(&ctx->apoll_cache); - io_alloc_cache_init(&ctx->netmsg_cache); + io_alloc_cache_init(&ctx->apoll_cache, sizeof(struct async_poll)); + io_alloc_cache_init(&ctx->netmsg_cache, sizeof(struct io_async_msghdr)); init_completion(&ctx->ref_comp); xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1); mutex_init(&ctx->uring_lock); diff --git a/io_uring/net.h b/io_uring/net.h index 5ffa11bf5d2e..191009979bcb 100644 --- a/io_uring/net.h +++ b/io_uring/net.h @@ -5,8 +5,8 @@ #include "alloc_cache.h" -#if defined(CONFIG_NET) struct io_async_msghdr { +#if defined(CONFIG_NET) union { struct iovec fast_iov[UIO_FASTIOV]; struct { @@ -22,8 +22,11 @@ struct io_async_msghdr { struct sockaddr __user *uaddr; struct msghdr msg; struct sockaddr_storage addr; +#endif }; +#if defined(CONFIG_NET) + struct io_async_connect { struct sockaddr_storage address; }; -- cgit v1.2.3 From a282967c848fb1d92c28334430c472da9c334e54 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 27 Mar 2023 16:38:15 +0100 Subject: io_uring: encapsulate task_work state For task works we're passing around a bool pointer for whether the current ring is locked or not, let's wrap it in a structure, that will make it more opaque preventing abuse and will also help us to pass more info in the future if needed. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/1ecec9483d58696e248d1bfd52cf62b04442df1d.1679931367.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 7 ++++- io_uring/io_uring.c | 71 +++++++++++++++++++++--------------------- io_uring/io_uring.h | 14 ++++----- io_uring/notif.c | 4 +-- io_uring/poll.c | 32 +++++++++---------- io_uring/rw.c | 6 ++-- io_uring/timeout.c | 14 ++++----- io_uring/uring_cmd.c | 4 +-- 8 files changed, 79 insertions(+), 73 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 3d152bdcd30a..561fa421c453 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -367,6 +367,11 @@ struct io_ring_ctx { unsigned evfd_last_cq_tail; }; +struct io_tw_state { + /* ->uring_lock is taken, callbacks can use io_tw_lock to lock it */ + bool locked; +}; + enum { REQ_F_FIXED_FILE_BIT = IOSQE_FIXED_FILE_BIT, REQ_F_IO_DRAIN_BIT = IOSQE_IO_DRAIN_BIT, @@ -473,7 +478,7 @@ enum { REQ_F_HASH_LOCKED = BIT(REQ_F_HASH_LOCKED_BIT), }; -typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked); +typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts); struct io_task_work { struct llist_node node; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 2669aca0ba39..536940675c67 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -247,12 +247,12 @@ static __cold void io_fallback_req_func(struct work_struct *work) fallback_work.work); struct llist_node *node = llist_del_all(&ctx->fallback_llist); struct io_kiocb *req, *tmp; - bool locked = true; + struct io_tw_state ts = { .locked = true, }; mutex_lock(&ctx->uring_lock); llist_for_each_entry_safe(req, tmp, node, io_task_work.node) - req->io_task_work.func(req, &locked); - if (WARN_ON_ONCE(!locked)) + req->io_task_work.func(req, &ts); + if (WARN_ON_ONCE(!ts.locked)) return; io_submit_flush_completions(ctx); 
mutex_unlock(&ctx->uring_lock); @@ -457,7 +457,7 @@ static void io_prep_async_link(struct io_kiocb *req) } } -void io_queue_iowq(struct io_kiocb *req, bool *dont_use) +void io_queue_iowq(struct io_kiocb *req, struct io_tw_state *ts_dont_use) { struct io_kiocb *link = io_prep_linked_timeout(req); struct io_uring_task *tctx = req->task->io_uring; @@ -1153,22 +1153,23 @@ static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req) return nxt; } -static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked) +static void ctx_flush_and_put(struct io_ring_ctx *ctx, struct io_tw_state *ts) { if (!ctx) return; if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); - if (*locked) { + if (ts->locked) { io_submit_flush_completions(ctx); mutex_unlock(&ctx->uring_lock); - *locked = false; + ts->locked = false; } percpu_ref_put(&ctx->refs); } static unsigned int handle_tw_list(struct llist_node *node, - struct io_ring_ctx **ctx, bool *locked, + struct io_ring_ctx **ctx, + struct io_tw_state *ts, struct llist_node *last) { unsigned int count = 0; @@ -1181,17 +1182,17 @@ static unsigned int handle_tw_list(struct llist_node *node, prefetch(container_of(next, struct io_kiocb, io_task_work.node)); if (req->ctx != *ctx) { - ctx_flush_and_put(*ctx, locked); + ctx_flush_and_put(*ctx, ts); *ctx = req->ctx; /* if not contended, grab and improve batching */ - *locked = mutex_trylock(&(*ctx)->uring_lock); + ts->locked = mutex_trylock(&(*ctx)->uring_lock); percpu_ref_get(&(*ctx)->refs); } - req->io_task_work.func(req, locked); + req->io_task_work.func(req, ts); node = next; count++; if (unlikely(need_resched())) { - ctx_flush_and_put(*ctx, locked); + ctx_flush_and_put(*ctx, ts); *ctx = NULL; cond_resched(); } @@ -1232,7 +1233,7 @@ static inline struct llist_node *io_llist_cmpxchg(struct llist_head *head, void tctx_task_work(struct callback_head *cb) { - bool uring_locked = false; + struct io_tw_state ts = {}; struct io_ring_ctx 
*ctx = NULL; struct io_uring_task *tctx = container_of(cb, struct io_uring_task, task_work); @@ -1249,12 +1250,12 @@ void tctx_task_work(struct callback_head *cb) do { loops++; node = io_llist_xchg(&tctx->task_list, &fake); - count += handle_tw_list(node, &ctx, &uring_locked, &fake); + count += handle_tw_list(node, &ctx, &ts, &fake); /* skip expensive cmpxchg if there are items in the list */ if (READ_ONCE(tctx->task_list.first) != &fake) continue; - if (uring_locked && !wq_list_empty(&ctx->submit_state.compl_reqs)) { + if (ts.locked && !wq_list_empty(&ctx->submit_state.compl_reqs)) { io_submit_flush_completions(ctx); if (READ_ONCE(tctx->task_list.first) != &fake) continue; @@ -1262,7 +1263,7 @@ void tctx_task_work(struct callback_head *cb) node = io_llist_cmpxchg(&tctx->task_list, &fake, NULL); } while (node != &fake); - ctx_flush_and_put(ctx, &uring_locked); + ctx_flush_and_put(ctx, &ts); /* relaxed read is enough as only the task itself sets ->in_cancel */ if (unlikely(atomic_read(&tctx->in_cancel))) @@ -1351,7 +1352,7 @@ static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx) } } -static int __io_run_local_work(struct io_ring_ctx *ctx, bool *locked) +static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts) { struct llist_node *node; unsigned int loops = 0; @@ -1368,7 +1369,7 @@ again: struct io_kiocb *req = container_of(node, struct io_kiocb, io_task_work.node); prefetch(container_of(next, struct io_kiocb, io_task_work.node)); - req->io_task_work.func(req, locked); + req->io_task_work.func(req, ts); ret++; node = next; } @@ -1376,7 +1377,7 @@ again: if (!llist_empty(&ctx->work_llist)) goto again; - if (*locked) { + if (ts->locked) { io_submit_flush_completions(ctx); if (!llist_empty(&ctx->work_llist)) goto again; @@ -1387,46 +1388,46 @@ again: static inline int io_run_local_work_locked(struct io_ring_ctx *ctx) { - bool locked; + struct io_tw_state ts = { .locked = true, }; int ret; if (llist_empty(&ctx->work_llist)) 
return 0; - locked = true; - ret = __io_run_local_work(ctx, &locked); + ret = __io_run_local_work(ctx, &ts); /* shouldn't happen! */ - if (WARN_ON_ONCE(!locked)) + if (WARN_ON_ONCE(!ts.locked)) mutex_lock(&ctx->uring_lock); return ret; } static int io_run_local_work(struct io_ring_ctx *ctx) { - bool locked = mutex_trylock(&ctx->uring_lock); + struct io_tw_state ts = {}; int ret; - ret = __io_run_local_work(ctx, &locked); - if (locked) + ts.locked = mutex_trylock(&ctx->uring_lock); + ret = __io_run_local_work(ctx, &ts); + if (ts.locked) mutex_unlock(&ctx->uring_lock); return ret; } -static void io_req_task_cancel(struct io_kiocb *req, bool *locked) +static void io_req_task_cancel(struct io_kiocb *req, struct io_tw_state *ts) { - io_tw_lock(req->ctx, locked); + io_tw_lock(req->ctx, ts); io_req_defer_failed(req, req->cqe.res); } -void io_req_task_submit(struct io_kiocb *req, bool *locked) +void io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts) { - io_tw_lock(req->ctx, locked); + io_tw_lock(req->ctx, ts); /* req->task == current here, checking PF_EXITING is safe */ if (unlikely(req->task->flags & PF_EXITING)) io_req_defer_failed(req, -EFAULT); else if (req->flags & REQ_F_FORCE_ASYNC) - io_queue_iowq(req, locked); + io_queue_iowq(req, ts); else io_queue_sqe(req); } @@ -1652,9 +1653,9 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) return ret; } -void io_req_task_complete(struct io_kiocb *req, bool *locked) +void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts) { - if (*locked) + if (ts->locked) io_req_complete_defer(req); else io_req_complete_post(req, IO_URING_F_UNLOCKED); @@ -1933,9 +1934,9 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) return 0; } -int io_poll_issue(struct io_kiocb *req, bool *locked) +int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts) { - io_tw_lock(req->ctx, locked); + io_tw_lock(req->ctx, ts); return io_issue_sqe(req, 
IO_URING_F_NONBLOCK|IO_URING_F_MULTISHOT| IO_URING_F_COMPLETE_DEFER); } diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 2711865f1e19..c33f719731ac 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -52,16 +52,16 @@ void __io_req_task_work_add(struct io_kiocb *req, bool allow_local); bool io_is_uring_fops(struct file *file); bool io_alloc_async_data(struct io_kiocb *req); void io_req_task_queue(struct io_kiocb *req); -void io_queue_iowq(struct io_kiocb *req, bool *dont_use); -void io_req_task_complete(struct io_kiocb *req, bool *locked); +void io_queue_iowq(struct io_kiocb *req, struct io_tw_state *ts_dont_use); +void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts); void io_req_task_queue_fail(struct io_kiocb *req, int ret); -void io_req_task_submit(struct io_kiocb *req, bool *locked); +void io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts); void tctx_task_work(struct callback_head *cb); __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); int io_uring_alloc_task_context(struct task_struct *task, struct io_ring_ctx *ctx); -int io_poll_issue(struct io_kiocb *req, bool *locked); +int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts); int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr); int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin); void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node); @@ -299,11 +299,11 @@ static inline bool io_task_work_pending(struct io_ring_ctx *ctx) return task_work_pending(current) || !wq_list_empty(&ctx->work_llist); } -static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked) +static inline void io_tw_lock(struct io_ring_ctx *ctx, struct io_tw_state *ts) { - if (!*locked) { + if (!ts->locked) { mutex_lock(&ctx->uring_lock); - *locked = true; + ts->locked = true; } } diff --git a/io_uring/notif.c b/io_uring/notif.c index 09dfd0832d19..172105eb347d 100644 --- a/io_uring/notif.c +++ 
b/io_uring/notif.c @@ -9,7 +9,7 @@ #include "notif.h" #include "rsrc.h" -static void io_notif_complete_tw_ext(struct io_kiocb *notif, bool *locked) +static void io_notif_complete_tw_ext(struct io_kiocb *notif, struct io_tw_state *ts) { struct io_notif_data *nd = io_notif_to_data(notif); struct io_ring_ctx *ctx = notif->ctx; @@ -21,7 +21,7 @@ static void io_notif_complete_tw_ext(struct io_kiocb *notif, bool *locked) __io_unaccount_mem(ctx->user, nd->account_pages); nd->account_pages = 0; } - io_req_task_complete(notif, locked); + io_req_task_complete(notif, ts); } static void io_tx_ubuf_callback(struct sk_buff *skb, struct ubuf_info *uarg, diff --git a/io_uring/poll.c b/io_uring/poll.c index 55306e801081..c90e47dc1e29 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -148,7 +148,7 @@ static void io_poll_req_insert_locked(struct io_kiocb *req) hlist_add_head(&req->hash_node, &table->hbs[index].list); } -static void io_poll_tw_hash_eject(struct io_kiocb *req, bool *locked) +static void io_poll_tw_hash_eject(struct io_kiocb *req, struct io_tw_state *ts) { struct io_ring_ctx *ctx = req->ctx; @@ -159,7 +159,7 @@ static void io_poll_tw_hash_eject(struct io_kiocb *req, bool *locked) * already grabbed the mutex for us, but there is a chance it * failed. */ - io_tw_lock(ctx, locked); + io_tw_lock(ctx, ts); hash_del(&req->hash_node); req->flags &= ~REQ_F_HASH_LOCKED; } else { @@ -238,7 +238,7 @@ enum { * req->cqe.res. IOU_POLL_REMOVE_POLL_USE_RES indicates to remove multishot * poll and that the result is stored in req->cqe. 
*/ -static int io_poll_check_events(struct io_kiocb *req, bool *locked) +static int io_poll_check_events(struct io_kiocb *req, struct io_tw_state *ts) { int v; @@ -300,13 +300,13 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked) __poll_t mask = mangle_poll(req->cqe.res & req->apoll_events); - if (!io_aux_cqe(req->ctx, *locked, req->cqe.user_data, + if (!io_aux_cqe(req->ctx, ts->locked, req->cqe.user_data, mask, IORING_CQE_F_MORE, false)) { io_req_set_res(req, mask, 0); return IOU_POLL_REMOVE_POLL_USE_RES; } } else { - int ret = io_poll_issue(req, locked); + int ret = io_poll_issue(req, ts); if (ret == IOU_STOP_MULTISHOT) return IOU_POLL_REMOVE_POLL_USE_RES; if (ret < 0) @@ -326,15 +326,15 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked) return IOU_POLL_NO_ACTION; } -static void io_poll_task_func(struct io_kiocb *req, bool *locked) +static void io_poll_task_func(struct io_kiocb *req, struct io_tw_state *ts) { int ret; - ret = io_poll_check_events(req, locked); + ret = io_poll_check_events(req, ts); if (ret == IOU_POLL_NO_ACTION) return; io_poll_remove_entries(req); - io_poll_tw_hash_eject(req, locked); + io_poll_tw_hash_eject(req, ts); if (req->opcode == IORING_OP_POLL_ADD) { if (ret == IOU_POLL_DONE) { @@ -343,7 +343,7 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked) poll = io_kiocb_to_cmd(req, struct io_poll); req->cqe.res = mangle_poll(req->cqe.res & poll->events); } else if (ret == IOU_POLL_REISSUE) { - io_req_task_submit(req, locked); + io_req_task_submit(req, ts); return; } else if (ret != IOU_POLL_REMOVE_POLL_USE_RES) { req->cqe.res = ret; @@ -351,14 +351,14 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked) } io_req_set_res(req, req->cqe.res, 0); - io_req_task_complete(req, locked); + io_req_task_complete(req, ts); } else { - io_tw_lock(req->ctx, locked); + io_tw_lock(req->ctx, ts); if (ret == IOU_POLL_REMOVE_POLL_USE_RES) - io_req_task_complete(req, locked); + 
io_req_task_complete(req, ts); else if (ret == IOU_POLL_DONE || ret == IOU_POLL_REISSUE) - io_req_task_submit(req, locked); + io_req_task_submit(req, ts); else io_req_defer_failed(req, ret); } @@ -977,7 +977,7 @@ int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) struct io_hash_bucket *bucket; struct io_kiocb *preq; int ret2, ret = 0; - bool locked; + struct io_tw_state ts = {}; preq = io_poll_find(ctx, true, &cd, &ctx->cancel_table, &bucket); ret2 = io_poll_disarm(preq); @@ -1027,8 +1027,8 @@ found: req_set_fail(preq); io_req_set_res(preq, -ECANCELED, 0); - locked = !(issue_flags & IO_URING_F_UNLOCKED); - io_req_task_complete(preq, &locked); + ts.locked = !(issue_flags & IO_URING_F_UNLOCKED); + io_req_task_complete(preq, &ts); out: if (ret < 0) { req_set_fail(req); diff --git a/io_uring/rw.c b/io_uring/rw.c index 4c233910e200..f14868624f41 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -283,16 +283,16 @@ static inline int io_fixup_rw_res(struct io_kiocb *req, long res) return res; } -static void io_req_rw_complete(struct io_kiocb *req, bool *locked) +static void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts) { io_req_io_end(req); if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING)) { - unsigned issue_flags = *locked ? 0 : IO_URING_F_UNLOCKED; + unsigned issue_flags = ts->locked ? 
0 : IO_URING_F_UNLOCKED; req->cqe.flags |= io_put_kbuf(req, issue_flags); } - io_req_task_complete(req, locked); + io_req_task_complete(req, ts); } static void io_complete_rw(struct kiocb *kiocb, long res) diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 826a51bca3e4..5c6c6f720809 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -101,9 +101,9 @@ __cold void io_flush_timeouts(struct io_ring_ctx *ctx) spin_unlock_irq(&ctx->timeout_lock); } -static void io_req_tw_fail_links(struct io_kiocb *link, bool *locked) +static void io_req_tw_fail_links(struct io_kiocb *link, struct io_tw_state *ts) { - io_tw_lock(link->ctx, locked); + io_tw_lock(link->ctx, ts); while (link) { struct io_kiocb *nxt = link->link; long res = -ECANCELED; @@ -112,7 +112,7 @@ static void io_req_tw_fail_links(struct io_kiocb *link, bool *locked) res = link->cqe.res; link->link = NULL; io_req_set_res(link, res, 0); - io_req_task_complete(link, locked); + io_req_task_complete(link, ts); link = nxt; } } @@ -265,9 +265,9 @@ int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) return 0; } -static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked) +static void io_req_task_link_timeout(struct io_kiocb *req, struct io_tw_state *ts) { - unsigned issue_flags = *locked ? 0 : IO_URING_F_UNLOCKED; + unsigned issue_flags = ts->locked ? 
0 : IO_URING_F_UNLOCKED; struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); struct io_kiocb *prev = timeout->prev; int ret = -ENOENT; @@ -282,11 +282,11 @@ static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked) ret = io_try_cancel(req->task->io_uring, &cd, issue_flags); } io_req_set_res(req, ret ?: -ETIME, 0); - io_req_task_complete(req, locked); + io_req_task_complete(req, ts); io_put_req(prev); } else { io_req_set_res(req, -ETIME, 0); - io_req_task_complete(req, locked); + io_req_task_complete(req, ts); } } diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 9a1dee571872..3d825d939b13 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -12,10 +12,10 @@ #include "rsrc.h" #include "uring_cmd.h" -static void io_uring_cmd_work(struct io_kiocb *req, bool *locked) +static void io_uring_cmd_work(struct io_kiocb *req, struct io_tw_state *ts) { struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); - unsigned issue_flags = *locked ? 0 : IO_URING_F_UNLOCKED; + unsigned issue_flags = ts->locked ? 0 : IO_URING_F_UNLOCKED; ioucmd->task_work_cb(ioucmd, issue_flags); } -- cgit v1.2.3 From 8e15c0e71b8ae64fb7163532860f8d608165281f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 4 Apr 2023 13:39:46 +0100 Subject: io_uring/rsrc: keep cached refs per node We cache refs of the current node (i.e. ctx->rsrc_node) in ctx->rsrc_cached_refs. We'll be moving away from atomics, so move the cached refs into struct io_rsrc_node for now. It's a prep patch and shouldn't change anything in practice. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/9edc3669c1d71b06c2dca78b2b2b8bb9292738b9.1680576071.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 1 - io_uring/rsrc.c | 15 +++++++++------ io_uring/rsrc.h | 16 +++++++++------- 3 files changed, 18 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 561fa421c453..a0a5b5964d3a 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -240,7 +240,6 @@ struct io_ring_ctx { * uring_lock, and updated through io_uring_register(2) */ struct io_rsrc_node *rsrc_node; - int rsrc_cached_refs; atomic_t cancel_seq; struct io_file_table file_table; unsigned nr_user_files; diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index f2da9e251e3f..1e7c960737fd 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -36,9 +36,11 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, void io_rsrc_refs_drop(struct io_ring_ctx *ctx) __must_hold(&ctx->uring_lock) { - if (ctx->rsrc_cached_refs) { - io_rsrc_put_node(ctx->rsrc_node, ctx->rsrc_cached_refs); - ctx->rsrc_cached_refs = 0; + struct io_rsrc_node *node = ctx->rsrc_node; + + if (node && node->cached_refs) { + io_rsrc_put_node(node, node->cached_refs); + node->cached_refs = 0; } } @@ -151,11 +153,11 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slo *slot = NULL; } -void io_rsrc_refs_refill(struct io_ring_ctx *ctx) +void io_rsrc_refs_refill(struct io_ring_ctx *ctx, struct io_rsrc_node *node) __must_hold(&ctx->uring_lock) { - ctx->rsrc_cached_refs += IO_RSRC_REF_BATCH; - refcount_add(IO_RSRC_REF_BATCH, &ctx->rsrc_node->refs); + node->cached_refs += IO_RSRC_REF_BATCH; + refcount_add(IO_RSRC_REF_BATCH, &node->refs); } static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) @@ -300,6 +302,7 @@ void io_rsrc_node_switch(struct io_ring_ctx *ctx, if (!ctx->rsrc_node) 
{ ctx->rsrc_node = ctx->rsrc_backup_node; ctx->rsrc_backup_node = NULL; + ctx->rsrc_node->cached_refs = 0; } } diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 1467b31843bc..950535e2b9f4 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -43,6 +43,7 @@ struct io_rsrc_node { struct io_rsrc_data *rsrc_data; struct llist_node llist; bool done; + int cached_refs; }; struct io_mapped_ubuf { @@ -56,7 +57,7 @@ struct io_mapped_ubuf { void io_rsrc_put_tw(struct callback_head *cb); void io_rsrc_node_ref_zero(struct io_rsrc_node *node); void io_rsrc_put_work(struct work_struct *work); -void io_rsrc_refs_refill(struct io_ring_ctx *ctx); +void io_rsrc_refs_refill(struct io_ring_ctx *ctx, struct io_rsrc_node *node); void io_wait_rsrc_data(struct io_rsrc_data *data); void io_rsrc_node_destroy(struct io_rsrc_node *ref_node); void io_rsrc_refs_drop(struct io_ring_ctx *ctx); @@ -128,17 +129,18 @@ static inline void io_req_put_rsrc_locked(struct io_kiocb *req, if (node) { if (node == ctx->rsrc_node) - ctx->rsrc_cached_refs++; + node->cached_refs++; else io_rsrc_put_node(node, 1); } } -static inline void io_charge_rsrc_node(struct io_ring_ctx *ctx) +static inline void io_charge_rsrc_node(struct io_ring_ctx *ctx, + struct io_rsrc_node *node) { - ctx->rsrc_cached_refs--; - if (unlikely(ctx->rsrc_cached_refs < 0)) - io_rsrc_refs_refill(ctx); + node->cached_refs--; + if (unlikely(node->cached_refs < 0)) + io_rsrc_refs_refill(ctx, node); } static inline void io_req_set_rsrc_node(struct io_kiocb *req, @@ -151,7 +153,7 @@ static inline void io_req_set_rsrc_node(struct io_kiocb *req, lockdep_assert_held(&ctx->uring_lock); req->rsrc_node = ctx->rsrc_node; - io_charge_rsrc_node(ctx); + io_charge_rsrc_node(ctx, ctx->rsrc_node); io_ring_submit_unlock(ctx, issue_flags); } } -- cgit v1.2.3 From 0a4813b1abdf06e44ce60cdebfd374cfd27c46bf Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 4 Apr 2023 13:39:50 +0100 Subject: io_uring/rsrc: kill rsrc_ref_lock We use ->rsrc_ref_lock spinlock 
to protect ->rsrc_ref_list in io_rsrc_node_ref_zero(). Now we removed pcpu refcounting, which means io_rsrc_node_ref_zero() is not executed from the irq context as an RCU callback anymore, and we also put it under ->uring_lock. io_rsrc_node_switch(), which queues up nodes into the list, is also protected by ->uring_lock, so we can safely get rid of ->rsrc_ref_lock. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/6b60af883c263551190b526a55ff2c9d5ae07141.1680576071.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 +- io_uring/io_uring.c | 1 - io_uring/rsrc.c | 5 ----- 3 files changed, 1 insertion(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index a0a5b5964d3a..9492889f00c0 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -333,8 +333,8 @@ struct io_ring_ctx { struct delayed_work rsrc_put_work; struct callback_head rsrc_put_tw; struct llist_head rsrc_put_llist; + /* protected by ->uring_lock */ struct list_head rsrc_ref_list; - spinlock_t rsrc_ref_lock; struct list_head io_buffers_pages; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 36a76c7b34f0..764df5694d73 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -325,7 +325,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->defer_list); INIT_LIST_HEAD(&ctx->timeout_list); INIT_LIST_HEAD(&ctx->ltimeout_list); - spin_lock_init(&ctx->rsrc_ref_lock); INIT_LIST_HEAD(&ctx->rsrc_ref_list); INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work); init_task_work(&ctx->rsrc_put_tw, io_rsrc_put_tw); diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 1237fc77c250..e122b6e5f9c5 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -209,11 +209,9 @@ void io_rsrc_node_ref_zero(struct io_rsrc_node *node) __must_hold(&node->rsrc_data->ctx->uring_lock) { struct io_ring_ctx *ctx = 
node->rsrc_data->ctx; - unsigned long flags; bool first_add = false; unsigned long delay = HZ; - spin_lock_irqsave(&ctx->rsrc_ref_lock, flags); node->done = true; /* if we are mid-quiesce then do not delay */ @@ -229,7 +227,6 @@ void io_rsrc_node_ref_zero(struct io_rsrc_node *node) list_del(&node->node); first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist); } - spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags); if (!first_add) return; @@ -268,9 +265,7 @@ void io_rsrc_node_switch(struct io_ring_ctx *ctx, struct io_rsrc_node *rsrc_node = ctx->rsrc_node; rsrc_node->rsrc_data = data_to_kill; - spin_lock_irq(&ctx->rsrc_ref_lock); list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list); - spin_unlock_irq(&ctx->rsrc_ref_lock); atomic_inc(&data_to_kill->refs); /* put master ref */ -- cgit v1.2.3 From 36b9818a5a84cb7c977fb723babca1c8d74f288f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 4 Apr 2023 13:39:53 +0100 Subject: io_uring/rsrc: don't offload node free struct delayed_work rsrc_put_work was previously used to offload node freeing because io_rsrc_node_ref_zero() was previously called by RCU in the IRQ context. Now, as percpu refcounting is gone, we can do it eagerly at the spot without pushing it to a worker. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/13fb1aac1e8d068ad8fd4a0c6d0d157ab61b90c0.1680576071.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 3 --- io_uring/io_uring.c | 6 ----- io_uring/rsrc.c | 59 +++--------------------------------------- 3 files changed, 4 insertions(+), 64 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 9492889f00c0..47496059e13a 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -330,9 +330,6 @@ struct io_ring_ctx { struct io_rsrc_data *file_data; struct io_rsrc_data *buf_data; - struct delayed_work rsrc_put_work; - struct callback_head rsrc_put_tw; - struct llist_head rsrc_put_llist; /* protected by ->uring_lock */ struct list_head rsrc_ref_list; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 764df5694d73..d6a0025afc31 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -326,9 +326,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->timeout_list); INIT_LIST_HEAD(&ctx->ltimeout_list); INIT_LIST_HEAD(&ctx->rsrc_ref_list); - INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work); - init_task_work(&ctx->rsrc_put_tw, io_rsrc_put_tw); - init_llist_head(&ctx->rsrc_put_llist); init_llist_head(&ctx->work_llist); INIT_LIST_HEAD(&ctx->tctx_list); ctx->submit_state.free_list.next = NULL; @@ -2821,11 +2818,8 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) io_rsrc_node_destroy(ctx->rsrc_node); if (ctx->rsrc_backup_node) io_rsrc_node_destroy(ctx->rsrc_backup_node); - flush_delayed_work(&ctx->rsrc_put_work); - flush_delayed_work(&ctx->fallback_work); WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list)); - WARN_ON_ONCE(!llist_empty(&ctx->rsrc_put_llist)); #if defined(CONFIG_UNIX) if (ctx->ring_sock) { diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 95e71300bb35..0f4e245dee1b 100644 --- 
a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -145,15 +145,8 @@ static void io_rsrc_put_work_one(struct io_rsrc_data *rsrc_data, { struct io_ring_ctx *ctx = rsrc_data->ctx; - if (prsrc->tag) { - if (ctx->flags & IORING_SETUP_IOPOLL) { - mutex_lock(&ctx->uring_lock); - io_post_aux_cqe(ctx, prsrc->tag, 0, 0); - mutex_unlock(&ctx->uring_lock); - } else { - io_post_aux_cqe(ctx, prsrc->tag, 0, 0); - } - } + if (prsrc->tag) + io_post_aux_cqe(ctx, prsrc->tag, 0, 0); rsrc_data->do_put(ctx, prsrc); } @@ -176,32 +169,6 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) complete(&rsrc_data->done); } -void io_rsrc_put_work(struct work_struct *work) -{ - struct io_ring_ctx *ctx; - struct llist_node *node; - - ctx = container_of(work, struct io_ring_ctx, rsrc_put_work.work); - node = llist_del_all(&ctx->rsrc_put_llist); - - while (node) { - struct io_rsrc_node *ref_node; - struct llist_node *next = node->next; - - ref_node = llist_entry(node, struct io_rsrc_node, llist); - __io_rsrc_put_work(ref_node); - node = next; - } -} - -void io_rsrc_put_tw(struct callback_head *cb) -{ - struct io_ring_ctx *ctx = container_of(cb, struct io_ring_ctx, - rsrc_put_tw); - - io_rsrc_put_work(&ctx->rsrc_put_work.work); -} - void io_wait_rsrc_data(struct io_rsrc_data *data) { if (data && !atomic_dec_and_test(&data->refs)) @@ -217,34 +184,18 @@ void io_rsrc_node_ref_zero(struct io_rsrc_node *node) __must_hold(&node->rsrc_data->ctx->uring_lock) { struct io_ring_ctx *ctx = node->rsrc_data->ctx; - bool first_add = false; - unsigned long delay = HZ; node->done = true; - - /* if we are mid-quiesce then do not delay */ - if (node->rsrc_data->quiesce) - delay = 0; - while (!list_empty(&ctx->rsrc_ref_list)) { node = list_first_entry(&ctx->rsrc_ref_list, struct io_rsrc_node, node); /* recycle ref nodes in order */ if (!node->done) break; - list_del(&node->node); - first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist); - } - if (!first_add) - return; - - if (ctx->submitter_task) { - if 
(!task_work_add(ctx->submitter_task, &ctx->rsrc_put_tw, - ctx->notify_method)) - return; + list_del(&node->node); + __io_rsrc_put_work(node); } - mod_delayed_work(system_wq, &ctx->rsrc_put_work, delay); } static struct io_rsrc_node *io_rsrc_node_alloc(void) @@ -320,13 +271,11 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, if (ret < 0) { atomic_inc(&data->refs); /* wait for all works potentially completing data->done */ - flush_delayed_work(&ctx->rsrc_put_work); reinit_completion(&data->done); mutex_lock(&ctx->uring_lock); break; } - flush_delayed_work(&ctx->rsrc_put_work); ret = wait_for_completion_interruptible(&data->done); if (!ret) { mutex_lock(&ctx->uring_lock); -- cgit v1.2.3 From 9eae8655f9cd2eeed99fb7a0d2bb22816c17e497 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 4 Apr 2023 13:39:54 +0100 Subject: io_uring/rsrc: cache struct io_rsrc_node Add allocation cache for struct io_rsrc_node, it's always allocated and put under ->uring_lock, so it doesn't need any extra synchronisation around caches. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/252a9d9ef9654e6467af30fdc02f57c0118fb76e.1680576071.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 1 + io_uring/io_uring.c | 11 +++++++++-- io_uring/rsrc.c | 23 +++++++++++++++-------- io_uring/rsrc.h | 9 +++++++-- 4 files changed, 32 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 47496059e13a..5d772e36e7fc 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -332,6 +332,7 @@ struct io_ring_ctx { /* protected by ->uring_lock */ struct list_head rsrc_ref_list; + struct io_alloc_cache rsrc_node_cache; struct list_head io_buffers_pages; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index d6a0025afc31..419d6f42935f 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -310,6 +310,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->sqd_list); INIT_LIST_HEAD(&ctx->cq_overflow_list); INIT_LIST_HEAD(&ctx->io_buffers_cache); + io_alloc_cache_init(&ctx->rsrc_node_cache, sizeof(struct io_rsrc_node)); io_alloc_cache_init(&ctx->apoll_cache, sizeof(struct async_poll)); io_alloc_cache_init(&ctx->netmsg_cache, sizeof(struct io_async_msghdr)); init_completion(&ctx->ref_comp); @@ -2790,6 +2791,11 @@ static void io_req_caches_free(struct io_ring_ctx *ctx) mutex_unlock(&ctx->uring_lock); } +static void io_rsrc_node_cache_free(struct io_cache_entry *entry) +{ + kfree(container_of(entry, struct io_rsrc_node, cache)); +} + static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) { io_sq_thread_finish(ctx); @@ -2815,9 +2821,9 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) /* there are no registered resources left, nobody uses it */ if (ctx->rsrc_node) - io_rsrc_node_destroy(ctx->rsrc_node); + io_rsrc_node_destroy(ctx, ctx->rsrc_node); if (ctx->rsrc_backup_node) - 
io_rsrc_node_destroy(ctx->rsrc_backup_node); + io_rsrc_node_destroy(ctx, ctx->rsrc_backup_node); WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list)); @@ -2829,6 +2835,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) #endif WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list)); + io_alloc_cache_free(&ctx->rsrc_node_cache, io_rsrc_node_cache_free); if (ctx->mm_account) { mmdrop(ctx->mm_account); ctx->mm_account = NULL; diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 0f4e245dee1b..345631091d80 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -164,7 +164,7 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) kfree(prsrc); } - io_rsrc_node_destroy(ref_node); + io_rsrc_node_destroy(rsrc_data->ctx, ref_node); if (atomic_dec_and_test(&rsrc_data->refs)) complete(&rsrc_data->done); } @@ -175,9 +175,10 @@ void io_wait_rsrc_data(struct io_rsrc_data *data) wait_for_completion(&data->done); } -void io_rsrc_node_destroy(struct io_rsrc_node *ref_node) +void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *node) { - kfree(ref_node); + if (!io_alloc_cache_put(&ctx->rsrc_node_cache, &node->cache)) + kfree(node); } void io_rsrc_node_ref_zero(struct io_rsrc_node *node) @@ -198,13 +199,19 @@ void io_rsrc_node_ref_zero(struct io_rsrc_node *node) } } -static struct io_rsrc_node *io_rsrc_node_alloc(void) +static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx) { struct io_rsrc_node *ref_node; + struct io_cache_entry *entry; - ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL); - if (!ref_node) - return NULL; + entry = io_alloc_cache_get(&ctx->rsrc_node_cache); + if (entry) { + ref_node = container_of(entry, struct io_rsrc_node, cache); + } else { + ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL); + if (!ref_node) + return NULL; + } ref_node->refs = 1; INIT_LIST_HEAD(&ref_node->node); @@ -243,7 +250,7 @@ int io_rsrc_node_switch_start(struct io_ring_ctx *ctx) { if (ctx->rsrc_backup_node) return 0; - ctx->rsrc_backup_node = 
io_rsrc_node_alloc(); + ctx->rsrc_backup_node = io_rsrc_node_alloc(ctx); return ctx->rsrc_backup_node ? 0 : -ENOMEM; } diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 11703082d125..3b9f4c57c47c 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -4,6 +4,8 @@ #include +#include "alloc_cache.h" + #define IO_RSRC_TAG_TABLE_SHIFT (PAGE_SHIFT - 3) #define IO_RSRC_TAG_TABLE_MAX (1U << IO_RSRC_TAG_TABLE_SHIFT) #define IO_RSRC_TAG_TABLE_MASK (IO_RSRC_TAG_TABLE_MAX - 1) @@ -37,8 +39,11 @@ struct io_rsrc_data { }; struct io_rsrc_node { + union { + struct io_cache_entry cache; + struct io_rsrc_data *rsrc_data; + }; struct list_head node; - struct io_rsrc_data *rsrc_data; struct llist_node llist; int refs; bool done; @@ -65,7 +70,7 @@ void io_rsrc_put_tw(struct callback_head *cb); void io_rsrc_node_ref_zero(struct io_rsrc_node *node); void io_rsrc_put_work(struct work_struct *work); void io_wait_rsrc_data(struct io_rsrc_data *data); -void io_rsrc_node_destroy(struct io_rsrc_node *ref_node); +void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *ref_node); int io_rsrc_node_switch_start(struct io_ring_ctx *ctx); int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, struct io_rsrc_node *node, void *rsrc); -- cgit v1.2.3 From 69bbc6ade9d9d4e3c556cb83e77b6f3cd9ad3d18 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 4 Apr 2023 13:39:57 +0100 Subject: io_uring/rsrc: add custom limit for node caching The number of entries in the rsrc node cache is limited to 512, which still seems unnecessarily large. Add per cache thresholds and set it to 32 for the rsrc node cache.
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/d0cd538b944dac0bf878e276fc0199f21e6bccea.1680576071.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 1 + io_uring/alloc_cache.h | 6 ++++-- io_uring/io_uring.c | 9 ++++++--- io_uring/rsrc.h | 2 ++ 4 files changed, 13 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 5d772e36e7fc..4a6ce03a4903 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -190,6 +190,7 @@ struct io_ev_fd { struct io_alloc_cache { struct io_wq_work_node list; unsigned int nr_cached; + unsigned int max_cached; size_t elem_size; }; diff --git a/io_uring/alloc_cache.h b/io_uring/alloc_cache.h index 2fbecaa3a1ba..851a527afb5e 100644 --- a/io_uring/alloc_cache.h +++ b/io_uring/alloc_cache.h @@ -13,7 +13,7 @@ struct io_cache_entry { static inline bool io_alloc_cache_put(struct io_alloc_cache *cache, struct io_cache_entry *entry) { - if (cache->nr_cached < IO_ALLOC_CACHE_MAX) { + if (cache->nr_cached < cache->max_cached) { cache->nr_cached++; wq_stack_add_head(&entry->node, &cache->list); /* KASAN poisons object */ @@ -38,10 +38,12 @@ static inline struct io_cache_entry *io_alloc_cache_get(struct io_alloc_cache *c return NULL; } -static inline void io_alloc_cache_init(struct io_alloc_cache *cache, size_t size) +static inline void io_alloc_cache_init(struct io_alloc_cache *cache, + unsigned max_nr, size_t size) { cache->list.next = NULL; cache->nr_cached = 0; + cache->max_cached = max_nr; cache->elem_size = size; } diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index da36fa1eeac9..ae90d2753e0d 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -310,9 +310,12 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->sqd_list); INIT_LIST_HEAD(&ctx->cq_overflow_list); INIT_LIST_HEAD(&ctx->io_buffers_cache); - 
io_alloc_cache_init(&ctx->rsrc_node_cache, sizeof(struct io_rsrc_node)); - io_alloc_cache_init(&ctx->apoll_cache, sizeof(struct async_poll)); - io_alloc_cache_init(&ctx->netmsg_cache, sizeof(struct io_async_msghdr)); + io_alloc_cache_init(&ctx->rsrc_node_cache, IO_NODE_ALLOC_CACHE_MAX, + sizeof(struct io_rsrc_node)); + io_alloc_cache_init(&ctx->apoll_cache, IO_ALLOC_CACHE_MAX, + sizeof(struct async_poll)); + io_alloc_cache_init(&ctx->netmsg_cache, IO_ALLOC_CACHE_MAX, + sizeof(struct io_async_msghdr)); init_completion(&ctx->ref_comp); xa_init_flags(&ctx->personalities, XA_FLAGS_ALLOC1); mutex_init(&ctx->uring_lock); diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 7ab9b2b2e757..8729f2fee256 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -6,6 +6,8 @@ #include "alloc_cache.h" +#define IO_NODE_ALLOC_CACHE_MAX 32 + #define IO_RSRC_TAG_TABLE_SHIFT (PAGE_SHIFT - 3) #define IO_RSRC_TAG_TABLE_MAX (1U << IO_RSRC_TAG_TABLE_SHIFT) #define IO_RSRC_TAG_TABLE_MASK (IO_RSRC_TAG_TABLE_MAX - 1) -- cgit v1.2.3 From 8751d15426a31baaf40f7570263c27c3e5d1dc44 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 6 Apr 2023 14:20:12 +0100 Subject: io_uring: reduce scheduling due to tw Every task_work will try to wake the task to be executed, which causes excessive scheduling and additional overhead. For some tw it's justified, but others won't do much but post a single CQE. When a task waits for multiple cqes, every such task_work will wake it up. Instead, the task may give a hint about how many cqes it waits for, io_req_local_work_add() will compare against it and skip wake ups if #cqes + #tw is not enough to satisfy the waiting condition. Task_work that uses the optimisation should be simple enough and never post more than one CQE. It's also ignored for non DEFER_TASKRUN rings. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/d2b77e99d1e86624d8a69f7037d764b739dcd225.1680782017.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 3 +- io_uring/io_uring.c | 68 +++++++++++++++++++++++++++++------------- io_uring/io_uring.h | 9 ++++++ io_uring/notif.c | 2 +- io_uring/notif.h | 2 +- io_uring/rw.c | 2 +- 6 files changed, 61 insertions(+), 25 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 4a6ce03a4903..fa621a508a01 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -296,7 +296,7 @@ struct io_ring_ctx { spinlock_t completion_lock; bool poll_multi_queue; - bool cq_waiting; + atomic_t cq_wait_nr; /* * ->iopoll_list is protected by the ctx->uring_lock for @@ -566,6 +566,7 @@ struct io_kiocb { atomic_t refs; atomic_t poll_refs; struct io_task_work io_task_work; + unsigned nr_tw; /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */ union { struct hlist_node hash_node; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 786ecfa01c54..8a327a81beaf 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1300,35 +1300,59 @@ static __cold void io_fallback_tw(struct io_uring_task *tctx) } } -static void io_req_local_work_add(struct io_kiocb *req) +static void io_req_local_work_add(struct io_kiocb *req, unsigned flags) { struct io_ring_ctx *ctx = req->ctx; + unsigned nr_wait, nr_tw, nr_tw_prev; struct llist_node *first; + if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) + flags &= ~IOU_F_TWQ_LAZY_WAKE; + first = READ_ONCE(ctx->work_llist.first); do { + nr_tw_prev = 0; + if (first) { + struct io_kiocb *first_req = container_of(first, + struct io_kiocb, + io_task_work.node); + /* + * Might be executed at any moment, rely on + * SLAB_TYPESAFE_BY_RCU to keep it alive. 
+ */ + nr_tw_prev = READ_ONCE(first_req->nr_tw); + } + nr_tw = nr_tw_prev + 1; + /* Large enough to fail the nr_wait comparison below */ + if (!(flags & IOU_F_TWQ_LAZY_WAKE)) + nr_tw = -1U; + + req->nr_tw = nr_tw; req->io_task_work.node.next = first; } while (!try_cmpxchg(&ctx->work_llist.first, &first, &req->io_task_work.node)); - if (first) - return; - - /* needed for the following wake up */ - smp_mb__after_atomic(); - - if (unlikely(atomic_read(&req->task->io_uring->in_cancel))) { - io_move_task_work_from_local(ctx); - return; + if (!first) { + if (unlikely(atomic_read(&req->task->io_uring->in_cancel))) { + io_move_task_work_from_local(ctx); + return; + } + if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) + atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); + if (ctx->has_evfd) + io_eventfd_signal(ctx); } - if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) - atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); - if (ctx->has_evfd) - io_eventfd_signal(ctx); - - if (READ_ONCE(ctx->cq_waiting)) - wake_up_state(ctx->submitter_task, TASK_INTERRUPTIBLE); + nr_wait = atomic_read(&ctx->cq_wait_nr); + /* no one is waiting */ + if (!nr_wait) + return; + /* either not enough or the previous add has already woken it up */ + if (nr_wait > nr_tw || nr_tw_prev >= nr_wait) + return; + /* pairs with set_current_state() in io_cqring_wait() */ + smp_mb__after_atomic(); + wake_up_state(ctx->submitter_task, TASK_INTERRUPTIBLE); } void __io_req_task_work_add(struct io_kiocb *req, unsigned flags) @@ -1339,7 +1363,7 @@ void __io_req_task_work_add(struct io_kiocb *req, unsigned flags) if (!(flags & IOU_F_TWQ_FORCE_NORMAL) && (ctx->flags & IORING_SETUP_DEFER_TASKRUN)) { rcu_read_lock(); - io_req_local_work_add(req); + io_req_local_work_add(req, flags); rcu_read_unlock(); return; } @@ -2625,7 +2649,9 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, unsigned long check_cq; if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { - WRITE_ONCE(ctx->cq_waiting, 1); + int nr_wait = (int) 
iowq.cq_tail - READ_ONCE(ctx->rings->cq.tail); + + atomic_set(&ctx->cq_wait_nr, nr_wait); set_current_state(TASK_INTERRUPTIBLE); } else { prepare_to_wait_exclusive(&ctx->cq_wait, &iowq.wq, @@ -2634,7 +2660,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, ret = io_cqring_wait_schedule(ctx, &iowq); __set_current_state(TASK_RUNNING); - WRITE_ONCE(ctx->cq_waiting, 0); + atomic_set(&ctx->cq_wait_nr, 0); if (ret < 0) break; @@ -4517,7 +4543,7 @@ static int __init io_uring_init(void) io_uring_optable_init(); req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC | - SLAB_ACCOUNT); + SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU); return 0; }; __initcall(io_uring_init); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index cb4309a2acdc..ef449e43d493 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -18,6 +18,15 @@ enum { /* don't use deferred task_work */ IOU_F_TWQ_FORCE_NORMAL = 1, + + /* + * A hint to not wake right away but delay until there are enough of + * tw's queued to match the number of CQEs the task is waiting for. + * + * Must not be used wirh requests generating more than one CQE. + * It's also ignored unless IORING_SETUP_DEFER_TASKRUN is set. 
+ */ + IOU_F_TWQ_LAZY_WAKE = 2, }; enum { diff --git a/io_uring/notif.c b/io_uring/notif.c index 172105eb347d..e1846a25dde1 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -31,7 +31,7 @@ static void io_tx_ubuf_callback(struct sk_buff *skb, struct ubuf_info *uarg, struct io_kiocb *notif = cmd_to_io_kiocb(nd); if (refcount_dec_and_test(&uarg->refcnt)) - io_req_task_work_add(notif); + __io_req_task_work_add(notif, IOU_F_TWQ_LAZY_WAKE); } static void io_tx_ubuf_callback_ext(struct sk_buff *skb, struct ubuf_info *uarg, diff --git a/io_uring/notif.h b/io_uring/notif.h index c88c800cd89d..6dd1b30a468f 100644 --- a/io_uring/notif.h +++ b/io_uring/notif.h @@ -33,7 +33,7 @@ static inline void io_notif_flush(struct io_kiocb *notif) /* drop slot's master ref */ if (refcount_dec_and_test(&nd->uarg.refcnt)) - io_req_task_work_add(notif); + __io_req_task_work_add(notif, IOU_F_TWQ_LAZY_WAKE); } static inline int io_notif_account_mem(struct io_kiocb *notif, unsigned len) diff --git a/io_uring/rw.c b/io_uring/rw.c index f14868624f41..6c7d2654770e 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -304,7 +304,7 @@ static void io_complete_rw(struct kiocb *kiocb, long res) return; io_req_set_res(req, io_fixup_rw_res(req, res), 0); req->io_task_work.func = io_req_rw_complete; - io_req_task_work_add(req); + __io_req_task_work_add(req, IOU_F_TWQ_LAZY_WAKE); } static void io_complete_rw_iopoll(struct kiocb *kiocb, long res) -- cgit v1.2.3 From 528407b1e0ea51260fff2cc8b669c632a65d7a09 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Tue, 11 Apr 2023 12:06:05 +0100 Subject: io_uring/rsrc: consolidate node caching We store one pre-allocated rsrc node in ->rsrc_backup_node, merge it with ->rsrc_node_cache. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/6d5410e51ccd29be7a716be045b51d6b371baef6.1681210788.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 1 - io_uring/alloc_cache.h | 5 +++++ io_uring/io_uring.c | 2 -- io_uring/rsrc.c | 20 +++++++++++--------- 4 files changed, 16 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index fa621a508a01..40cab420b1bd 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -326,7 +326,6 @@ struct io_ring_ctx { struct io_restriction restrictions; /* slow path rsrc auxilary data, used by update/register */ - struct io_rsrc_node *rsrc_backup_node; struct io_mapped_ubuf *dummy_ubuf; struct io_rsrc_data *file_data; struct io_rsrc_data *buf_data; diff --git a/io_uring/alloc_cache.h b/io_uring/alloc_cache.h index 851a527afb5e..241245cb54a6 100644 --- a/io_uring/alloc_cache.h +++ b/io_uring/alloc_cache.h @@ -23,6 +23,11 @@ static inline bool io_alloc_cache_put(struct io_alloc_cache *cache, return false; } +static inline bool io_alloc_cache_empty(struct io_alloc_cache *cache) +{ + return !cache->list.next; +} + static inline struct io_cache_entry *io_alloc_cache_get(struct io_alloc_cache *cache) { if (cache->list.next) { diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index b171c26d331d..075bae8a2bb1 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2852,8 +2852,6 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) /* there are no registered resources left, nobody uses it */ if (ctx->rsrc_node) io_rsrc_node_destroy(ctx, ctx->rsrc_node); - if (ctx->rsrc_backup_node) - io_rsrc_node_destroy(ctx, ctx->rsrc_backup_node); WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list)); diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 24e4e2109549..73f9e10d9bf0 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -230,7 +230,7 @@ void 
io_rsrc_node_switch(struct io_ring_ctx *ctx, struct io_rsrc_data *data_to_kill) __must_hold(&ctx->uring_lock) { - WARN_ON_ONCE(!ctx->rsrc_backup_node); + WARN_ON_ONCE(io_alloc_cache_empty(&ctx->rsrc_node_cache)); WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node); if (data_to_kill) { @@ -245,18 +245,20 @@ void io_rsrc_node_switch(struct io_ring_ctx *ctx, ctx->rsrc_node = NULL; } - if (!ctx->rsrc_node) { - ctx->rsrc_node = ctx->rsrc_backup_node; - ctx->rsrc_backup_node = NULL; - } + if (!ctx->rsrc_node) + ctx->rsrc_node = io_rsrc_node_alloc(ctx); } int io_rsrc_node_switch_start(struct io_ring_ctx *ctx) { - if (ctx->rsrc_backup_node) - return 0; - ctx->rsrc_backup_node = io_rsrc_node_alloc(ctx); - return ctx->rsrc_backup_node ? 0 : -ENOMEM; + if (io_alloc_cache_empty(&ctx->rsrc_node_cache)) { + struct io_rsrc_node *node = kzalloc(sizeof(*node), GFP_KERNEL); + + if (!node) + return -ENOMEM; + io_alloc_cache_put(&ctx->rsrc_node_cache, &node->cache); + } + return 0; } __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, -- cgit v1.2.3 From 4ea15b56f0810f0d8795d475db1bb74b3a7c1b2f Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 13 Apr 2023 15:28:08 +0100 Subject: io_uring/rsrc: use wq for quiescing Replace completions with waitqueues for rsrc data quiesce, the main wakeup condition is when data refs hit zero. Note that data refs are only changed under ->uring_lock, so we prepare to wait before mutex_unlock() and recheck the refs after taking the lock back. This change will be needed in the next patch.
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/1d0dbc74b3b4fd67c8f01819e680c5e0da252956.1681395792.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 1 + io_uring/io_uring.c | 1 + io_uring/rsrc.c | 18 ++++++++++++------ io_uring/rsrc.h | 1 - 4 files changed, 14 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 40cab420b1bd..5c9645319770 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -333,6 +333,7 @@ struct io_ring_ctx { /* protected by ->uring_lock */ struct list_head rsrc_ref_list; struct io_alloc_cache rsrc_node_cache; + struct wait_queue_head rsrc_quiesce_wq; struct list_head io_buffers_pages; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 9083a8466ebf..3c1c8c788b7b 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -321,6 +321,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) mutex_init(&ctx->uring_lock); init_waitqueue_head(&ctx->cq_wait); init_waitqueue_head(&ctx->poll_wq); + init_waitqueue_head(&ctx->rsrc_quiesce_wq); spin_lock_init(&ctx->completion_lock); spin_lock_init(&ctx->timeout_lock); INIT_WQ_LIST(&ctx->iopoll_list); diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index d7e7528f7159..f9ce4076c73d 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -158,6 +158,7 @@ static void io_rsrc_put_work_one(struct io_rsrc_data *rsrc_data, static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) { struct io_rsrc_data *rsrc_data = ref_node->rsrc_data; + struct io_ring_ctx *ctx = rsrc_data->ctx; struct io_rsrc_put *prsrc, *tmp; if (ref_node->inline_items) @@ -171,13 +172,13 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) io_rsrc_node_destroy(rsrc_data->ctx, ref_node); if (io_put_rsrc_data_ref(rsrc_data)) - complete(&rsrc_data->done); + wake_up_all(&ctx->rsrc_quiesce_wq); } void io_wait_rsrc_data(struct 
io_rsrc_data *data) { - if (data && !io_put_rsrc_data_ref(data)) - wait_for_completion(&data->done); + if (data) + WARN_ON_ONCE(!io_put_rsrc_data_ref(data)); } void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *node) @@ -257,6 +258,7 @@ int io_rsrc_node_switch_start(struct io_ring_ctx *ctx) __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, struct io_ring_ctx *ctx) { + DEFINE_WAIT(we); int ret; /* As we may drop ->uring_lock, other task may have started quiesce */ @@ -273,7 +275,9 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, data->quiesce = true; do { + prepare_to_wait(&ctx->rsrc_quiesce_wq, &we, TASK_INTERRUPTIBLE); mutex_unlock(&ctx->uring_lock); + ret = io_run_task_work_sig(ctx); if (ret < 0) { mutex_lock(&ctx->uring_lock); @@ -285,12 +289,15 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, } break; } - wait_for_completion_interruptible(&data->done); + + schedule(); + __set_current_state(TASK_RUNNING); mutex_lock(&ctx->uring_lock); ret = 0; } while (data->refs); - data->quiesce = false; + finish_wait(&ctx->rsrc_quiesce_wq, &we); + data->quiesce = false; return ret; } @@ -366,7 +373,6 @@ __cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, goto fail; } } - init_completion(&data->done); *pdata = data; return 0; fail: diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 88adcb0b7963..d93ba4e9742a 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -35,7 +35,6 @@ struct io_rsrc_data { u64 **tags; unsigned int nr; rsrc_put_fn *do_put; - struct completion done; int refs; bool quiesce; }; -- cgit v1.2.3 From 0b222eeb6514ba6c3457b667fa4f3645032e1fc9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 13 Apr 2023 15:28:10 +0100 Subject: io_uring/rsrc: remove rsrc_data refs Instead of waiting for rsrc_data->refs to be downed to zero, check whether there are rsrc nodes queued for completion, that's easier than maintaining references.
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/8e33fd143d83e11af3e386aea28eb6d6c6a1be10.1681395792.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 1 + io_uring/io_uring.c | 4 ++-- io_uring/rsrc.c | 32 ++++++++------------------------ io_uring/rsrc.h | 2 -- 4 files changed, 11 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 5c9645319770..1b2a20a42413 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -334,6 +334,7 @@ struct io_ring_ctx { struct list_head rsrc_ref_list; struct io_alloc_cache rsrc_node_cache; struct wait_queue_head rsrc_quiesce_wq; + unsigned rsrc_quiesce; struct list_head io_buffers_pages; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 3c1c8c788b7b..3d43df8f1e4e 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2831,8 +2831,8 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) { io_sq_thread_finish(ctx); /* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */ - io_wait_rsrc_data(ctx->buf_data); - io_wait_rsrc_data(ctx->file_data); + if (WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list))) + return; mutex_lock(&ctx->uring_lock); if (ctx->buf_data) diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index e634ef384724..5415a18844e0 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -31,11 +31,6 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, #define IORING_MAX_FIXED_FILES (1U << 20) #define IORING_MAX_REG_BUFFERS (1U << 14) -static inline bool io_put_rsrc_data_ref(struct io_rsrc_data *rsrc_data) -{ - return !--rsrc_data->refs; -} - int __io_account_mem(struct user_struct *user, unsigned long nr_pages) { unsigned long page_limit, cur_pages, new_pages; @@ -158,7 +153,6 @@ static void io_rsrc_put_work_one(struct io_rsrc_data *rsrc_data, static void __io_rsrc_put_work(struct io_rsrc_node 
*ref_node) { struct io_rsrc_data *rsrc_data = ref_node->rsrc_data; - struct io_ring_ctx *ctx = rsrc_data->ctx; struct io_rsrc_put *prsrc, *tmp; if (ref_node->inline_items) @@ -171,14 +165,6 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) } io_rsrc_node_destroy(rsrc_data->ctx, ref_node); - if (io_put_rsrc_data_ref(rsrc_data)) - wake_up_all(&ctx->rsrc_quiesce_wq); -} - -void io_wait_rsrc_data(struct io_rsrc_data *data) -{ - if (data) - WARN_ON_ONCE(!io_put_rsrc_data_ref(data)); } void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *node) @@ -201,6 +187,8 @@ void io_rsrc_node_ref_zero(struct io_rsrc_node *node) list_del(&node->node); __io_rsrc_put_work(node); } + if (list_empty(&ctx->rsrc_ref_list) && unlikely(ctx->rsrc_quiesce)) + wake_up_all(&ctx->rsrc_quiesce_wq); } struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx) @@ -235,7 +223,6 @@ void io_rsrc_node_switch(struct io_ring_ctx *ctx, if (WARN_ON_ONCE(!backup)) return; - data_to_kill->refs++; node->rsrc_data = data_to_kill; list_add_tail(&node->node, &ctx->rsrc_ref_list); /* put master ref */ @@ -269,8 +256,7 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, return ret; io_rsrc_node_switch(ctx, data); - /* kill initial ref */ - if (io_put_rsrc_data_ref(data)) + if (list_empty(&ctx->rsrc_ref_list)) return 0; if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { @@ -278,6 +264,7 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, smp_mb(); } + ctx->rsrc_quiesce++; data->quiesce = true; do { prepare_to_wait(&ctx->rsrc_quiesce_wq, &we, TASK_INTERRUPTIBLE); @@ -286,12 +273,8 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, ret = io_run_task_work_sig(ctx); if (ret < 0) { mutex_lock(&ctx->uring_lock); - if (!data->refs) { + if (list_empty(&ctx->rsrc_ref_list)) ret = 0; - } else { - /* restore the master reference */ - data->refs++; - } break; } @@ -299,10 +282,12 @@ __cold static int io_rsrc_ref_quiesce(struct io_rsrc_data 
*data, __set_current_state(TASK_RUNNING); mutex_lock(&ctx->uring_lock); ret = 0; - } while (data->refs); + } while (!list_empty(&ctx->rsrc_ref_list)); finish_wait(&ctx->rsrc_quiesce_wq, &we); data->quiesce = false; + ctx->rsrc_quiesce--; + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { atomic_set(&ctx->cq_wait_nr, 0); smp_mb(); @@ -371,7 +356,6 @@ __cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, data->nr = nr; data->ctx = ctx; data->do_put = do_put; - data->refs = 1; if (utags) { ret = -EFAULT; for (i = 0; i < nr; i++) { diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index d93ba4e9742a..5dd2fcb28069 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -35,7 +35,6 @@ struct io_rsrc_data { u64 **tags; unsigned int nr; rsrc_put_fn *do_put; - int refs; bool quiesce; }; @@ -69,7 +68,6 @@ struct io_mapped_ubuf { void io_rsrc_put_tw(struct callback_head *cb); void io_rsrc_node_ref_zero(struct io_rsrc_node *node); void io_rsrc_put_work(struct work_struct *work); -void io_wait_rsrc_data(struct io_rsrc_data *data); void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *ref_node); int io_rsrc_node_switch_start(struct io_ring_ctx *ctx); struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx); -- cgit v1.2.3