summaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
authorJens Axboe <axboe@kernel.dk>2026-06-11 00:19:35 +0300
committerJens Axboe <axboe@kernel.dk>2026-06-13 15:27:06 +0300
commitd46ab2c98ababa19b41a5709b6921d7b1add7f74 (patch)
treec3cab282d4637eac166b53c59e91e3c6618c93a6 /include
parent50cb44bd0d5f243919a06b17b1d979fdcd72cb2b (diff)
downloadlinux-d46ab2c98ababa19b41a5709b6921d7b1add7f74.tar.xz
io_uring: switch local task_work to a mpscq
The local (DEFER_TASKRUN) task_work list is an llist, which is LIFO ordered, and hence __io_run_local_work() has to restore the right running order with an O(n) llist_reverse_order() pass first. On top of that, a batch that gets capped by max_events needs the leftover entries parked on a separate ->retry_llist, as they can't be pushed back to the shared list. Switch it to the FIFO mpscq. Adds are wait-free instead of a cmpxchg retry loop, entries are popped in queue order with no reversal pass, capping a run simply leaves the remainder on the queue, and ->retry_llist goes away entirely. The consumer cursor, ->work_head, lives with the rest of the ->uring_lock protected state rather than next to the queue, so that popping entries doesn't dirty the producer side cacheline. For low amounts of task_work, this ends up being a bit more efficient than the existing scheme. As an example of that, doing multishot receives for 8 clients has the following task_work overhead: 1.02% sock-test [kernel.kallsyms] [k] io_req_local_work_add 0.88% sock-test [kernel.kallsyms] [k] __io_run_local_work_loop 0.60% sock-test [kernel.kallsyms] [k] llist_reverse_order 0.14% sock-test [kernel.kallsyms] [k] __io_run_local_work 2.64% at ~46Gb/sec and after this change: 1.08% sock-test [kernel.kallsyms] [k] io_req_local_work_add 1.03% sock-test [kernel.kallsyms] [k] __io_run_local_work 2.11% at ~53Gb/sec which has less overhead even though that test run was faster. For a case of having 1024 clients on a single ring: 2.22% sock-test [kernel.kallsyms] [k] llist_reverse_order 0.84% sock-test [kernel.kallsyms] [k] __io_run_local_work_loop 0.42% sock-test [kernel.kallsyms] [k] io_req_local_work_add 0.02% sock-test [kernel.kallsyms] [k] __io_run_local_work 3.50% at ~24Gb/sec we start to see the llist reversing taking a considerable amount of time, and the total add+run task_work overhead is around 3.5%. After the change: 0.90% sock-test [kernel.kallsyms] [k] __io_run_local_work 0.42% sock-test [kernel.kallsyms] [k] io_req_local_work_add 1.32% at ~26Gb/sec most of that overhead is gone, and performance is better as well. Caleb Sander Mateos <csander@purestorage.com> reports that it improves the performance of a ublk 4kb workload by 4% [1], while testing v1 of this patchset. [1] https://lore.kernel.org/io-uring/CADUfDZr-MMYBaP-e+y9+xuRhuiunO2sBTUCmwZyd7AgT8sVtiQ@mail.gmail.com/ Signed-off-by: Jens Axboe <axboe@kernel.dk>
Diffstat (limited to 'include')
-rw-r--r--include/linux/io_uring_types.h13
1 files changed, 9 insertions, 4 deletions
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 85e12b4884a5..3e07c7059d7b 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -360,6 +360,14 @@ struct io_ring_ctx {
bool poll_multi_queue;
struct list_head iopoll_list;
+ /*
+ * Consumer cursor for ->work_list, protected by ->uring_lock.
+ * Deliberately kept away from the producer side of the queue,
+ * as it's written for every popped entry, and the producer
+ * cacheline is contended enough as it is.
+ */
+ struct llist_node *work_head;
+
struct io_file_table file_table;
struct io_rsrc_data buf_table;
struct io_alloc_cache node_cache;
@@ -417,8 +425,7 @@ struct io_ring_ctx {
*/
struct {
struct io_rings __rcu *rings_rcu;
- struct llist_head work_llist;
- struct llist_head retry_llist;
+ struct mpscq work_list;
unsigned long check_cq;
atomic_t cq_wait_nr;
atomic_t cq_timeouts;
@@ -742,8 +749,6 @@ struct io_kiocb {
*/
u16 buf_index;
- unsigned nr_tw;
-
/* REQ_F_* flags */
io_req_flags_t flags;