From bd32923e5f02fa7b04d487ec265dc8080d27a257 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Mon, 31 Mar 2025 17:18:02 +0100
Subject: io_uring: don't store bgid in req->buf_index

Pass the buffer group id into the rest of the helpers via struct
buf_sel_arg and remove all reassignments of req->buf_index back to the
bgid. req->buf_index now only stores buffer indexes, and the group is
provided by callers.

Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/3ea9fa08113ecb4d9224b943e7806e80a324bdf9.1743437358.git.asml.silence@gmail.com
Link: https://lore.kernel.org/io-uring/0c01d76ff12986c2f48614db8610caff8f78c869.1743500909.git.asml.silence@gmail.com/
[axboe: fold in patch from second link]
Signed-off-by: Jens Axboe
---
 include/linux/io_uring_types.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index b44d201520d8..3b467879bca8 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -653,8 +653,7 @@ struct io_kiocb {
 	u8 iopoll_completed;
 	/*
 	 * Can be either a fixed buffer index, or used with provided buffers.
-	 * For the latter, before issue it points to the buffer group ID,
-	 * and after selection it points to the buffer ID itself.
+	 * For the latter, it points to the selected buffer ID.
 	 */
 	u16 buf_index;
 
--
cgit v1.2.3
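For illustration only, a sketch of what a caller of the provided-buffer
selection helpers looks like once the group id travels in struct
buf_sel_arg rather than req->buf_index. The .buf_group field follows the
patch description; the function name and the surrounding context here are
assumptions, not code from this series.

static int example_recv_select(struct io_kiocb *req, struct io_sr_msg *sr,
			       struct iovec *iov, unsigned int issue_flags)
{
	struct buf_sel_arg arg = {
		.iovs		= iov,
		.nr_iovs	= 1,
		.max_len	= INT_MAX,
		/* group id stashed at prep time, no longer parked in req->buf_index */
		.buf_group	= sr->buf_group,
	};

	/* after selection, req->buf_index only ever holds the chosen buffer ID */
	return io_buffers_select(req, &arg, issue_flags);
}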
From 632b3186726984319e2337987de86a442407f30e Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Sun, 20 Apr 2025 10:31:19 +0100
Subject: io_uring/zcrx: move zcrx region to struct io_zcrx_ifq

The refill queue region is a part of zcrx and should stay in struct
io_zcrx_ifq; we can't have multiple queues without that, so move it
there. As a result there is no context-global zcrx region anymore, and
the region is looked up together with its ifq. To protect a concurrent
mmap from seeing an inconsistent region, we were protecting changes to
->zcrx_region with mmap_lock; now the same lock protects the publishing
of the ifq.

Reviewed-by: David Wei
Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/24f1a728fc03d0166f16d099575457e10d9d90f2.1745141261.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 include/linux/io_uring_types.h |  2 --
 io_uring/zcrx.c                | 20 ++++++++++++--------
 io_uring/zcrx.h                |  1 +
 3 files changed, 13 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 3b467879bca8..06d722289fc5 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -448,8 +448,6 @@ struct io_ring_ctx {
 	struct io_mapped_region	ring_region;
 	/* used for optimised request parameter and wait argument passing  */
 	struct io_mapped_region	param_region;
-	/* just one zcrx per ring for now, will move to io_zcrx_ifq eventually */
-	struct io_mapped_region	zcrx_region;
 };
 
 /*
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index b7e65ae413b6..033284b695c7 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -167,12 +167,11 @@ static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq,
 	if (size > rd->size)
 		return -EINVAL;
 
-	ret = io_create_region_mmap_safe(ifq->ctx, &ifq->ctx->zcrx_region, rd,
-					 IORING_MAP_OFF_ZCRX_REGION);
+	ret = io_create_region(ifq->ctx, &ifq->region, rd, IORING_MAP_OFF_ZCRX_REGION);
 	if (ret < 0)
 		return ret;
 
-	ptr = io_region_get_ptr(&ifq->ctx->zcrx_region);
+	ptr = io_region_get_ptr(&ifq->region);
 	ifq->rq_ring = (struct io_uring *)ptr;
 	ifq->rqes = (struct io_uring_zcrx_rqe *)(ptr + off);
 	return 0;
@@ -180,7 +179,10 @@ static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq,
 
 static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq)
 {
-	io_free_region(ifq->ctx, &ifq->ctx->zcrx_region);
+	if (WARN_ON_ONCE(ifq->ctx->ifq))
+		return;
+
+	io_free_region(ifq->ctx, &ifq->region);
 	ifq->rq_ring = NULL;
 	ifq->rqes = NULL;
 }
@@ -343,9 +345,9 @@ struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx,
 {
 	lockdep_assert_held(&ctx->mmap_lock);
 
-	if (id != 0)
+	if (id != 0 || !ctx->ifq)
 		return NULL;
-	return &ctx->zcrx_region;
+	return &ctx->ifq->region;
 }
 
 int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
@@ -433,7 +435,8 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
 		ret = -EFAULT;
 		goto err;
 	}
-	ctx->ifq = ifq;
+	scoped_guard(mutex, &ctx->mmap_lock)
+		ctx->ifq = ifq;
 	return 0;
 err:
 	io_zcrx_ifq_free(ifq);
@@ -449,7 +452,8 @@ void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
 	if (!ifq)
 		return;
 
-	ctx->ifq = NULL;
+	scoped_guard(mutex, &ctx->mmap_lock)
+		ctx->ifq = NULL;
 	io_zcrx_ifq_free(ifq);
 }
 
diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h
index 43134e5c9213..e3c7c4e647f1 100644
--- a/io_uring/zcrx.h
+++ b/io_uring/zcrx.h
@@ -39,6 +39,7 @@ struct io_zcrx_ifq {
 	netdevice_tracker		netdev_tracker;
 	spinlock_t			lock;
 	struct mutex			dma_lock;
+	struct io_mapped_region		region;
 };
 
 #if defined(CONFIG_IO_URING_ZCRX)
--
cgit v1.2.3
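To make the locking shift concrete, a hedged sketch of the mmap-side
consumer (not part of the patch; the function name is made up and the real
lookup lives in the io_uring mmap helpers): because the lookup runs under
ctx->mmap_lock and registration now publishes ctx->ifq under the same
mutex, a concurrent mmap() observes either "no ifq yet" or a fully
initialised region, never a half-built one.

static void *example_zcrx_mmap_ptr(struct io_ring_ctx *ctx)
{
	struct io_mapped_region *region;

	guard(mutex)(&ctx->mmap_lock);

	/* NULL until io_register_zcrx_ifq() has published the ifq */
	region = io_zcrx_get_region(ctx, 0);
	if (!region)
		return ERR_PTR(-EINVAL);
	return io_region_get_ptr(region);
}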
From 76f1cc98b23cefd1f0ae90c51f1fb837e5f46528 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Sun, 20 Apr 2025 10:31:20 +0100
Subject: io_uring/zcrx: add support for multiple ifqs

Allow the user to register multiple ifqs / zcrx contexts. With that we
can use multiple interfaces / interface queues in a single io_uring
instance.

Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/668b03bee03b5216564482edcfefbc2ee337dd30.1745141261.git.asml.silence@gmail.com
[axboe: fold in fix]
Signed-off-by: Jens Axboe
---
 include/linux/io_uring_types.h |  5 ++-
 io_uring/io_uring.c            |  3 +-
 io_uring/net.c                 |  5 ++-
 io_uring/zcrx.c                | 73 +++++++++++++++++++++++++++++-------------
 4 files changed, 56 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 06d722289fc5..7e23e993280e 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -40,8 +40,6 @@ enum io_uring_cmd_flags {
 	IO_URING_F_TASK_DEAD		= (1 << 13),
 };
 
-struct io_zcrx_ifq;
-
 struct io_wq_work_node {
 	struct io_wq_work_node *next;
 };
@@ -394,7 +392,8 @@ struct io_ring_ctx {
 	struct wait_queue_head		poll_wq;
 	struct io_restriction		restrictions;
 
-	struct io_zcrx_ifq		*ifq;
+	/* Stores zcrx object pointers of type struct io_zcrx_ifq */
+	struct xarray			zcrx_ctxs;
 
 	u32			pers_next;
 	struct xarray		personalities;
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 75c022526548..0dc6c2f1295e 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -359,6 +359,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	INIT_LIST_HEAD(&ctx->tctx_list);
 	ctx->submit_state.free_list.next = NULL;
 	INIT_HLIST_HEAD(&ctx->waitid_list);
+	xa_init_flags(&ctx->zcrx_ctxs, XA_FLAGS_ALLOC);
 #ifdef CONFIG_FUTEX
 	INIT_HLIST_HEAD(&ctx->futex_list);
 #endif
@@ -2889,7 +2890,7 @@ static __cold void io_ring_exit_work(struct work_struct *work)
 			io_cqring_overflow_kill(ctx);
 			mutex_unlock(&ctx->uring_lock);
 		}
-		if (ctx->ifq) {
+		if (!xa_empty(&ctx->zcrx_ctxs)) {
 			mutex_lock(&ctx->uring_lock);
 			io_shutdown_zcrx_ifqs(ctx);
 			mutex_unlock(&ctx->uring_lock);
diff --git a/io_uring/net.c b/io_uring/net.c
index 782f8e76c5c7..b3a643675ce8 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -1189,11 +1189,10 @@ int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 		return -EINVAL;
 
 	ifq_idx = READ_ONCE(sqe->zcrx_ifq_idx);
-	if (ifq_idx != 0)
-		return -EINVAL;
-	zc->ifq = req->ctx->ifq;
+	zc->ifq = xa_load(&req->ctx->zcrx_ctxs, ifq_idx);
 	if (!zc->ifq)
 		return -EINVAL;
+
 	zc->len = READ_ONCE(sqe->len);
 	zc->flags = READ_ONCE(sqe->ioprio);
 	zc->msg_flags = READ_ONCE(sqe->msg_flags);
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index 033284b695c7..22f420d6fbb9 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -156,8 +156,10 @@ static void io_zcrx_get_niov_uref(struct net_iov *niov)
 
 static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq,
 				 struct io_uring_zcrx_ifq_reg *reg,
-				 struct io_uring_region_desc *rd)
+				 struct io_uring_region_desc *rd,
+				 u32 id)
 {
+	u64 mmap_offset;
 	size_t off, size;
 	void *ptr;
 	int ret;
@@ -167,7 +169,10 @@ static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq,
 	if (size > rd->size)
 		return -EINVAL;
 
-	ret = io_create_region(ifq->ctx, &ifq->region, rd, IORING_MAP_OFF_ZCRX_REGION);
+	mmap_offset = IORING_MAP_OFF_ZCRX_REGION;
+	mmap_offset += id << IORING_OFF_PBUF_SHIFT;
+
+	ret = io_create_region(ifq->ctx, &ifq->region, rd, mmap_offset);
 	if (ret < 0)
 		return ret;
 
@@ -179,9 +184,6 @@ static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq,
 
 static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq)
 {
-	if (WARN_ON_ONCE(ifq->ctx->ifq))
-		return;
-
 	io_free_region(ifq->ctx, &ifq->region);
 	ifq->rq_ring = NULL;
 	ifq->rqes = NULL;
@@ -343,11 +345,11 @@ static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq)
 struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx,
 					    unsigned int id)
 {
+	struct io_zcrx_ifq *ifq = xa_load(&ctx->zcrx_ctxs, id);
+
 	lockdep_assert_held(&ctx->mmap_lock);
 
-	if (id != 0 || !ctx->ifq)
-		return NULL;
-	return &ctx->ifq->region;
+	return ifq ? &ifq->region : NULL;
 }
 
 int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
@@ -359,6 +361,7 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
 	struct io_uring_region_desc rd;
 	struct io_zcrx_ifq *ifq;
 	int ret;
+	u32 id;
 
 	/*
 	 * 1. Interface queue allocation.
@@ -371,8 +374,6 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
 	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN &&
 	      ctx->flags & IORING_SETUP_CQE32))
 		return -EINVAL;
-	if (ctx->ifq)
-		return -EBUSY;
 	if (copy_from_user(&reg, arg, sizeof(reg)))
 		return -EFAULT;
 	if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd)))
@@ -396,7 +397,14 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
 	if (!ifq)
 		return -ENOMEM;
 
-	ret = io_allocate_rbuf_ring(ifq, &reg, &rd);
+	scoped_guard(mutex, &ctx->mmap_lock) {
+		/* preallocate id */
+		ret = xa_alloc(&ctx->zcrx_ctxs, &id, NULL, xa_limit_31b, GFP_KERNEL);
+		if (ret)
+			goto ifq_free;
+	}
+
+	ret = io_allocate_rbuf_ring(ifq, &reg, &rd, id);
 	if (ret)
 		goto err;
 
@@ -428,6 +436,14 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
 	reg.offsets.rqes = sizeof(struct io_uring);
 	reg.offsets.head = offsetof(struct io_uring, head);
 	reg.offsets.tail = offsetof(struct io_uring, tail);
+	reg.zcrx_id = id;
+
+	scoped_guard(mutex, &ctx->mmap_lock) {
+		/* publish ifq */
+		ret = -ENOMEM;
+		if (xa_store(&ctx->zcrx_ctxs, id, ifq, GFP_KERNEL))
+			goto err;
+	}
 
 	if (copy_to_user(arg, &reg, sizeof(reg)) ||
 	    copy_to_user(u64_to_user_ptr(reg.region_ptr), &rd, sizeof(rd)) ||
@@ -435,26 +451,34 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
 		ret = -EFAULT;
 		goto err;
 	}
-	scoped_guard(mutex, &ctx->mmap_lock)
-		ctx->ifq = ifq;
 	return 0;
err:
+	scoped_guard(mutex, &ctx->mmap_lock)
+		xa_erase(&ctx->zcrx_ctxs, id);
+ifq_free:
 	io_zcrx_ifq_free(ifq);
 	return ret;
 }
 
 void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
 {
-	struct io_zcrx_ifq *ifq = ctx->ifq;
+	struct io_zcrx_ifq *ifq;
+	unsigned long id;
 
 	lockdep_assert_held(&ctx->uring_lock);
 
-	if (!ifq)
-		return;
+	while (1) {
+		scoped_guard(mutex, &ctx->mmap_lock) {
+			ifq = xa_find(&ctx->zcrx_ctxs, &id, ULONG_MAX, XA_PRESENT);
+			if (ifq)
+				xa_erase(&ctx->zcrx_ctxs, id);
+		}
+		if (!ifq)
+			break;
+		io_zcrx_ifq_free(ifq);
+	}
 
-	scoped_guard(mutex, &ctx->mmap_lock)
-		ctx->ifq = NULL;
-	io_zcrx_ifq_free(ifq);
+	xa_destroy(&ctx->zcrx_ctxs);
 }
 
 static struct net_iov *__io_zcrx_get_free_niov(struct io_zcrx_area *area)
@@ -511,12 +535,15 @@ static void io_zcrx_scrub(struct io_zcrx_ifq *ifq)
 
 void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx)
 {
+	struct io_zcrx_ifq *ifq;
+	unsigned long index;
+
 	lockdep_assert_held(&ctx->uring_lock);
 
-	if (!ctx->ifq)
-		return;
-	io_zcrx_scrub(ctx->ifq);
-	io_close_queue(ctx->ifq);
+	xa_for_each(&ctx->zcrx_ctxs, index, ifq) {
+		io_zcrx_scrub(ifq);
+		io_close_queue(ifq);
+	}
 }
 
 static inline u32 io_zcrx_rqring_entries(struct io_zcrx_ifq *ifq)
--
cgit v1.2.3
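For orientation, a hedged userspace sketch of the new multi-ifq flow:
registration hands back an allocated id in reg.zcrx_id, each ifq's refill
ring gets its own mmap offset derived from that id (via
IORING_OFF_PBUF_SHIFT, as in io_allocate_rbuf_ring() above), and a recvzc
SQE selects its ifq through sqe->zcrx_ifq_idx. Only those two fields come
from this patch; the helper name, the area/region setup and the usual
uapi/libc headers are assumed boilerplate.

static int example_register_zcrx(int ring_fd, int ifindex, int rxq,
				 struct io_uring_zcrx_area_reg *area,
				 struct io_uring_region_desc *rd)
{
	struct io_uring_zcrx_ifq_reg reg = {
		.if_idx		= ifindex,
		.if_rxq		= rxq,
		.rq_entries	= 4096,
		.area_ptr	= (__u64)(uintptr_t)area,
		.region_ptr	= (__u64)(uintptr_t)rd,
	};

	if (syscall(__NR_io_uring_register, ring_fd,
		    IORING_REGISTER_ZCRX_IFQ, &reg, 1))
		return -1;

	/*
	 * Pass reg.zcrx_id back in sqe->zcrx_ifq_idx on IORING_OP_RECV_ZC to
	 * consume from this particular interface queue.
	 */
	return reg.zcrx_id;
}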
From 63de899cb6220357dea9d0f4e5aa459ff5193bb0 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Fri, 9 May 2025 12:12:53 +0100
Subject: io_uring: count allocated requests

Keep track of the number of requests a ring currently has allocated (and
not freed); it'll be needed in the next patch.

Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/c8f8308294dc2a1cb8925d984d937d4fc14ab5d4.1746788718.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 include/linux/io_uring_types.h | 1 +
 io_uring/io_uring.c            | 9 ++++++++-
 2 files changed, 9 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 7e23e993280e..73b289b48280 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -435,6 +435,7 @@ struct io_ring_ctx {
 
 	/* protected by ->completion_lock */
 	unsigned		evfd_last_cq_tail;
+	unsigned		nr_req_allocated;
 
 	/*
 	 * Protection for resize vs mmap races - both the mmap and resize
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 6efecb46c828..714b66ab34b0 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -957,6 +957,8 @@ __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
 	}
 
 	percpu_ref_get_many(&ctx->refs, ret);
+	ctx->nr_req_allocated += ret;
+
 	while (ret--) {
 		struct io_kiocb *req = reqs[ret];
 
@@ -2694,8 +2696,10 @@ static void io_req_caches_free(struct io_ring_ctx *ctx)
 		kmem_cache_free(req_cachep, req);
 		nr++;
 	}
-	if (nr)
+	if (nr) {
+		ctx->nr_req_allocated -= nr;
 		percpu_ref_put_many(&ctx->refs, nr);
+	}
 	mutex_unlock(&ctx->uring_lock);
 }
 
@@ -2732,6 +2736,9 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 	percpu_ref_exit(&ctx->refs);
 	free_uid(ctx->user);
 	io_req_caches_free(ctx);
+
+	WARN_ON_ONCE(ctx->nr_req_allocated);
+
 	if (ctx->hash_map)
 		io_wq_put_hash(ctx->hash_map);
 	io_napi_free(ctx);
--
cgit v1.2.3
From 8fb7aee05591fd4d3dca1460448a59e95fa821c3 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Fri, 9 May 2025 12:12:54 +0100
Subject: io_uring: drain based on allocated reqs

Don't rely on CQ sequence numbers for draining, as it has become messy
and needs cq_extra adjustments. Instead, base draining on the number of
allocated requests and only allow flushing when all requests are in the
drain list.

As a result, cq_extra is gone, there is no overhead for its accounting in
aux cqe posting, and there is less bloat as it was inlined before. It's
also generally simpler than trying to track where cq_extra should be
bumped and where it should be put back, as in the overflow cases. It'll
likely also help with cleaning up and unifying some of the CQ posting
helpers.

Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/46ece1e34320b046c06fee2498d6b4cd12a700f2.1746788718.git.asml.silence@gmail.com
Link: https://lore.kernel.org/r/24497b04b004bceada496033d3c9d09ff8e81ae9.1746944903.git.asml.silence@gmail.com
[axboe: fold in fix from link2]
Signed-off-by: Jens Axboe
---
 include/linux/io_uring_types.h |  2 +-
 io_uring/io_uring.c            | 79 +++++++++++++++++------------------------
 io_uring/io_uring.h            |  3 +-
 3 files changed, 34 insertions(+), 50 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 73b289b48280..00dbd7cd0e7d 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -341,7 +341,6 @@ struct io_ring_ctx {
 		unsigned		cached_cq_tail;
 		unsigned		cq_entries;
 		struct io_ev_fd	__rcu	*io_ev_fd;
-		unsigned		cq_extra;
 
 		void			*cq_wait_arg;
 		size_t			cq_wait_size;
@@ -417,6 +416,7 @@ struct io_ring_ctx {
 	struct callback_head		poll_wq_task_work;
 
 	struct list_head		defer_list;
+	unsigned			nr_drained;
 
 	struct io_alloc_cache		msg_cache;
 	spinlock_t			msg_lock;
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 714b66ab34b0..9a9b8d35349b 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -129,7 +129,6 @@
 struct io_defer_entry {
 	struct list_head	list;
 	struct io_kiocb		*req;
-	u32			seq;
 };
 
 /* requests with any of those set should undergo io_disarm_next() */
@@ -149,6 +148,7 @@ static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
 					 bool is_sqpoll_thread);
 
 static void io_queue_sqe(struct io_kiocb *req);
+static void __io_req_caches_free(struct io_ring_ctx *ctx);
 
 static __read_mostly DEFINE_STATIC_KEY_FALSE(io_key_has_sqarray);
 
@@ -540,46 +540,45 @@ void io_req_queue_iowq(struct io_kiocb *req)
 	io_req_task_work_add(req);
 }
 
-static bool io_drain_defer_seq(struct io_kiocb *req, u32 seq)
+static unsigned io_linked_nr(struct io_kiocb *req)
 {
-	struct io_ring_ctx *ctx = req->ctx;
+	struct io_kiocb *tmp;
+	unsigned nr = 0;
 
-	return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail;
+	io_for_each_link(tmp, req)
+		nr++;
+	return nr;
 }
 
-static __cold noinline void __io_queue_deferred(struct io_ring_ctx *ctx)
+static __cold noinline void io_queue_deferred(struct io_ring_ctx *ctx)
 {
 	bool drain_seen = false, first = true;
 
+	lockdep_assert_held(&ctx->uring_lock);
+	__io_req_caches_free(ctx);
+
 	while (!list_empty(&ctx->defer_list)) {
 		struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
 						struct io_defer_entry, list);
 
 		drain_seen |= de->req->flags & REQ_F_IO_DRAIN;
-		if ((drain_seen || first) && io_drain_defer_seq(de->req, de->seq))
-			break;
+		if ((drain_seen || first) && ctx->nr_req_allocated != ctx->nr_drained)
+			return;
 
 		list_del_init(&de->list);
+		ctx->nr_drained -= io_linked_nr(de->req);
 		io_req_task_queue(de->req);
 		kfree(de);
 		first = false;
 	}
 }
 
-static __cold noinline void io_queue_deferred(struct io_ring_ctx *ctx)
-{
-	guard(spinlock)(&ctx->completion_lock);
-	__io_queue_deferred(ctx);
-}
-
 void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
 {
 	if (ctx->poll_activated)
 		io_poll_wq_wake(ctx);
 	if (ctx->off_timeout_used)
 		io_flush_timeouts(ctx);
-	if (ctx->drain_active)
-		io_queue_deferred(ctx);
 	if (ctx->has_evfd)
 		io_eventfd_signal(ctx, true);
 }
@@ -742,7 +741,6 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
 		 * on the floor.
 		 */
 		WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1);
-		ctx->cq_extra--;
 		set_bit(IO_CHECK_CQ_DROPPED_BIT, &ctx->check_cq);
 		return false;
 	}
@@ -812,8 +810,6 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res,
 {
 	struct io_uring_cqe *cqe;
 
-	ctx->cq_extra++;
-
 	if (likely(io_get_cqe(ctx, &cqe))) {
 		WRITE_ONCE(cqe->user_data, user_data);
 		WRITE_ONCE(cqe->res, res);
@@ -1459,6 +1455,10 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx)
 		io_free_batch_list(ctx, state->compl_reqs.first);
 		INIT_WQ_LIST(&state->compl_reqs);
 	}
+
+	if (unlikely(ctx->drain_active))
+		io_queue_deferred(ctx);
+
 	ctx->submit_state.cq_flush = false;
 }
 
@@ -1646,17 +1646,6 @@ io_req_flags_t io_file_get_flags(struct file *file)
 	return res;
 }
 
-static u32 io_get_sequence(struct io_kiocb *req)
-{
-	u32 seq = req->ctx->cached_sq_head;
-	struct io_kiocb *cur;
-
-	/* need original cached_sq_head, but it was increased for each req */
-	io_for_each_link(cur, req)
-		seq--;
-	return seq;
-}
-
 static __cold void io_drain_req(struct io_kiocb *req)
 	__must_hold(&ctx->uring_lock)
 {
@@ -1673,14 +1662,12 @@ static __cold void io_drain_req(struct io_kiocb *req)
 	io_prep_async_link(req);
 	trace_io_uring_defer(req);
 	de->req = req;
-	de->seq = io_get_sequence(req);
 
-	scoped_guard(spinlock, &ctx->completion_lock) {
-		list_add_tail(&de->list, &ctx->defer_list);
-		__io_queue_deferred(ctx);
-		if (!drain && list_empty(&ctx->defer_list))
-			ctx->drain_active = false;
-	}
+	ctx->nr_drained += io_linked_nr(req);
+	list_add_tail(&de->list, &ctx->defer_list);
+	io_queue_deferred(ctx);
+	if (!drain && list_empty(&ctx->defer_list))
+		ctx->drain_active = false;
 }
 
 static bool io_assign_file(struct io_kiocb *req, const struct io_issue_def *def,
@@ -2263,10 +2250,6 @@ static bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe)
 	    (!(ctx->flags & IORING_SETUP_NO_SQARRAY))) {
 		head = READ_ONCE(ctx->sq_array[head]);
 		if (unlikely(head >= ctx->sq_entries)) {
-			/* drop invalid entries */
-			spin_lock(&ctx->completion_lock);
-			ctx->cq_extra--;
-			spin_unlock(&ctx->completion_lock);
 			WRITE_ONCE(ctx->rings->sq_dropped,
 				   READ_ONCE(ctx->rings->sq_dropped) + 1);
 			return false;
@@ -2684,13 +2667,11 @@ unsigned long rings_size(unsigned int flags, unsigned int sq_entries,
 	return off;
 }
 
-static void io_req_caches_free(struct io_ring_ctx *ctx)
+static __cold void __io_req_caches_free(struct io_ring_ctx *ctx)
 {
 	struct io_kiocb *req;
 	int nr = 0;
 
-	mutex_lock(&ctx->uring_lock);
-
 	while (!io_req_cache_empty(ctx)) {
 		req = io_extract_req(ctx);
 		kmem_cache_free(req_cachep, req);
@@ -2700,7 +2681,12 @@ static void io_req_caches_free(struct io_ring_ctx *ctx)
 		ctx->nr_req_allocated -= nr;
 		percpu_ref_put_many(&ctx->refs, nr);
 	}
-	mutex_unlock(&ctx->uring_lock);
+}
+
+static __cold void io_req_caches_free(struct io_ring_ctx *ctx)
+{
+	guard(mutex)(&ctx->uring_lock);
+	__io_req_caches_free(ctx);
 }
 
 static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
@@ -3005,20 +2991,19 @@ static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx,
 	struct io_defer_entry *de;
 	LIST_HEAD(list);
 
-	spin_lock(&ctx->completion_lock);
 	list_for_each_entry_reverse(de, &ctx->defer_list, list) {
 		if (io_match_task_safe(de->req, tctx, cancel_all)) {
 			list_cut_position(&list, &ctx->defer_list, &de->list);
 			break;
 		}
 	}
-	spin_unlock(&ctx->completion_lock);
 
 	if (list_empty(&list))
 		return false;
 
 	while (!list_empty(&list)) {
 		de = list_first_entry(&list, struct io_defer_entry, list);
 		list_del_init(&de->list);
+		ctx->nr_drained -= io_linked_nr(de->req);
 		io_req_task_queue_fail(de->req, -ECANCELED);
 		kfree(de);
 	}
@@ -3093,8 +3078,8 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
 	if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) &&
 	    io_allowed_defer_tw_run(ctx))
 		ret |= io_run_local_work(ctx, INT_MAX, INT_MAX) > 0;
-	ret |= io_cancel_defer_files(ctx, tctx, cancel_all);
 	mutex_lock(&ctx->uring_lock);
+	ret |= io_cancel_defer_files(ctx, tctx, cancel_all);
 	ret |= io_poll_remove_all(ctx, tctx, cancel_all);
 	ret |= io_waitid_remove_all(ctx, tctx, cancel_all);
 	ret |= io_futex_remove_all(ctx, tctx, cancel_all);
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index e4050b2d0821..81f22196a57d 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -196,7 +196,6 @@ static inline bool io_defer_get_uncommited_cqe(struct io_ring_ctx *ctx,
 {
 	io_lockdep_assert_cq_locked(ctx);
 
-	ctx->cq_extra++;
 	ctx->submit_state.cq_flush = true;
 	return io_get_cqe(ctx, cqe_ret);
 }
@@ -414,7 +413,7 @@ static inline void io_req_complete_defer(struct io_kiocb *req)
 
 static inline void io_commit_cqring_flush(struct io_ring_ctx *ctx)
 {
-	if (unlikely(ctx->off_timeout_used || ctx->drain_active ||
+	if (unlikely(ctx->off_timeout_used ||
 		     ctx->has_evfd || ctx->poll_activated))
 		__io_commit_cqring_flush(ctx);
 }
--
cgit v1.2.3
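A condensed restatement of the new drain rule (the helper below is made up
for illustration and is not part of the patch): draining now boils down to
comparing two counters that are both maintained under uring_lock, instead
of reasoning about CQ sequence numbers.

static inline bool example_drain_flushable(struct io_ring_ctx *ctx)
{
	lockdep_assert_held(&ctx->uring_lock);

	/*
	 * nr_req_allocated is bumped when the request cache is refilled and
	 * dropped when it is freed; nr_drained counts requests (including
	 * their links, via io_linked_nr()) parked in ctx->defer_list.  Only
	 * when every allocated request is parked is it safe to release the
	 * head of the drain list.
	 */
	return ctx->nr_req_allocated == ctx->nr_drained;
}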
From c80bdb1c55719cd6308d648a7920272a3be09e34 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Fri, 16 May 2025 13:16:44 -0600
Subject: io_uring: pass in struct io_big_cqe to io_alloc_ocqe()

Rather than passing extra1/extra2 separately, pass in the (now) named
io_big_cqe struct instead. The callers that don't use/support CQE32 now
just pass a single NULL rather than two separate mystery zero values.
Move the clearing of the big_cqe elements into io_alloc_ocqe() as well,
so it can be moved out of the generic code.

Reviewed-by: Caleb Sander Mateos
Signed-off-by: Jens Axboe
---
 include/linux/io_uring_types.h |  2 +-
 io_uring/io_uring.c            | 23 +++++++++++------------
 2 files changed, 12 insertions(+), 13 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 00dbd7cd0e7d..2922635986f5 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -710,7 +710,7 @@ struct io_kiocb {
 	const struct cred		*creds;
 	struct io_wq_work		work;
 
-	struct {
+	struct io_big_cqe {
 		u64			extra1;
 		u64			extra2;
 	} big_cqe;
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 02d597716467..4081ffd890af 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -724,8 +724,8 @@ static bool io_cqring_add_overflow(struct io_ring_ctx *ctx,
 }
 
 static struct io_overflow_cqe *io_alloc_ocqe(struct io_ring_ctx *ctx,
-					     struct io_cqe *cqe, u64 extra1,
-					     u64 extra2, gfp_t gfp)
+					     struct io_cqe *cqe,
+					     struct io_big_cqe *big_cqe, gfp_t gfp)
 {
 	struct io_overflow_cqe *ocqe;
 	size_t ocq_size = sizeof(struct io_overflow_cqe);
@@ -734,17 +734,19 @@ static struct io_overflow_cqe *io_alloc_ocqe(struct io_ring_ctx *ctx,
 	if (is_cqe32)
 		ocq_size += sizeof(struct io_uring_cqe);
 
-	ocqe = kmalloc(ocq_size, gfp | __GFP_ACCOUNT);
+	ocqe = kzalloc(ocq_size, gfp | __GFP_ACCOUNT);
 	trace_io_uring_cqe_overflow(ctx, cqe->user_data, cqe->res, cqe->flags, ocqe);
 	if (ocqe) {
 		ocqe->cqe.user_data = cqe->user_data;
 		ocqe->cqe.res = cqe->res;
 		ocqe->cqe.flags = cqe->flags;
-		if (is_cqe32) {
-			ocqe->cqe.big_cqe[0] = extra1;
-			ocqe->cqe.big_cqe[1] = extra2;
+		if (is_cqe32 && big_cqe) {
+			ocqe->cqe.big_cqe[0] = big_cqe->extra1;
+			ocqe->cqe.big_cqe[1] = big_cqe->extra2;
 		}
 	}
+	if (big_cqe)
+		big_cqe->extra1 = big_cqe->extra2 = 0;
 	return ocqe;
 }
 
@@ -821,7 +823,7 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags
 		struct io_overflow_cqe *ocqe;
 		struct io_cqe cqe = io_init_cqe(user_data, res, cflags);
 
-		ocqe = io_alloc_ocqe(ctx, &cqe, 0, 0, GFP_ATOMIC);
+		ocqe = io_alloc_ocqe(ctx, &cqe, NULL, GFP_ATOMIC);
 		filled = io_cqring_add_overflow(ctx, ocqe);
 	}
 	io_cq_unlock_post(ctx);
@@ -841,7 +843,7 @@ void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags)
 		struct io_overflow_cqe *ocqe;
 		struct io_cqe cqe = io_init_cqe(user_data, res, cflags);
 
-		ocqe = io_alloc_ocqe(ctx, &cqe, 0, 0, GFP_KERNEL);
+		ocqe = io_alloc_ocqe(ctx, &cqe, NULL, GFP_KERNEL);
 		spin_lock(&ctx->completion_lock);
 		io_cqring_add_overflow(ctx, ocqe);
 		spin_unlock(&ctx->completion_lock);
@@ -1451,8 +1453,7 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx)
 			gfp_t gfp = ctx->lockless_cq ? GFP_KERNEL : GFP_ATOMIC;
 			struct io_overflow_cqe *ocqe;
 
-			ocqe = io_alloc_ocqe(ctx, &req->cqe, req->big_cqe.extra1,
-					     req->big_cqe.extra2, gfp);
+			ocqe = io_alloc_ocqe(ctx, &req->cqe, &req->big_cqe, gfp);
 			if (ctx->lockless_cq) {
 				spin_lock(&ctx->completion_lock);
 				io_cqring_add_overflow(ctx, ocqe);
@@ -1460,8 +1461,6 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx)
 			} else {
 				io_cqring_add_overflow(ctx, ocqe);
 			}
-
-			memset(&req->big_cqe, 0, sizeof(req->big_cqe));
 		}
 	}
 	__io_cq_unlock_post(ctx);
--
cgit v1.2.3
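For completeness, a hedged sketch of the producer side this refactor
serves (not code from the patch; the function name is illustrative): CQE32
users stash their extra payload in req->big_cqe and complete normally, and
if the completion has to overflow, io_alloc_ocqe() now both copies that
payload into the overflow entry and clears it, replacing the memset that
used to live in the generic flush path.

static void example_fill_cqe32(struct io_kiocb *req, s32 res,
			       u64 extra1, u64 extra2)
{
	io_req_set_res(req, res, 0);
	req->big_cqe.extra1 = extra1;
	req->big_cqe.extra2 = extra2;
	/* completion and any overflow handling proceed as in the diff above */
}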