From 085268829b07202cf7bf8ec1a8fb7fd9d8f6a41a Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 30 Sep 2024 14:22:36 -0600
Subject: io_uring/poll: get rid of unlocked cancel hash

io_uring maintains two hash lists of inflight requests:

1) ctx->cancel_table_locked. This is used when the caller has the
   ctx->uring_lock held already. This is only an issue side parameter,
   as removal or task_work will always have it held.

2) ctx->cancel_table. This is used when the issuer does NOT have the
   ctx->uring_lock held, and relies on the table spinlocks for access.

However, it's pretty trivial to simply grab the lock in the one spot
where we care about it, for insertion. With that, we can kill the
unlocked table (and get rid of the _locked postfix for the other one).

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 4b9ba523978d..d8ca27da1341 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -291,7 +291,7 @@ struct io_ring_ctx {
 
 		struct xarray		io_bl_xa;
 
-		struct io_hash_table	cancel_table_locked;
+		struct io_hash_table	cancel_table;
 		struct io_alloc_cache	apoll_cache;
 		struct io_alloc_cache	netmsg_cache;
 		struct io_alloc_cache	rw_cache;
@@ -342,7 +342,6 @@ struct io_ring_ctx {
 
 	struct list_head	io_buffers_comp;
 	struct list_head	cq_overflow_list;
-	struct io_hash_table	cancel_table;
 
 	struct hlist_head	waitid_list;
 
@@ -459,7 +458,6 @@ enum {
 	REQ_F_DOUBLE_POLL_BIT,
 	REQ_F_APOLL_MULTISHOT_BIT,
 	REQ_F_CLEAR_POLLIN_BIT,
-	REQ_F_HASH_LOCKED_BIT,
 	/* keep async read/write and isreg together and in order */
 	REQ_F_SUPPORT_NOWAIT_BIT,
 	REQ_F_ISREG_BIT,
@@ -534,8 +532,6 @@ enum {
 	REQ_F_APOLL_MULTISHOT	= IO_REQ_FLAG(REQ_F_APOLL_MULTISHOT_BIT),
 	/* recvmsg special flag, clear EPOLLIN */
 	REQ_F_CLEAR_POLLIN	= IO_REQ_FLAG(REQ_F_CLEAR_POLLIN_BIT),
-	/* hashed into ->cancel_hash_locked, protected by ->uring_lock */
-	REQ_F_HASH_LOCKED	= IO_REQ_FLAG(REQ_F_HASH_LOCKED_BIT),
 	/* don't use lazy poll wake for this request */
 	REQ_F_POLL_NO_LAZY	= IO_REQ_FLAG(REQ_F_POLL_NO_LAZY_BIT),
 	/* file is pollable */
-- 
cgit v1.2.3


From ba4366f57b117c2eab996642288e5c75646ccfc9 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 30 Sep 2024 14:29:06 -0600
Subject: io_uring/poll: get rid of per-hashtable bucket locks

Any access to the table is protected by ctx->uring_lock now anyway, the
per-bucket locking doesn't buy us anything.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h |  1 -
 io_uring/cancel.c              |  4 +---
 io_uring/poll.c                | 39 ++++++++++-----------------------------
 3 files changed, 11 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index d8ca27da1341..9c7e1d3f06e5 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -67,7 +67,6 @@ struct io_file_table {
 };
 
 struct io_hash_bucket {
-	spinlock_t		lock;
 	struct hlist_head	list;
 } ____cacheline_aligned_in_smp;
 
diff --git a/io_uring/cancel.c b/io_uring/cancel.c
index a6e58a20efdd..755dd5506a5f 100644
--- a/io_uring/cancel.c
+++ b/io_uring/cancel.c
@@ -236,10 +236,8 @@ void init_hash_table(struct io_hash_table *table, unsigned size)
 {
 	unsigned int i;
 
-	for (i = 0; i < size; i++) {
-		spin_lock_init(&table->hbs[i].lock);
+	for (i = 0; i < size; i++)
 		INIT_HLIST_HEAD(&table->hbs[i].list);
-	}
 }
 
 static int __io_sync_cancel(struct io_uring_task *tctx,
diff --git a/io_uring/poll.c b/io_uring/poll.c
index a0d1a09c5a20..2d6698fb7400 100644
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -728,7 +728,6 @@ __cold bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk,
 	for (i = 0; i < nr_buckets; i++) {
 		struct io_hash_bucket *hb = &ctx->cancel_table.hbs[i];
 
-		spin_lock(&hb->lock);
 		hlist_for_each_entry_safe(req, tmp, &hb->list, hash_node) {
 			if (io_match_task_safe(req, tsk, cancel_all)) {
 				hlist_del_init(&req->hash_node);
@@ -736,22 +735,17 @@ __cold bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk,
 				found = true;
 			}
 		}
-		spin_unlock(&hb->lock);
 	}
 	return found;
 }
 
 static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only,
-				     struct io_cancel_data *cd,
-				     struct io_hash_bucket **out_bucket)
+				     struct io_cancel_data *cd)
 {
 	struct io_kiocb *req;
 	u32 index = hash_long(cd->data, ctx->cancel_table.hash_bits);
 	struct io_hash_bucket *hb = &ctx->cancel_table.hbs[index];
 
-	*out_bucket = NULL;
-
-	spin_lock(&hb->lock);
 	hlist_for_each_entry(req, &hb->list, hash_node) {
 		if (cd->data != req->cqe.user_data)
 			continue;
@@ -761,34 +755,25 @@ static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only,
 			if (io_cancel_match_sequence(req, cd->seq))
 				continue;
 		}
-		*out_bucket = hb;
 		return req;
 	}
-	spin_unlock(&hb->lock);
 	return NULL;
 }
 
 static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx,
-					  struct io_cancel_data *cd,
-					  struct io_hash_bucket **out_bucket)
+					  struct io_cancel_data *cd)
 {
 	unsigned nr_buckets = 1U << ctx->cancel_table.hash_bits;
 	struct io_kiocb *req;
 	int i;
 
-	*out_bucket = NULL;
-
 	for (i = 0; i < nr_buckets; i++) {
 		struct io_hash_bucket *hb = &ctx->cancel_table.hbs[i];
 
-		spin_lock(&hb->lock);
 		hlist_for_each_entry(req, &hb->list, hash_node) {
-			if (io_cancel_req_match(req, cd)) {
-				*out_bucket = hb;
+			if (io_cancel_req_match(req, cd))
 				return req;
-			}
 		}
-		spin_unlock(&hb->lock);
 	}
 	return NULL;
 }
@@ -806,20 +791,19 @@ static int io_poll_disarm(struct io_kiocb *req)
 
 static int __io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd)
 {
-	struct io_hash_bucket *bucket;
 	struct io_kiocb *req;
 
 	if (cd->flags & (IORING_ASYNC_CANCEL_FD | IORING_ASYNC_CANCEL_OP |
 			 IORING_ASYNC_CANCEL_ANY))
-		req = io_poll_file_find(ctx, cd, &bucket);
+		req = io_poll_file_find(ctx, cd);
 	else
-		req = io_poll_find(ctx, false, cd, &bucket);
+		req = io_poll_find(ctx, false, cd);
 
-	if (req)
+	if (req) {
 		io_poll_cancel_req(req);
-	if (bucket)
-		spin_unlock(&bucket->lock);
-	return req ? 0 : -ENOENT;
+		return 0;
+	}
+	return -ENOENT;
 }
 
 int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd,
@@ -918,15 +902,12 @@ int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags)
 	struct io_poll_update *poll_update = io_kiocb_to_cmd(req, struct io_poll_update);
 	struct io_ring_ctx *ctx = req->ctx;
 	struct io_cancel_data cd = { .ctx = ctx, .data = poll_update->old_user_data, };
-	struct io_hash_bucket *bucket;
 	struct io_kiocb *preq;
 	int ret2, ret = 0;
 
 	io_ring_submit_lock(ctx, issue_flags);
-	preq = io_poll_find(ctx, true, &cd, &bucket);
+	preq = io_poll_find(ctx, true, &cd);
 	ret2 = io_poll_disarm(preq);
-	if (bucket)
-		spin_unlock(&bucket->lock);
 	if (ret2) {
 		ret = ret2;
 		goto out;
-- 
cgit v1.2.3


From 2946f08ae9ed650b94e0ffebcdfdda8de76bd926 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Fri, 18 Oct 2024 17:14:00 +0100
Subject: io_uring: clean up cqe trace points

We have too many helpers posting CQEs, instead of tracing completion
events before filling in a CQE and thus having to pass all the data,
set the CQE first, pass it to the tracing helper and let it extract
everything it needs.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/b83c1ca9ee5aed2df0f3bb743bf5ed699cce4c86.1729267437.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h  |  5 +++++
 include/trace/events/io_uring.h | 24 +++++++++---------------
 io_uring/io_uring.c             |  4 ++--
 io_uring/io_uring.h             |  7 +++----
 4 files changed, 19 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 9c7e1d3f06e5..391087144666 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -662,4 +662,9 @@ struct io_overflow_cqe {
 	struct io_uring_cqe cqe;
 };
 
+static inline bool io_ctx_cqe32(struct io_ring_ctx *ctx)
+{
+	return ctx->flags & IORING_SETUP_CQE32;
+}
+
 #endif
diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h
index 412c9c210a32..fb81c533b310 100644
--- a/include/trace/events/io_uring.h
+++ b/include/trace/events/io_uring.h
@@ -315,20 +315,14 @@ TRACE_EVENT(io_uring_fail_link,
  * io_uring_complete - called when completing an SQE
  *
  * @ctx:		pointer to a ring context structure
- * @req:		pointer to a submitted request
- * @user_data:		user data associated with the request
- * @res:		result of the request
- * @cflags:		completion flags
- * @extra1:		extra 64-bit data for CQE32
- * @extra2:		extra 64-bit data for CQE32
- *
+ * @req:		(optional) pointer to a submitted request
+ * @cqe:		pointer to the filled in CQE being posted
  */
 TRACE_EVENT(io_uring_complete,
 
-	TP_PROTO(void *ctx, void *req, u64 user_data, int res, unsigned cflags,
-		 u64 extra1, u64 extra2),
+TP_PROTO(struct io_ring_ctx *ctx, void *req, struct io_uring_cqe *cqe),
 
-	TP_ARGS(ctx, req, user_data, res, cflags, extra1, extra2),
+	TP_ARGS(ctx, req, cqe),
 
 	TP_STRUCT__entry (
 		__field(  void *,	ctx		)
@@ -343,11 +337,11 @@ TRACE_EVENT(io_uring_complete,
 	TP_fast_assign(
 		__entry->ctx		= ctx;
 		__entry->req		= req;
-		__entry->user_data	= user_data;
-		__entry->res		= res;
-		__entry->cflags		= cflags;
-		__entry->extra1		= extra1;
-		__entry->extra2		= extra2;
+		__entry->user_data	= cqe->user_data;
+		__entry->res		= cqe->res;
+		__entry->cflags		= cqe->flags;
+		__entry->extra1		= io_ctx_cqe32(ctx) ? cqe->big_cqe[0] : 0;
+		__entry->extra2		= io_ctx_cqe32(ctx) ? cqe->big_cqe[1] : 0;
 	),
 
 	TP_printk("ring %p, req %p, user_data 0x%llx, result %d, cflags 0x%x "
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index fa9d31034c62..58b401900b41 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -828,8 +828,6 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res,
 	 * the ring.
 	 */
 	if (likely(io_get_cqe(ctx, &cqe))) {
-		trace_io_uring_complete(ctx, NULL, user_data, res, cflags, 0, 0);
-
 		WRITE_ONCE(cqe->user_data, user_data);
 		WRITE_ONCE(cqe->res, res);
 		WRITE_ONCE(cqe->flags, cflags);
@@ -838,6 +836,8 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res,
 			WRITE_ONCE(cqe->big_cqe[0], 0);
 			WRITE_ONCE(cqe->big_cqe[1], 0);
 		}
+
+		trace_io_uring_complete(ctx, NULL, cqe);
 		return true;
 	}
 	return false;
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 70b6675941ff..9cd9a127e9ed 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -189,16 +189,15 @@ static __always_inline bool io_fill_cqe_req(struct io_ring_ctx *ctx,
 	if (unlikely(!io_get_cqe(ctx, &cqe)))
 		return false;
 
-	if (trace_io_uring_complete_enabled())
-		trace_io_uring_complete(req->ctx, req, req->cqe.user_data,
-					req->cqe.res, req->cqe.flags,
-					req->big_cqe.extra1, req->big_cqe.extra2);
 
 	memcpy(cqe, &req->cqe, sizeof(*cqe));
 	if (ctx->flags & IORING_SETUP_CQE32) {
 		memcpy(cqe->big_cqe, &req->big_cqe, sizeof(*cqe));
 		memset(&req->big_cqe, 0, sizeof(req->big_cqe));
 	}
+
+	if (trace_io_uring_complete_enabled())
+		trace_io_uring_complete(req->ctx, req, cqe);
 	return true;
 }
 
-- 
cgit v1.2.3


From e6d43739d0ee49a39505d696ba6a656f47c2bd39 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 16 Oct 2024 15:54:06 -0600
Subject: io_uring: kill 'imu' from struct io_kiocb

It's no longer being used, remove it.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 391087144666..6d3ee71bd832 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -613,9 +613,6 @@ struct io_kiocb {
 	struct task_struct		*task;
 
 	union {
-		/* store used ubuf, so we can prevent reloading */
-		struct io_mapped_ubuf	*imu;
-
 		/* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
 		struct io_buffer	*kbuf;
 
-- 
cgit v1.2.3


From 79cfe9e59c2a12c3b3faeeefe38d23f3d8030972 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 21 Oct 2024 13:34:10 -0600
Subject: io_uring/register: add IORING_REGISTER_RESIZE_RINGS

Once a ring has been created, the size of the CQ and SQ rings are fixed.
Usually this isn't a problem on the SQ ring side, as it merely controls
the available number of requests that can be submitted in a single
system call, and there's rarely a need to change that.

For the CQ ring, it's a different story. For most efficient use of
io_uring, it's important that the CQ ring never overflows. This means
that applications must size it for the worst case scenario, which can
be wasteful.

Add IORING_REGISTER_RESIZE_RINGS, which allows an application to resize
the existing rings. It takes a struct io_uring_params argument, the same
one which is used to setup the ring initially, and resizes rings
according to the sizes given.

Certain properties are always inherited from the original ring setup,
like SQE128/CQE32 and other setup options. The implementation only
allows flag associated with how the CQ ring is sized and clamped.

Existing unconsumed SQE and CQE entries are copied as part of the
process. If either the SQ or CQ resized destination ring cannot hold the
entries already present in the source rings, then the operation is failed
with -EOVERFLOW. Any register op holds ->uring_lock, which prevents new
submissions, and the internal mapping holds the completion lock as well
across moving CQ ring state.

To prevent races between mmap and ring resizing, add a mutex that's
solely used to serialize ring resize and mmap. mmap_sem can't be used
here, as as fork'ed process may be doing mmaps on the ring as well.
The ctx->resize_lock is held across mmap operations, and the resize
will grab it before swapping out the already mapped new data.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h |   7 ++
 include/uapi/linux/io_uring.h  |   5 +
 io_uring/io_uring.c            |   1 +
 io_uring/memmap.c              |   8 ++
 io_uring/register.c            | 215 +++++++++++++++++++++++++++++++++++++++++
 5 files changed, 236 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 6d3ee71bd832..841579dcdae9 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -415,6 +415,13 @@ struct io_ring_ctx {
 	/* protected by ->completion_lock */
 	unsigned			evfd_last_cq_tail;
 
+	/*
+	 * Protection for resize vs mmap races - both the mmap and resize
+	 * side will need to grab this lock, to prevent either side from
+	 * being run concurrently with the other.
+	 */
+	struct mutex			resize_lock;
+
 	/*
 	 * If IORING_SETUP_NO_MMAP is used, then the below holds
 	 * the gup'ed pages for the two rings, and the sqes.
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 86cb385fe0b5..60b9c98595fa 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -615,6 +615,11 @@ enum io_uring_register_op {
 	/* send MSG_RING without having a ring */
 	IORING_REGISTER_SEND_MSG_RING		= 31,
 
+	/* 32 reserved for zc rx */
+
+	/* resize CQ ring */
+	IORING_REGISTER_RESIZE_RINGS		= 33,
+
 	/* this goes last */
 	IORING_REGISTER_LAST,
 
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index b5974bdad48b..140cd47fbdb3 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -353,6 +353,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
 	INIT_HLIST_HEAD(&ctx->cancelable_uring_cmd);
 	io_napi_init(ctx);
+	mutex_init(&ctx->resize_lock);
 
 	return ctx;
 
diff --git a/io_uring/memmap.c b/io_uring/memmap.c
index d614824e17bd..85c66fa54956 100644
--- a/io_uring/memmap.c
+++ b/io_uring/memmap.c
@@ -251,6 +251,8 @@ __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
 	unsigned int npages;
 	void *ptr;
 
+	guard(mutex)(&ctx->resize_lock);
+
 	ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz);
 	if (IS_ERR(ptr))
 		return PTR_ERR(ptr);
@@ -274,6 +276,7 @@ unsigned long io_uring_get_unmapped_area(struct file *filp, unsigned long addr,
 					 unsigned long len, unsigned long pgoff,
 					 unsigned long flags)
 {
+	struct io_ring_ctx *ctx = filp->private_data;
 	void *ptr;
 
 	/*
@@ -284,6 +287,8 @@ unsigned long io_uring_get_unmapped_area(struct file *filp, unsigned long addr,
 	if (addr)
 		return -EINVAL;
 
+	guard(mutex)(&ctx->resize_lock);
+
 	ptr = io_uring_validate_mmap_request(filp, pgoff, len);
 	if (IS_ERR(ptr))
 		return -ENOMEM;
@@ -329,8 +334,11 @@ unsigned long io_uring_get_unmapped_area(struct file *file, unsigned long addr,
 					 unsigned long len, unsigned long pgoff,
 					 unsigned long flags)
 {
+	struct io_ring_ctx *ctx = file->private_data;
 	void *ptr;
 
+	guard(mutex)(&ctx->resize_lock);
+
 	ptr = io_uring_validate_mmap_request(file, pgoff, len);
 	if (IS_ERR(ptr))
 		return PTR_ERR(ptr);
diff --git a/io_uring/register.c b/io_uring/register.c
index 52b2f9b74af8..fc6c94d694b2 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -29,6 +29,7 @@
 #include "napi.h"
 #include "eventfd.h"
 #include "msg_ring.h"
+#include "memmap.h"
 
 #define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
 				 IORING_REGISTER_LAST + IORING_OP_LAST)
@@ -361,6 +362,214 @@ static int io_register_clock(struct io_ring_ctx *ctx,
 	return 0;
 }
 
+/*
+ * State to maintain until we can swap. Both new and old state, used for
+ * either mapping or freeing.
+ */
+struct io_ring_ctx_rings {
+	unsigned short n_ring_pages;
+	unsigned short n_sqe_pages;
+	struct page **ring_pages;
+	struct page **sqe_pages;
+	struct io_uring_sqe *sq_sqes;
+	struct io_rings *rings;
+};
+
+static void io_register_free_rings(struct io_uring_params *p,
+				   struct io_ring_ctx_rings *r)
+{
+	if (!(p->flags & IORING_SETUP_NO_MMAP)) {
+		io_pages_unmap(r->rings, &r->ring_pages, &r->n_ring_pages,
+				true);
+		io_pages_unmap(r->sq_sqes, &r->sqe_pages, &r->n_sqe_pages,
+				true);
+	} else {
+		io_pages_free(&r->ring_pages, r->n_ring_pages);
+		io_pages_free(&r->sqe_pages, r->n_sqe_pages);
+		vunmap(r->rings);
+		vunmap(r->sq_sqes);
+	}
+}
+
+#define swap_old(ctx, o, n, field)		\
+	do {					\
+		(o).field = (ctx)->field;	\
+		(ctx)->field = (n).field;	\
+	} while (0)
+
+#define RESIZE_FLAGS	(IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP)
+#define COPY_FLAGS	(IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \
+			 IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP)
+
+static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
+{
+	struct io_ring_ctx_rings o = { }, n = { }, *to_free = NULL;
+	size_t size, sq_array_offset;
+	struct io_uring_params p;
+	unsigned i, tail;
+	void *ptr;
+	int ret;
+
+	/* for single issuer, must be owner resizing */
+	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER &&
+	    current != ctx->submitter_task)
+		return -EEXIST;
+	if (copy_from_user(&p, arg, sizeof(p)))
+		return -EFAULT;
+	if (p.flags & ~RESIZE_FLAGS)
+		return -EINVAL;
+
+	/* properties that are always inherited */
+	p.flags |= (ctx->flags & COPY_FLAGS);
+
+	ret = io_uring_fill_params(p.sq_entries, &p);
+	if (unlikely(ret))
+		return ret;
+
+	/* nothing to do, but copy params back */
+	if (p.sq_entries == ctx->sq_entries && p.cq_entries == ctx->cq_entries) {
+		if (copy_to_user(arg, &p, sizeof(p)))
+			return -EFAULT;
+		return 0;
+	}
+
+	size = rings_size(p.flags, p.sq_entries, p.cq_entries,
+				&sq_array_offset);
+	if (size == SIZE_MAX)
+		return -EOVERFLOW;
+
+	if (!(p.flags & IORING_SETUP_NO_MMAP))
+		n.rings = io_pages_map(&n.ring_pages, &n.n_ring_pages, size);
+	else
+		n.rings = __io_uaddr_map(&n.ring_pages, &n.n_ring_pages,
+						p.cq_off.user_addr, size);
+	if (IS_ERR(n.rings))
+		return PTR_ERR(n.rings);
+
+	n.rings->sq_ring_mask = p.sq_entries - 1;
+	n.rings->cq_ring_mask = p.cq_entries - 1;
+	n.rings->sq_ring_entries = p.sq_entries;
+	n.rings->cq_ring_entries = p.cq_entries;
+
+	if (copy_to_user(arg, &p, sizeof(p))) {
+		io_register_free_rings(&p, &n);
+		return -EFAULT;
+	}
+
+	if (p.flags & IORING_SETUP_SQE128)
+		size = array_size(2 * sizeof(struct io_uring_sqe), p.sq_entries);
+	else
+		size = array_size(sizeof(struct io_uring_sqe), p.sq_entries);
+	if (size == SIZE_MAX) {
+		io_register_free_rings(&p, &n);
+		return -EOVERFLOW;
+	}
+
+	if (!(p.flags & IORING_SETUP_NO_MMAP))
+		ptr = io_pages_map(&n.sqe_pages, &n.n_sqe_pages, size);
+	else
+		ptr = __io_uaddr_map(&n.sqe_pages, &n.n_sqe_pages,
+					p.sq_off.user_addr,
+					size);
+	if (IS_ERR(ptr)) {
+		io_register_free_rings(&p, &n);
+		return PTR_ERR(ptr);
+	}
+
+	/*
+	 * If using SQPOLL, park the thread
+	 */
+	if (ctx->sq_data) {
+		mutex_unlock(&ctx->uring_lock);
+		io_sq_thread_park(ctx->sq_data);
+		mutex_lock(&ctx->uring_lock);
+	}
+
+	/*
+	 * We'll do the swap. Grab the ctx->resize_lock, which will exclude
+	 * any new mmap's on the ring fd. Clear out existing mappings to prevent
+	 * mmap from seeing them, as we'll unmap them. Any attempt to mmap
+	 * existing rings beyond this point will fail. Not that it could proceed
+	 * at this point anyway, as the io_uring mmap side needs go grab the
+	 * ctx->resize_lock as well. Likewise, hold the completion lock over the
+	 * duration of the actual swap.
+	 */
+	mutex_lock(&ctx->resize_lock);
+	spin_lock(&ctx->completion_lock);
+	o.rings = ctx->rings;
+	ctx->rings = NULL;
+	o.sq_sqes = ctx->sq_sqes;
+	ctx->sq_sqes = NULL;
+
+	/*
+	 * Now copy SQ and CQ entries, if any. If either of the destination
+	 * rings can't hold what is already there, then fail the operation.
+	 */
+	n.sq_sqes = ptr;
+	tail = o.rings->sq.tail;
+	if (tail - o.rings->sq.head > p.sq_entries)
+		goto overflow;
+	for (i = o.rings->sq.head; i < tail; i++) {
+		unsigned src_head = i & (ctx->sq_entries - 1);
+		unsigned dst_head = i & n.rings->sq_ring_mask;
+
+		n.sq_sqes[dst_head] = o.sq_sqes[src_head];
+	}
+	n.rings->sq.head = o.rings->sq.head;
+	n.rings->sq.tail = o.rings->sq.tail;
+
+	tail = o.rings->cq.tail;
+	if (tail - o.rings->cq.head > p.cq_entries) {
+overflow:
+		/* restore old rings, and return -EOVERFLOW via cleanup path */
+		ctx->rings = o.rings;
+		ctx->sq_sqes = o.sq_sqes;
+		to_free = &n;
+		ret = -EOVERFLOW;
+		goto out;
+	}
+	for (i = o.rings->cq.head; i < tail; i++) {
+		unsigned src_head = i & (ctx->cq_entries - 1);
+		unsigned dst_head = i & n.rings->cq_ring_mask;
+
+		n.rings->cqes[dst_head] = o.rings->cqes[src_head];
+	}
+	n.rings->cq.head = o.rings->cq.head;
+	n.rings->cq.tail = o.rings->cq.tail;
+	/* invalidate cached cqe refill */
+	ctx->cqe_cached = ctx->cqe_sentinel = NULL;
+
+	n.rings->sq_dropped = o.rings->sq_dropped;
+	n.rings->sq_flags = o.rings->sq_flags;
+	n.rings->cq_flags = o.rings->cq_flags;
+	n.rings->cq_overflow = o.rings->cq_overflow;
+
+	/* all done, store old pointers and assign new ones */
+	if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
+		ctx->sq_array = (u32 *)((char *)n.rings + sq_array_offset);
+
+	ctx->sq_entries = p.sq_entries;
+	ctx->cq_entries = p.cq_entries;
+
+	ctx->rings = n.rings;
+	ctx->sq_sqes = n.sq_sqes;
+	swap_old(ctx, o, n, n_ring_pages);
+	swap_old(ctx, o, n, n_sqe_pages);
+	swap_old(ctx, o, n, ring_pages);
+	swap_old(ctx, o, n, sqe_pages);
+	to_free = &o;
+	ret = 0;
+out:
+	spin_unlock(&ctx->completion_lock);
+	mutex_unlock(&ctx->resize_lock);
+	io_register_free_rings(&p, to_free);
+
+	if (ctx->sq_data)
+		io_sq_thread_unpark(ctx->sq_data);
+
+	return ret;
+}
+
 static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			       void __user *arg, unsigned nr_args)
 	__releases(ctx->uring_lock)
@@ -549,6 +758,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			break;
 		ret = io_register_clone_buffers(ctx, arg);
 		break;
+	case IORING_REGISTER_RESIZE_RINGS:
+		ret = -EINVAL;
+		if (!arg || nr_args != 1)
+			break;
+		ret = io_register_resize_rings(ctx, arg);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
-- 
cgit v1.2.3


From aa00f67adc2c0d6439f81b5a81ff181377c47a7e Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 22 Oct 2024 13:47:00 -0600
Subject: io_uring: add support for fixed wait regions

Generally applications have 1 or a few waits of waiting, yet they pass
in a struct io_uring_getevents_arg every time. This needs to get copied
and, in turn, the timeout value needs to get copied.

Rather than do this for every invocation, allow the application to
register a fixed set of wait regions that can simply be indexed when
asking the kernel to wait on events.

At ring setup time, the application can register a number of these wait
regions and initialize region/index 0 upfront:

	struct io_uring_reg_wait *reg;

	reg = io_uring_setup_reg_wait(ring, nr_regions, &ret);

	/* set timeout and mark as set, sigmask/sigmask_sz as needed */
	reg->ts.tv_sec = 0;
	reg->ts.tv_nsec = 100000;
	reg->flags = IORING_REG_WAIT_TS;

where nr_regions >= 1 && nr_regions <= PAGE_SIZE / sizeof(*reg). The
above initializes index 0, but 63 other regions can be initialized,
if needed. Now, instead of doing:

	struct __kernel_timespec timeout = { .tv_nsec = 100000, };

	io_uring_submit_and_wait_timeout(ring, &cqe, nr, &t, NULL);

to wait for events for each submit_and_wait, or just wait, operation, it
can just reference the above region at offset 0 and do:

	io_uring_submit_and_wait_reg(ring, &cqe, nr, 0);

to achieve the same goal of waiting 100usec without needing to copy
both struct io_uring_getevents_arg (24b) and struct __kernel_timeout
(16b) for each invocation. Struct io_uring_reg_wait looks as follows:

struct io_uring_reg_wait {
	struct __kernel_timespec	ts;
	__u32				min_wait_usec;
	__u32				flags;
	__u64				sigmask;
	__u32				sigmask_sz;
	__u32				pad[3];
	__u64				pad2[2];
};

embedding the timeout itself in the region, rather than passing it as
a pointer as well. Note that the signal mask is still passed as a
pointer, both for compatability reasons, but also because there doesn't
seem to be a lot of high frequency waits scenarios that involve setting
and resetting the signal mask for each wait.

The application is free to modify any region before a wait call, or it
can use keep multiple regions with different settings to avoid needing to
modify the same one for wait calls. Up to a page size of regions is mapped
by default, allowing PAGE_SIZE / 64 available regions for use.

The registered region must fit within a page. On a 4kb page size system,
that allows for 64 wait regions if a full page is used, as the size of
struct io_uring_reg_wait is 64b. The region registered must be aligned
to io_uring_reg_wait in size. It's valid to register less than 64
entries.

In network performance testing with zero-copy, this reduced the time
spent waiting on the TX side from 3.12% to 0.3% and the RX side from 4.4%
to 0.3%.

Wait regions are fixed for the lifetime of the ring - once registered,
they are persistent until the ring is torn down. The regions support
minimum wait timeout as well as the regular waits.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h | 10 ++++++
 include/uapi/linux/io_uring.h  | 41 +++++++++++++++++++++
 io_uring/io_uring.c            | 68 +++++++++++++++++++++++++++++------
 io_uring/register.c            | 82 ++++++++++++++++++++++++++++++++++++++++++
 io_uring/register.h            |  1 +
 5 files changed, 191 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 841579dcdae9..2f12828b22a4 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -327,6 +327,14 @@ struct io_ring_ctx {
 		atomic_t		cq_wait_nr;
 		atomic_t		cq_timeouts;
 		struct wait_queue_head	cq_wait;
+
+		/*
+		 * If registered with IORING_REGISTER_CQWAIT_REG, a single
+		 * page holds N entries, mapped in cq_wait_arg. cq_wait_index
+		 * is the maximum allowable index.
+		 */
+		struct io_uring_reg_wait	*cq_wait_arg;
+		unsigned char			cq_wait_index;
 	} ____cacheline_aligned_in_smp;
 
 	/* timeouts */
@@ -430,6 +438,8 @@ struct io_ring_ctx {
 	unsigned short			n_sqe_pages;
 	struct page			**ring_pages;
 	struct page			**sqe_pages;
+
+	struct page			**cq_wait_page;
 };
 
 struct io_tw_state {
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 60b9c98595fa..65b7417c1b05 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -518,6 +518,7 @@ struct io_cqring_offsets {
 #define IORING_ENTER_EXT_ARG		(1U << 3)
 #define IORING_ENTER_REGISTERED_RING	(1U << 4)
 #define IORING_ENTER_ABS_TIMER		(1U << 5)
+#define IORING_ENTER_EXT_ARG_REG	(1U << 6)
 
 /*
  * Passed in for io_uring_setup(2). Copied back with updated info on success
@@ -620,6 +621,9 @@ enum io_uring_register_op {
 	/* resize CQ ring */
 	IORING_REGISTER_RESIZE_RINGS		= 33,
 
+	/* register fixed io_uring_reg_wait arguments */
+	IORING_REGISTER_CQWAIT_REG		= 34,
+
 	/* this goes last */
 	IORING_REGISTER_LAST,
 
@@ -803,6 +807,43 @@ enum io_uring_register_restriction_op {
 	IORING_RESTRICTION_LAST
 };
 
+enum {
+	IORING_REG_WAIT_TS		= (1U << 0),
+};
+
+/*
+ * Argument for IORING_REGISTER_CQWAIT_REG, registering a region of
+ * struct io_uring_reg_wait that can be indexed when io_uring_enter(2) is
+ * called rather than pass in a wait argument structure separately.
+ */
+struct io_uring_cqwait_reg_arg {
+	__u32		flags;
+	__u32		struct_size;
+	__u32		nr_entries;
+	__u32		pad;
+	__u64		user_addr;
+	__u64		pad2[3];
+};
+
+/*
+ * Argument for io_uring_enter(2) with
+ * IORING_GETEVENTS | IORING_ENTER_EXT_ARG_REG set, where the actual argument
+ * is an index into a previously registered fixed wait region described by
+ * the below structure.
+ */
+struct io_uring_reg_wait {
+	struct __kernel_timespec	ts;
+	__u32				min_wait_usec;
+	__u32				flags;
+	__u64				sigmask;
+	__u32				sigmask_sz;
+	__u32				pad[3];
+	__u64				pad2[2];
+};
+
+/*
+ * Argument for io_uring_enter(2) with IORING_GETEVENTS | IORING_ENTER_EXT_ARG
+ */
 struct io_uring_getevents_arg {
 	__u64	sigmask;
 	__u32	sigmask_sz;
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 4cd0ee52710d..2863b957e373 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2736,6 +2736,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 	io_alloc_cache_free(&ctx->msg_cache, io_msg_cache_free);
 	io_futex_cache_free(ctx);
 	io_destroy_buffers(ctx);
+	io_unregister_cqwait_reg(ctx);
 	mutex_unlock(&ctx->uring_lock);
 	if (ctx->sq_creds)
 		put_cred(ctx->sq_creds);
@@ -3224,21 +3225,43 @@ void __io_uring_cancel(bool cancel_all)
 	io_uring_cancel_generic(cancel_all, NULL);
 }
 
-static int io_validate_ext_arg(unsigned flags, const void __user *argp, size_t argsz)
+static struct io_uring_reg_wait *io_get_ext_arg_reg(struct io_ring_ctx *ctx,
+			const struct io_uring_getevents_arg __user *uarg)
 {
-	if (flags & IORING_ENTER_EXT_ARG) {
-		struct io_uring_getevents_arg arg;
+	struct io_uring_reg_wait *arg = READ_ONCE(ctx->cq_wait_arg);
 
-		if (argsz != sizeof(arg))
+	if (arg) {
+		unsigned int index = (unsigned int) (uintptr_t) uarg;
+
+		if (index <= ctx->cq_wait_index)
+			return arg + index;
+	}
+
+	return ERR_PTR(-EFAULT);
+}
+
+static int io_validate_ext_arg(struct io_ring_ctx *ctx, unsigned flags,
+			       const void __user *argp, size_t argsz)
+{
+	struct io_uring_getevents_arg arg;
+
+	if (!(flags & IORING_ENTER_EXT_ARG))
+		return 0;
+
+	if (flags & IORING_ENTER_EXT_ARG_REG) {
+		if (argsz != sizeof(struct io_uring_reg_wait))
 			return -EINVAL;
-		if (copy_from_user(&arg, argp, sizeof(arg)))
-			return -EFAULT;
+		return PTR_ERR(io_get_ext_arg_reg(ctx, argp));
 	}
+	if (argsz != sizeof(arg))
+		return -EINVAL;
+	if (copy_from_user(&arg, argp, sizeof(arg)))
+		return -EFAULT;
 	return 0;
 }
 
-static int io_get_ext_arg(unsigned flags, const void __user *argp,
-			  struct ext_arg *ext_arg)
+static int io_get_ext_arg(struct io_ring_ctx *ctx, unsigned flags,
+			  const void __user *argp, struct ext_arg *ext_arg)
 {
 	const struct io_uring_getevents_arg __user *uarg = argp;
 	struct io_uring_getevents_arg arg;
@@ -3252,6 +3275,28 @@ static int io_get_ext_arg(unsigned flags, const void __user *argp,
 		return 0;
 	}
 
+	if (flags & IORING_ENTER_EXT_ARG_REG) {
+		struct io_uring_reg_wait *w;
+
+		if (ext_arg->argsz != sizeof(struct io_uring_reg_wait))
+			return -EINVAL;
+		w = io_get_ext_arg_reg(ctx, argp);
+		if (IS_ERR(w))
+			return PTR_ERR(w);
+
+		if (w->flags & ~IORING_REG_WAIT_TS)
+			return -EINVAL;
+		ext_arg->min_time = READ_ONCE(w->min_wait_usec) * NSEC_PER_USEC;
+		ext_arg->sig = u64_to_user_ptr(READ_ONCE(w->sigmask));
+		ext_arg->argsz = READ_ONCE(w->sigmask_sz);
+		if (w->flags & IORING_REG_WAIT_TS) {
+			ext_arg->ts.tv_sec = READ_ONCE(w->ts.tv_sec);
+			ext_arg->ts.tv_nsec = READ_ONCE(w->ts.tv_nsec);
+			ext_arg->ts_set = true;
+		}
+		return 0;
+	}
+
 	/*
 	 * EXT_ARG is set - ensure we agree on the size of it and copy in our
 	 * timespec and sigset_t pointers if good.
@@ -3297,7 +3342,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 	if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
 			       IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG |
 			       IORING_ENTER_REGISTERED_RING |
-			       IORING_ENTER_ABS_TIMER)))
+			       IORING_ENTER_ABS_TIMER |
+			       IORING_ENTER_EXT_ARG_REG)))
 		return -EINVAL;
 
 	/*
@@ -3380,7 +3426,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 			 */
 			mutex_lock(&ctx->uring_lock);
 iopoll_locked:
-			ret2 = io_validate_ext_arg(flags, argp, argsz);
+			ret2 = io_validate_ext_arg(ctx, flags, argp, argsz);
 			if (likely(!ret2)) {
 				min_complete = min(min_complete,
 						   ctx->cq_entries);
@@ -3390,7 +3436,7 @@ iopoll_locked:
 		} else {
 			struct ext_arg ext_arg = { .argsz = argsz };
 
-			ret2 = io_get_ext_arg(flags, argp, &ext_arg);
+			ret2 = io_get_ext_arg(ctx, flags, argp, &ext_arg);
 			if (likely(!ret2)) {
 				min_complete = min(min_complete,
 						   ctx->cq_entries);
diff --git a/io_uring/register.c b/io_uring/register.c
index fc6c94d694b2..1eb686eaa310 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -570,6 +570,82 @@ out:
 	return ret;
 }
 
+void io_unregister_cqwait_reg(struct io_ring_ctx *ctx)
+{
+	unsigned short npages = 1;
+
+	if (!ctx->cq_wait_page)
+		return;
+
+	io_pages_unmap(ctx->cq_wait_arg, &ctx->cq_wait_page, &npages, true);
+	ctx->cq_wait_arg = NULL;
+	if (ctx->user)
+		__io_unaccount_mem(ctx->user, 1);
+}
+
+/*
+ * Register a page holding N entries of struct io_uring_reg_wait, which can
+ * be used via io_uring_enter(2) if IORING_GETEVENTS_EXT_ARG_REG is set.
+ * If that is set with IORING_GETEVENTS_EXT_ARG, then instead of passing
+ * in a pointer for a struct io_uring_getevents_arg, an index into this
+ * registered array is passed, avoiding two (arg + timeout) copies per
+ * invocation.
+ */
+static int io_register_cqwait_reg(struct io_ring_ctx *ctx, void __user *uarg)
+{
+	struct io_uring_cqwait_reg_arg arg;
+	struct io_uring_reg_wait *reg;
+	struct page **pages;
+	unsigned long len;
+	int nr_pages, poff;
+	int ret;
+
+	if (ctx->cq_wait_page || ctx->cq_wait_arg)
+		return -EBUSY;
+	if (copy_from_user(&arg, uarg, sizeof(arg)))
+		return -EFAULT;
+	if (!arg.nr_entries || arg.flags)
+		return -EINVAL;
+	if (arg.struct_size != sizeof(*reg))
+		return -EINVAL;
+	if (check_mul_overflow(arg.struct_size, arg.nr_entries, &len))
+		return -EOVERFLOW;
+	if (len > PAGE_SIZE)
+		return -EINVAL;
+	/* offset + len must fit within a page, and must be reg_wait aligned */
+	poff = arg.user_addr & ~PAGE_MASK;
+	if (len + poff > PAGE_SIZE)
+		return -EINVAL;
+	if (poff % arg.struct_size)
+		return -EINVAL;
+
+	pages = io_pin_pages(arg.user_addr, len, &nr_pages);
+	if (IS_ERR(pages))
+		return PTR_ERR(pages);
+	ret = -EINVAL;
+	if (nr_pages != 1)
+		goto out_free;
+	if (ctx->user) {
+		ret = __io_account_mem(ctx->user, 1);
+		if (ret)
+			goto out_free;
+	}
+
+	reg = vmap(pages, 1, VM_MAP, PAGE_KERNEL);
+	if (reg) {
+		ctx->cq_wait_index = arg.nr_entries - 1;
+		WRITE_ONCE(ctx->cq_wait_page, pages);
+		WRITE_ONCE(ctx->cq_wait_arg, (void *) reg + poff);
+		return 0;
+	}
+	ret = -ENOMEM;
+	if (ctx->user)
+		__io_unaccount_mem(ctx->user, 1);
+out_free:
+	io_pages_free(&pages, nr_pages);
+	return ret;
+}
+
 static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			       void __user *arg, unsigned nr_args)
 	__releases(ctx->uring_lock)
@@ -764,6 +840,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			break;
 		ret = io_register_resize_rings(ctx, arg);
 		break;
+	case IORING_REGISTER_CQWAIT_REG:
+		ret = -EINVAL;
+		if (!arg || nr_args != 1)
+			break;
+		ret = io_register_cqwait_reg(ctx, arg);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/io_uring/register.h b/io_uring/register.h
index a5f39d5ef9e0..3e935e8fa4b2 100644
--- a/io_uring/register.h
+++ b/io_uring/register.h
@@ -5,5 +5,6 @@
 int io_eventfd_unregister(struct io_ring_ctx *ctx);
 int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id);
 struct file *io_uring_register_get_file(unsigned int fd, bool registered);
+void io_unregister_cqwait_reg(struct io_ring_ctx *ctx);
 
 #endif
-- 
cgit v1.2.3


From ff1256b8f3c45f222bce19fbfc1e1bc498b31d03 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 25 Oct 2024 08:54:28 -0600
Subject: io_uring/rsrc: move struct io_fixed_file to rsrc.h header

There's no need for this internal structure to be visible, move it to
the private rsrc.h header instead.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h | 5 -----
 io_uring/filetable.h           | 1 +
 io_uring/rsrc.h                | 5 +++++
 3 files changed, 6 insertions(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 2f12828b22a4..d4ba4ae480d6 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -55,11 +55,6 @@ struct io_wq_work {
 	int cancel_seq;
 };
 
-struct io_fixed_file {
-	/* file * with additional FFS_* flags */
-	unsigned long file_ptr;
-};
-
 struct io_file_table {
 	struct io_fixed_file *files;
 	unsigned long *bitmap;
diff --git a/io_uring/filetable.h b/io_uring/filetable.h
index b2435c4dca1f..c027ed4ad68d 100644
--- a/io_uring/filetable.h
+++ b/io_uring/filetable.h
@@ -4,6 +4,7 @@
 
 #include <linux/file.h>
 #include <linux/io_uring_types.h>
+#include "rsrc.h"
 
 bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files);
 void io_free_file_tables(struct io_file_table *table);
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index c50d4be4aa6d..e072fb3ee351 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -40,6 +40,11 @@ struct io_rsrc_node {
 	struct io_rsrc_put		item;
 };
 
+struct io_fixed_file {
+	/* file * with additional FFS_* flags */
+	unsigned long file_ptr;
+};
+
 struct io_mapped_ubuf {
 	u64		ubuf;
 	unsigned int	len;
-- 
cgit v1.2.3


From 7029acd8a950393ee3a3d8e1a7ee1a9b77808a3b Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 25 Oct 2024 19:27:39 -0600
Subject: io_uring/rsrc: get rid of per-ring io_rsrc_node list

Work in progress, but get rid of the per-ring serialization of resource
nodes, like registered buffers and files. Main issue here is that one
node can otherwise hold up a bunch of other nodes from getting freed,
which is especially a problem for file resource nodes and networked
workloads where some descriptors may not see activity in a long time.

As an example, instantiate an io_uring ring fd and create a sparse
registered file table. Even 2 will do. Then create a socket and register
it as fixed file 0, F0. The number of open files in the app is now 5,
with 0/1/2 being the usual stdin/out/err, 3 being the ring fd, and 4
being the socket. Register this socket (eg "the listener") in slot 0 of
the registered file table. Now add an operation on the socket that uses
slot 0. Finally, loop N times, where each loop creates a new socket,
registers said socket as a file, then unregisters the socket, and
finally closes the socket. This is roughly similar to what a basic
accept loop would look like.

At the end of this loop, it's not unreasonable to expect that there
would still be 5 open files. Each socket created and registered in the
loop is also unregistered and closed. But since the listener socket
registered first still has references to its resource node due to still
being active, each subsequent socket unregistration is stuck behind it
for reclaim. Hence 5 + N files are still open at that point, where N is
awaiting the final put held up by the listener socket.

Rewrite the io_rsrc_node handling to NOT rely on serialization. Struct
io_kiocb now gets explicit resource nodes assigned, with each holding a
reference to the parent node. A parent node is either of type FILE or
BUFFER, which are the two types of nodes that exist. A request can have
two nodes assigned, if it's using both registered files and buffers.
Since request issue and task_work completion is both under the ring
private lock, no atomics are needed to handle these references. It's a
simple unlocked inc/dec. As before, the registered buffer or file table
each hold a reference as well to the registered nodes. Final put of the
node will remove the node and free the underlying resource, eg unmap the
buffer or put the file.

Outside of removing the stall in resource reclaim described above, it
has the following advantages:

1) It's a lot simpler than the previous scheme, and easier to follow.
   No need to specific quiesce handling anymore.

2) There are no resource node allocations in the fast path, all of that
   happens at resource registration time.

3) The structs related to resource handling can all get simplified
   quite a bit, like io_rsrc_node and io_rsrc_data. io_rsrc_put can
   go away completely.

4) Handling of resource tags is much simpler, and doesn't require
   persistent storage as it can simply get assigned up front at
   registration time. Just copy them in one-by-one at registration time
   and assign to the resource node.

The only real downside is that a request is now explicitly limited to
pinning 2 resources, one file and one buffer, where before just
assigning a resource node to a request would pin all of them. The upside
is that it's easier to follow now, as an individual resource is
explicitly referenced and assigned to the request.

With this in place, the above mentioned example will be using exactly 5
files at the end of the loop, not N.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h |  10 +-
 io_uring/fdinfo.c              |   2 +-
 io_uring/filetable.c           |  52 ++---
 io_uring/filetable.h           |  25 +--
 io_uring/io_uring.c            |  38 +---
 io_uring/net.c                 |  11 +-
 io_uring/nop.c                 |   6 +-
 io_uring/notif.c               |   3 +-
 io_uring/rsrc.c                | 481 +++++++++++++++--------------------------
 io_uring/rsrc.h                |  71 ++----
 io_uring/rw.c                  |   8 +-
 io_uring/splice.c              |  16 +-
 io_uring/uring_cmd.c           |  12 +-
 13 files changed, 270 insertions(+), 465 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index d4ba4ae480d6..42c5f2c992c4 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -56,7 +56,7 @@ struct io_wq_work {
 };
 
 struct io_file_table {
-	struct io_fixed_file *files;
+	struct io_rsrc_node **nodes;
 	unsigned long *bitmap;
 	unsigned int alloc_hint;
 };
@@ -264,7 +264,6 @@ struct io_ring_ctx {
 		 * Fixed resources fast path, should be accessed only under
 		 * uring_lock, and updated through io_uring_register(2)
 		 */
-		struct io_rsrc_node	*rsrc_node;
 		atomic_t		cancel_seq;
 
 		/*
@@ -277,7 +276,7 @@ struct io_ring_ctx {
 		struct io_wq_work_list	iopoll_list;
 
 		struct io_file_table	file_table;
-		struct io_mapped_ubuf	**user_bufs;
+		struct io_rsrc_node	**user_bufs;
 		unsigned		nr_user_files;
 		unsigned		nr_user_bufs;
 
@@ -372,10 +371,7 @@ struct io_ring_ctx {
 	struct io_rsrc_data		*buf_data;
 
 	/* protected by ->uring_lock */
-	struct list_head		rsrc_ref_list;
 	struct io_alloc_cache		rsrc_node_cache;
-	struct wait_queue_head		rsrc_quiesce_wq;
-	unsigned			rsrc_quiesce;
 
 	u32			pers_next;
 	struct xarray		personalities;
@@ -642,7 +638,7 @@ struct io_kiocb {
 		__poll_t apoll_events;
 	};
 
-	struct io_rsrc_node		*rsrc_node;
+	struct io_rsrc_node		*rsrc_nodes[2];
 
 	atomic_t			refs;
 	bool				cancel_seq_set;
diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
index a6bac533edbe..064a79475c5f 100644
--- a/io_uring/fdinfo.c
+++ b/io_uring/fdinfo.c
@@ -176,7 +176,7 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
 	}
 	seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
 	for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) {
-		struct io_mapped_ubuf *buf = ctx->user_bufs[i];
+		struct io_mapped_ubuf *buf = ctx->user_bufs[i]->buf;
 
 		seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, buf->len);
 	}
diff --git a/io_uring/filetable.c b/io_uring/filetable.c
index 997c56d32ee6..1b12a9a1cc16 100644
--- a/io_uring/filetable.c
+++ b/io_uring/filetable.c
@@ -38,14 +38,14 @@ static int io_file_bitmap_get(struct io_ring_ctx *ctx)
 
 bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files)
 {
-	table->files = kvcalloc(nr_files, sizeof(table->files[0]),
-				GFP_KERNEL_ACCOUNT);
-	if (unlikely(!table->files))
+	table->nodes = kvmalloc_array(nr_files, sizeof(struct io_src_node *),
+					GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+	if (unlikely(!table->nodes))
 		return false;
 
 	table->bitmap = bitmap_zalloc(nr_files, GFP_KERNEL_ACCOUNT);
 	if (unlikely(!table->bitmap)) {
-		kvfree(table->files);
+		kvfree(table->nodes);
 		return false;
 	}
 
@@ -54,9 +54,9 @@ bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files)
 
 void io_free_file_tables(struct io_file_table *table)
 {
-	kvfree(table->files);
+	kvfree(table->nodes);
 	bitmap_free(table->bitmap);
-	table->files = NULL;
+	table->nodes = NULL;
 	table->bitmap = NULL;
 }
 
@@ -64,8 +64,7 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file,
 				 u32 slot_index)
 	__must_hold(&req->ctx->uring_lock)
 {
-	struct io_fixed_file *file_slot;
-	int ret;
+	struct io_rsrc_node *node;
 
 	if (io_is_uring_fops(file))
 		return -EBADF;
@@ -74,22 +73,18 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file,
 	if (slot_index >= ctx->nr_user_files)
 		return -EINVAL;
 
-	slot_index = array_index_nospec(slot_index, ctx->nr_user_files);
-	file_slot = io_fixed_file_slot(&ctx->file_table, slot_index);
-
-	if (file_slot->file_ptr) {
-		ret = io_queue_rsrc_removal(ctx->file_data, slot_index,
-					    io_slot_file(file_slot));
-		if (ret)
-			return ret;
+	node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
+	if (!node)
+		return -ENOMEM;
 
-		file_slot->file_ptr = 0;
-	} else {
+	slot_index = array_index_nospec(slot_index, ctx->nr_user_files);
+	if (ctx->file_table.nodes[slot_index])
+		io_put_rsrc_node(ctx->file_table.nodes[slot_index]);
+	else
 		io_file_bitmap_set(&ctx->file_table, slot_index);
-	}
 
-	*io_get_tag_slot(ctx->file_data, slot_index) = 0;
-	io_fixed_file_set(file_slot, file);
+	ctx->file_table.nodes[slot_index] = node;
+	io_fixed_file_set(node, file);
 	return 0;
 }
 
@@ -134,25 +129,16 @@ int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags,
 
 int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset)
 {
-	struct io_fixed_file *file_slot;
-	int ret;
-
 	if (unlikely(!ctx->file_data))
 		return -ENXIO;
 	if (offset >= ctx->nr_user_files)
 		return -EINVAL;
 
 	offset = array_index_nospec(offset, ctx->nr_user_files);
-	file_slot = io_fixed_file_slot(&ctx->file_table, offset);
-	if (!file_slot->file_ptr)
+	if (!ctx->file_table.nodes[offset])
 		return -EBADF;
-
-	ret = io_queue_rsrc_removal(ctx->file_data, offset,
-				    io_slot_file(file_slot));
-	if (ret)
-		return ret;
-
-	file_slot->file_ptr = 0;
+	io_put_rsrc_node(ctx->file_table.nodes[offset]);
+	ctx->file_table.nodes[offset] = NULL;
 	io_file_bitmap_clear(&ctx->file_table, offset);
 	return 0;
 }
diff --git a/io_uring/filetable.h b/io_uring/filetable.h
index c027ed4ad68d..47616079abaa 100644
--- a/io_uring/filetable.h
+++ b/io_uring/filetable.h
@@ -34,36 +34,35 @@ static inline void io_file_bitmap_set(struct io_file_table *table, int bit)
 	table->alloc_hint = bit + 1;
 }
 
-static inline struct io_fixed_file *
-io_fixed_file_slot(struct io_file_table *table, unsigned i)
-{
-	return &table->files[i];
-}
-
 #define FFS_NOWAIT		0x1UL
 #define FFS_ISREG		0x2UL
 #define FFS_MASK		~(FFS_NOWAIT|FFS_ISREG)
 
-static inline unsigned int io_slot_flags(struct io_fixed_file *slot)
+static inline unsigned int io_slot_flags(struct io_rsrc_node *node)
 {
-	return (slot->file_ptr & ~FFS_MASK) << REQ_F_SUPPORT_NOWAIT_BIT;
+
+	return (node->file_ptr & ~FFS_MASK) << REQ_F_SUPPORT_NOWAIT_BIT;
 }
 
-static inline struct file *io_slot_file(struct io_fixed_file *slot)
+static inline struct file *io_slot_file(struct io_rsrc_node *node)
 {
-	return (struct file *)(slot->file_ptr & FFS_MASK);
+	return (struct file *)(node->file_ptr & FFS_MASK);
 }
 
 static inline struct file *io_file_from_index(struct io_file_table *table,
 					      int index)
 {
-	return io_slot_file(io_fixed_file_slot(table, index));
+	struct io_rsrc_node *node = table->nodes[index];
+
+	if (node)
+		return io_slot_file(node);
+	return NULL;
 }
 
-static inline void io_fixed_file_set(struct io_fixed_file *file_slot,
+static inline void io_fixed_file_set(struct io_rsrc_node *node,
 				     struct file *file)
 {
-	file_slot->file_ptr = (unsigned long)file |
+	node->file_ptr = (unsigned long)file |
 		(io_file_get_flags(file) >> REQ_F_SUPPORT_NOWAIT_BIT);
 }
 
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index a09c67b38c1b..0876aa74c739 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -333,7 +333,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	mutex_init(&ctx->uring_lock);
 	init_waitqueue_head(&ctx->cq_wait);
 	init_waitqueue_head(&ctx->poll_wq);
-	init_waitqueue_head(&ctx->rsrc_quiesce_wq);
 	spin_lock_init(&ctx->completion_lock);
 	spin_lock_init(&ctx->timeout_lock);
 	INIT_WQ_LIST(&ctx->iopoll_list);
@@ -341,7 +340,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	INIT_LIST_HEAD(&ctx->defer_list);
 	INIT_LIST_HEAD(&ctx->timeout_list);
 	INIT_LIST_HEAD(&ctx->ltimeout_list);
-	INIT_LIST_HEAD(&ctx->rsrc_ref_list);
 	init_llist_head(&ctx->work_llist);
 	INIT_LIST_HEAD(&ctx->tctx_list);
 	ctx->submit_state.free_list.next = NULL;
@@ -1415,7 +1413,7 @@ static void io_free_batch_list(struct io_ring_ctx *ctx,
 				io_clean_op(req);
 		}
 		io_put_file(req);
-		io_put_rsrc_node(ctx, req->rsrc_node);
+		io_req_put_rsrc_nodes(req);
 		io_put_task(req->task);
 
 		node = req->comp_list.next;
@@ -1878,7 +1876,7 @@ inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
 				      unsigned int issue_flags)
 {
 	struct io_ring_ctx *ctx = req->ctx;
-	struct io_fixed_file *slot;
+	struct io_rsrc_node *node;
 	struct file *file = NULL;
 
 	io_ring_submit_lock(ctx, issue_flags);
@@ -1886,11 +1884,12 @@ inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
 	if (unlikely((unsigned int)fd >= ctx->nr_user_files))
 		goto out;
 	fd = array_index_nospec(fd, ctx->nr_user_files);
-	slot = io_fixed_file_slot(&ctx->file_table, fd);
-	if (!req->rsrc_node)
-		__io_req_set_rsrc_node(req, ctx);
-	req->flags |= io_slot_flags(slot);
-	file = io_slot_file(slot);
+	node = ctx->file_table.nodes[fd];
+	if (node) {
+		io_req_assign_rsrc_node(req, node);
+		req->flags |= io_slot_flags(node);
+		file = io_slot_file(node);
+	}
 out:
 	io_ring_submit_unlock(ctx, issue_flags);
 	return file;
@@ -2036,7 +2035,8 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	req->flags = (__force io_req_flags_t) sqe_flags;
 	req->cqe.user_data = READ_ONCE(sqe->user_data);
 	req->file = NULL;
-	req->rsrc_node = NULL;
+	req->rsrc_nodes[IORING_RSRC_FILE] = NULL;
+	req->rsrc_nodes[IORING_RSRC_BUFFER] = NULL;
 	req->task = current;
 	req->cancel_seq_set = false;
 
@@ -2718,15 +2718,10 @@ static void io_req_caches_free(struct io_ring_ctx *ctx)
 static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 {
 	io_sq_thread_finish(ctx);
-	/* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */
-	if (WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list)))
-		return;
 
 	mutex_lock(&ctx->uring_lock);
-	if (ctx->buf_data)
-		__io_sqe_buffers_unregister(ctx);
-	if (ctx->file_data)
-		__io_sqe_files_unregister(ctx);
+	io_sqe_buffers_unregister(ctx);
+	io_sqe_files_unregister(ctx);
 	io_cqring_overflow_kill(ctx);
 	io_eventfd_unregister(ctx);
 	io_alloc_cache_free(&ctx->apoll_cache, kfree);
@@ -2743,11 +2738,6 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 	if (ctx->submitter_task)
 		put_task_struct(ctx->submitter_task);
 
-	/* there are no registered resources left, nobody uses it */
-	if (ctx->rsrc_node)
-		io_rsrc_node_destroy(ctx, ctx->rsrc_node);
-
-	WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list));
 	WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));
 
 	io_alloc_cache_free(&ctx->rsrc_node_cache, kfree);
@@ -3729,10 +3719,6 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
 	if (ret)
 		goto err;
 
-	ret = io_rsrc_init(ctx);
-	if (ret)
-		goto err;
-
 	p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
 			IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
 			IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
diff --git a/io_uring/net.c b/io_uring/net.c
index 2040195e33ab..ce1156551d10 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -1342,15 +1342,15 @@ static int io_send_zc_import(struct io_kiocb *req, unsigned int issue_flags)
 
 	if (sr->flags & IORING_RECVSEND_FIXED_BUF) {
 		struct io_ring_ctx *ctx = req->ctx;
-		struct io_mapped_ubuf *imu;
+		struct io_rsrc_node *node;
 		int idx;
 
 		ret = -EFAULT;
 		io_ring_submit_lock(ctx, issue_flags);
 		if (sr->buf_index < ctx->nr_user_bufs) {
 			idx = array_index_nospec(sr->buf_index, ctx->nr_user_bufs);
-			imu = READ_ONCE(ctx->user_bufs[idx]);
-			io_req_set_rsrc_node(sr->notif, ctx);
+			node = ctx->user_bufs[idx];
+			io_req_assign_rsrc_node(sr->notif, node);
 			ret = 0;
 		}
 		io_ring_submit_unlock(ctx, issue_flags);
@@ -1358,8 +1358,9 @@ static int io_send_zc_import(struct io_kiocb *req, unsigned int issue_flags)
 		if (unlikely(ret))
 			return ret;
 
-		ret = io_import_fixed(ITER_SOURCE, &kmsg->msg.msg_iter, imu,
-					(u64)(uintptr_t)sr->buf, sr->len);
+		ret = io_import_fixed(ITER_SOURCE, &kmsg->msg.msg_iter,
+					node->buf, (u64)(uintptr_t)sr->buf,
+					sr->len);
 		if (unlikely(ret))
 			return ret;
 		kmsg->msg.sg_from_iter = io_sg_from_iter;
diff --git a/io_uring/nop.c b/io_uring/nop.c
index 2c7a22ba4053..de91600a3bc6 100644
--- a/io_uring/nop.c
+++ b/io_uring/nop.c
@@ -61,15 +61,15 @@ int io_nop(struct io_kiocb *req, unsigned int issue_flags)
 	}
 	if (nop->flags & IORING_NOP_FIXED_BUFFER) {
 		struct io_ring_ctx *ctx = req->ctx;
-		struct io_mapped_ubuf *imu;
+		struct io_rsrc_node *node;
 		int idx;
 
 		ret = -EFAULT;
 		io_ring_submit_lock(ctx, issue_flags);
 		if (nop->buffer < ctx->nr_user_bufs) {
 			idx = array_index_nospec(nop->buffer, ctx->nr_user_bufs);
-			imu = READ_ONCE(ctx->user_bufs[idx]);
-			io_req_set_rsrc_node(req, ctx);
+			node = READ_ONCE(ctx->user_bufs[idx]);
+			io_req_assign_rsrc_node(req, node);
 			ret = 0;
 		}
 		io_ring_submit_unlock(ctx, issue_flags);
diff --git a/io_uring/notif.c b/io_uring/notif.c
index 28859ae3ee6e..4f02e969cf08 100644
--- a/io_uring/notif.c
+++ b/io_uring/notif.c
@@ -117,7 +117,8 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx)
 	notif->file = NULL;
 	notif->task = current;
 	io_get_task_refs(1);
-	notif->rsrc_node = NULL;
+	notif->rsrc_nodes[IORING_RSRC_FILE] = NULL;
+	notif->rsrc_nodes[IORING_RSRC_BUFFER] = NULL;
 
 	nd = io_notif_to_data(notif);
 	nd->zc_report = false;
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index ca2ec8a018be..88d698efd75b 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -26,10 +26,8 @@ struct io_rsrc_update {
 	u32				offset;
 };
 
-static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc);
-static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
-				  struct io_mapped_ubuf **pimu,
-				  struct page **last_hpage);
+static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
+			struct iovec *iov, struct page **last_hpage);
 
 /* only define max */
 #define IORING_MAX_FIXED_FILES	(1U << 20)
@@ -110,13 +108,13 @@ static int io_buffer_validate(struct iovec *iov)
 	return 0;
 }
 
-static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot)
+static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
 {
-	struct io_mapped_ubuf *imu = *slot;
 	unsigned int i;
 
-	*slot = NULL;
-	if (imu != &dummy_ubuf) {
+	if (node->buf != &dummy_ubuf) {
+		struct io_mapped_ubuf *imu = node->buf;
+
 		if (!refcount_dec_and_test(&imu->refs))
 			return;
 		for (i = 0; i < imu->nr_bvecs; i++)
@@ -127,205 +125,55 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slo
 	}
 }
 
-static void io_rsrc_put_work(struct io_rsrc_node *node)
+struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type)
 {
-	struct io_rsrc_put *prsrc = &node->item;
-
-	if (prsrc->tag)
-		io_post_aux_cqe(node->ctx, prsrc->tag, 0, 0);
-
-	switch (node->type) {
-	case IORING_RSRC_FILE:
-		fput(prsrc->file);
-		break;
-	case IORING_RSRC_BUFFER:
-		io_rsrc_buf_put(node->ctx, prsrc);
-		break;
-	default:
-		WARN_ON_ONCE(1);
-		break;
-	}
-}
-
-void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
-{
-	if (!io_alloc_cache_put(&ctx->rsrc_node_cache, node))
-		kfree(node);
-}
-
-void io_rsrc_node_ref_zero(struct io_rsrc_node *node)
-	__must_hold(&node->ctx->uring_lock)
-{
-	struct io_ring_ctx *ctx = node->ctx;
+	struct io_rsrc_node *node;
 
-	while (!list_empty(&ctx->rsrc_ref_list)) {
-		node = list_first_entry(&ctx->rsrc_ref_list,
-					    struct io_rsrc_node, node);
-		/* recycle ref nodes in order */
-		if (node->refs)
-			break;
-		list_del(&node->node);
-
-		if (likely(!node->empty))
-			io_rsrc_put_work(node);
-		io_rsrc_node_destroy(ctx, node);
-	}
-	if (list_empty(&ctx->rsrc_ref_list) && unlikely(ctx->rsrc_quiesce))
-		wake_up_all(&ctx->rsrc_quiesce_wq);
-}
-
-struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx)
-{
-	struct io_rsrc_node *ref_node;
-
-	ref_node = io_alloc_cache_get(&ctx->rsrc_node_cache);
-	if (!ref_node) {
-		ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL);
-		if (!ref_node)
+	node = io_alloc_cache_get(&ctx->rsrc_node_cache);
+	if (!node) {
+		node = kzalloc(sizeof(*node), GFP_KERNEL);
+		if (!node)
 			return NULL;
 	}
 
-	ref_node->ctx = ctx;
-	ref_node->empty = 0;
-	ref_node->refs = 1;
-	return ref_node;
-}
-
-__cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data,
-				      struct io_ring_ctx *ctx)
-{
-	struct io_rsrc_node *backup;
-	DEFINE_WAIT(we);
-	int ret;
-
-	/* As We may drop ->uring_lock, other task may have started quiesce */
-	if (data->quiesce)
-		return -ENXIO;
-
-	backup = io_rsrc_node_alloc(ctx);
-	if (!backup)
-		return -ENOMEM;
-	ctx->rsrc_node->empty = true;
-	ctx->rsrc_node->type = -1;
-	list_add_tail(&ctx->rsrc_node->node, &ctx->rsrc_ref_list);
-	io_put_rsrc_node(ctx, ctx->rsrc_node);
-	ctx->rsrc_node = backup;
-
-	if (list_empty(&ctx->rsrc_ref_list))
-		return 0;
-
-	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
-		atomic_set(&ctx->cq_wait_nr, 1);
-		smp_mb();
-	}
-
-	ctx->rsrc_quiesce++;
-	data->quiesce = true;
-	do {
-		prepare_to_wait(&ctx->rsrc_quiesce_wq, &we, TASK_INTERRUPTIBLE);
-		mutex_unlock(&ctx->uring_lock);
-
-		ret = io_run_task_work_sig(ctx);
-		if (ret < 0) {
-			finish_wait(&ctx->rsrc_quiesce_wq, &we);
-			mutex_lock(&ctx->uring_lock);
-			if (list_empty(&ctx->rsrc_ref_list))
-				ret = 0;
-			break;
-		}
-
-		schedule();
-		mutex_lock(&ctx->uring_lock);
-		ret = 0;
-	} while (!list_empty(&ctx->rsrc_ref_list));
-
-	finish_wait(&ctx->rsrc_quiesce_wq, &we);
-	data->quiesce = false;
-	ctx->rsrc_quiesce--;
-
-	if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) {
-		atomic_set(&ctx->cq_wait_nr, 0);
-		smp_mb();
-	}
-	return ret;
-}
-
-static void io_free_page_table(void **table, size_t size)
-{
-	unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
-
-	for (i = 0; i < nr_tables; i++)
-		kfree(table[i]);
-	kfree(table);
+	node->ctx = ctx;
+	node->refs = 1;
+	node->type = type;
+	return node;
 }
 
 static void io_rsrc_data_free(struct io_rsrc_data *data)
 {
-	size_t size = data->nr * sizeof(data->tags[0][0]);
-
-	if (data->tags)
-		io_free_page_table((void **)data->tags, size);
-	kfree(data);
-}
-
-static __cold void **io_alloc_page_table(size_t size)
-{
-	unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE);
-	size_t init_size = size;
-	void **table;
-
-	table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT);
-	if (!table)
-		return NULL;
+	int i;
 
-	for (i = 0; i < nr_tables; i++) {
-		unsigned int this_size = min_t(size_t, size, PAGE_SIZE);
+	for (i = 0; i < data->nr; i++) {
+		struct io_rsrc_node *node = data->nodes[i];
 
-		table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT);
-		if (!table[i]) {
-			io_free_page_table(table, init_size);
-			return NULL;
-		}
-		size -= this_size;
+		io_put_rsrc_node(node);
 	}
-	return table;
+	kvfree(data->nodes);
+	kfree(data);
 }
 
-__cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, int type,
-				     u64 __user *utags,
-				     unsigned nr, struct io_rsrc_data **pdata)
+__cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, unsigned nr,
+				     struct io_rsrc_data **pdata)
 {
 	struct io_rsrc_data *data;
-	int ret = 0;
-	unsigned i;
 
 	data = kzalloc(sizeof(*data), GFP_KERNEL);
 	if (!data)
 		return -ENOMEM;
-	data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0]));
-	if (!data->tags) {
+
+	data->nodes = kvmalloc_array(nr, sizeof(struct io_rsrc_node *),
+					GFP_KERNEL | __GFP_ZERO);
+	if (!data->nodes) {
 		kfree(data);
 		return -ENOMEM;
 	}
 
 	data->nr = nr;
-	data->ctx = ctx;
-	data->rsrc_type = type;
-	if (utags) {
-		ret = -EFAULT;
-		for (i = 0; i < nr; i++) {
-			u64 *tag_slot = io_get_tag_slot(data, i);
-
-			if (copy_from_user(tag_slot, &utags[i],
-					   sizeof(*tag_slot)))
-				goto fail;
-		}
-	}
 	*pdata = data;
 	return 0;
-fail:
-	io_rsrc_data_free(data);
-	return ret;
 }
 
 static int __io_sqe_files_update(struct io_ring_ctx *ctx,
@@ -334,8 +182,6 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 {
 	u64 __user *tags = u64_to_user_ptr(up->tags);
 	__s32 __user *fds = u64_to_user_ptr(up->data);
-	struct io_rsrc_data *data = ctx->file_data;
-	struct io_fixed_file *file_slot;
 	int fd, i, err = 0;
 	unsigned int done;
 
@@ -360,18 +206,14 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 			continue;
 
 		i = array_index_nospec(up->offset + done, ctx->nr_user_files);
-		file_slot = io_fixed_file_slot(&ctx->file_table, i);
-
-		if (file_slot->file_ptr) {
-			err = io_queue_rsrc_removal(data, i,
-						    io_slot_file(file_slot));
-			if (err)
-				break;
-			file_slot->file_ptr = 0;
+		if (ctx->file_table.nodes[i]) {
+			io_put_rsrc_node(ctx->file_table.nodes[i]);
+			ctx->file_table.nodes[i] = NULL;
 			io_file_bitmap_clear(&ctx->file_table, i);
 		}
 		if (fd != -1) {
 			struct file *file = fget(fd);
+			struct io_rsrc_node *node;
 
 			if (!file) {
 				err = -EBADF;
@@ -385,8 +227,15 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 				err = -EBADF;
 				break;
 			}
-			*io_get_tag_slot(data, i) = tag;
-			io_fixed_file_set(file_slot, file);
+			node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
+			if (!node) {
+				err = -ENOMEM;
+				fput(file);
+				break;
+			}
+			ctx->file_table.nodes[i] = node;
+			node->tag = tag;
+			io_fixed_file_set(node, file);
 			io_file_bitmap_set(&ctx->file_table, i);
 		}
 	}
@@ -411,7 +260,7 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
 		return -EINVAL;
 
 	for (done = 0; done < nr_args; done++) {
-		struct io_mapped_ubuf *imu;
+		struct io_rsrc_node *node;
 		u64 tag = 0;
 
 		uvec = u64_to_user_ptr(user_data);
@@ -431,23 +280,16 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
 			err = -EINVAL;
 			break;
 		}
-		err = io_sqe_buffer_register(ctx, iov, &imu, &last_hpage);
-		if (err)
-			break;
-
 		i = array_index_nospec(up->offset + done, ctx->nr_user_bufs);
-		if (ctx->user_bufs[i] != &dummy_ubuf) {
-			err = io_queue_rsrc_removal(ctx->buf_data, i,
-						    ctx->user_bufs[i]);
-			if (unlikely(err)) {
-				io_buffer_unmap(ctx, &imu);
-				break;
-			}
-			ctx->user_bufs[i] = (struct io_mapped_ubuf *)&dummy_ubuf;
+		node = io_sqe_buffer_register(ctx, iov, &last_hpage);
+		if (IS_ERR(node)) {
+			err = PTR_ERR(node);
+			break;
 		}
+		io_put_rsrc_node(ctx->user_bufs[i]);
 
-		ctx->user_bufs[i] = imu;
-		*io_get_tag_slot(ctx->buf_data, i) = tag;
+		ctx->user_bufs[i] = node;
+		node->tag = tag;
 		if (ctx->compat)
 			user_data += sizeof(struct compat_iovec);
 		else
@@ -622,38 +464,47 @@ int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
 	return IOU_OK;
 }
 
-int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, void *rsrc)
+void io_free_rsrc_node(struct io_rsrc_node *node)
 {
-	struct io_ring_ctx *ctx = data->ctx;
-	struct io_rsrc_node *node = ctx->rsrc_node;
-	u64 *tag_slot = io_get_tag_slot(data, idx);
+	struct io_ring_ctx *ctx = node->ctx;
 
-	ctx->rsrc_node = io_rsrc_node_alloc(ctx);
-	if (unlikely(!ctx->rsrc_node)) {
-		ctx->rsrc_node = node;
-		return -ENOMEM;
+	lockdep_assert_held(&ctx->uring_lock);
+
+	if (node->tag)
+		io_post_aux_cqe(node->ctx, node->tag, 0, 0);
+
+	switch (node->type) {
+	case IORING_RSRC_FILE:
+		if (io_slot_file(node))
+			fput(io_slot_file(node));
+		break;
+	case IORING_RSRC_BUFFER:
+		if (node->buf)
+			io_buffer_unmap(node->ctx, node);
+		break;
+	default:
+		WARN_ON_ONCE(1);
+		break;
 	}
 
-	node->item.rsrc = rsrc;
-	node->type = data->rsrc_type;
-	node->item.tag = *tag_slot;
-	*tag_slot = 0;
-	list_add_tail(&node->node, &ctx->rsrc_ref_list);
-	io_put_rsrc_node(ctx, node);
-	return 0;
+	if (!io_alloc_cache_put(&ctx->rsrc_node_cache, node))
+		kfree(node);
 }
 
-void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
+static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
 {
 	int i;
 
+	lockdep_assert_held(&ctx->uring_lock);
+
 	for (i = 0; i < ctx->nr_user_files; i++) {
-		struct file *file = io_file_from_index(&ctx->file_table, i);
+		struct io_rsrc_node *node = ctx->file_table.nodes[i];
 
-		if (!file)
-			continue;
-		io_file_bitmap_clear(&ctx->file_table, i);
-		fput(file);
+		if (node) {
+			io_put_rsrc_node(node);
+			io_file_bitmap_clear(&ctx->file_table, i);
+			ctx->file_table.nodes[i] = NULL;
+		}
 	}
 
 	io_free_file_tables(&ctx->file_table);
@@ -665,22 +516,11 @@ void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
 
 int io_sqe_files_unregister(struct io_ring_ctx *ctx)
 {
-	unsigned nr = ctx->nr_user_files;
-	int ret;
-
 	if (!ctx->file_data)
 		return -ENXIO;
 
-	/*
-	 * Quiesce may unlock ->uring_lock, and while it's not held
-	 * prevent new requests using the table.
-	 */
-	ctx->nr_user_files = 0;
-	ret = io_rsrc_ref_quiesce(ctx->file_data, ctx);
-	ctx->nr_user_files = nr;
-	if (!ret)
-		__io_sqe_files_unregister(ctx);
-	return ret;
+	__io_sqe_files_unregister(ctx);
+	return 0;
 }
 
 int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
@@ -699,8 +539,7 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 		return -EMFILE;
 	if (nr_args > rlimit(RLIMIT_NOFILE))
 		return -EMFILE;
-	ret = io_rsrc_data_alloc(ctx, IORING_RSRC_FILE, tags, nr_args,
-				 &ctx->file_data);
+	ret = io_rsrc_data_alloc(ctx, nr_args, &ctx->file_data);
 	if (ret)
 		return ret;
 
@@ -711,16 +550,18 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 	}
 
 	for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
-		struct io_fixed_file *file_slot;
+		struct io_rsrc_node *node;
+		u64 tag = 0;
 
-		if (fds && copy_from_user(&fd, &fds[i], sizeof(fd))) {
-			ret = -EFAULT;
+		ret = -EFAULT;
+		if (tags && copy_from_user(&tag, &tags[i], sizeof(tag)))
+			goto fail;
+		if (fds && copy_from_user(&fd, &fds[i], sizeof(fd)))
 			goto fail;
-		}
 		/* allow sparse sets */
 		if (!fds || fd == -1) {
 			ret = -EINVAL;
-			if (unlikely(*io_get_tag_slot(ctx->file_data, i)))
+			if (tag)
 				goto fail;
 			continue;
 		}
@@ -737,8 +578,16 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 			fput(file);
 			goto fail;
 		}
-		file_slot = io_fixed_file_slot(&ctx->file_table, i);
-		io_fixed_file_set(file_slot, file);
+		ret = -ENOMEM;
+		node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
+		if (!node) {
+			fput(file);
+			goto fail;
+		}
+		if (tag)
+			node->tag = tag;
+		ctx->file_table.nodes[i] = node;
+		io_fixed_file_set(node, file);
 		io_file_bitmap_set(&ctx->file_table, i);
 	}
 
@@ -750,43 +599,30 @@ fail:
 	return ret;
 }
 
-static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc)
-{
-	io_buffer_unmap(ctx, &prsrc->buf);
-	prsrc->buf = NULL;
-}
-
-void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
+static void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
 {
 	unsigned int i;
 
-	for (i = 0; i < ctx->nr_user_bufs; i++)
-		io_buffer_unmap(ctx, &ctx->user_bufs[i]);
-	kfree(ctx->user_bufs);
-	io_rsrc_data_free(ctx->buf_data);
+	lockdep_assert_held(&ctx->uring_lock);
+
+	for (i = 0; i < ctx->nr_user_bufs; i++) {
+		io_put_rsrc_node(ctx->user_bufs[i]);
+		ctx->user_bufs[i] = NULL;
+	}
+	kvfree(ctx->user_bufs);
 	ctx->user_bufs = NULL;
+	io_rsrc_data_free(ctx->buf_data);
 	ctx->buf_data = NULL;
 	ctx->nr_user_bufs = 0;
 }
 
 int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
 {
-	unsigned nr = ctx->nr_user_bufs;
-	int ret;
-
 	if (!ctx->buf_data)
 		return -ENXIO;
 
-	/*
-	 * Quiesce may unlock ->uring_lock, and while it's not held
-	 * prevent new requests using the table.
-	 */
-	ctx->nr_user_bufs = 0;
-	ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx);
-	ctx->nr_user_bufs = nr;
-	if (!ret)
-		__io_sqe_buffers_unregister(ctx);
-	return ret;
+	__io_sqe_buffers_unregister(ctx);
+	return 0;
 }
 
 /*
@@ -813,7 +649,8 @@ static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
 
 	/* check previously registered pages */
 	for (i = 0; i < ctx->nr_user_bufs; i++) {
-		struct io_mapped_ubuf *imu = ctx->user_bufs[i];
+		struct io_rsrc_node *node = ctx->user_bufs[i];
+		struct io_mapped_ubuf *imu = node->buf;
 
 		for (j = 0; j < imu->nr_bvecs; j++) {
 			if (!PageCompound(imu->bvec[j].bv_page))
@@ -950,21 +787,28 @@ static bool io_try_coalesce_buffer(struct page ***pages, int *nr_pages,
 	return io_do_coalesce_buffer(pages, nr_pages, data, nr_folios);
 }
 
-static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
-				  struct io_mapped_ubuf **pimu,
-				  struct page **last_hpage)
+static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
+						   struct iovec *iov,
+						   struct page **last_hpage)
 {
 	struct io_mapped_ubuf *imu = NULL;
 	struct page **pages = NULL;
+	struct io_rsrc_node *node;
 	unsigned long off;
 	size_t size;
 	int ret, nr_pages, i;
 	struct io_imu_folio_data data;
 	bool coalesced;
 
-	*pimu = (struct io_mapped_ubuf *)&dummy_ubuf;
-	if (!iov->iov_base)
-		return 0;
+	node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
+	if (!node)
+		return ERR_PTR(-ENOMEM);
+	node->buf = NULL;
+
+	if (!iov->iov_base) {
+		node->buf = (struct io_mapped_ubuf *) &dummy_ubuf;
+		return node;
+	}
 
 	ret = -ENOMEM;
 	pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len,
@@ -998,7 +842,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 		imu->folio_shift = data.folio_shift;
 	refcount_set(&imu->refs, 1);
 	off = (unsigned long) iov->iov_base & ((1UL << imu->folio_shift) - 1);
-	*pimu = imu;
+	node->buf = imu;
 	ret = 0;
 
 	for (i = 0; i < nr_pages; i++) {
@@ -1010,10 +854,14 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 		size -= vec_len;
 	}
 done:
-	if (ret)
+	if (ret) {
 		kvfree(imu);
+		if (node)
+			io_put_rsrc_node(node);
+		node = ERR_PTR(ret);
+	}
 	kvfree(pages);
-	return ret;
+	return node;
 }
 
 static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
@@ -1037,7 +885,7 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
 		return -EBUSY;
 	if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
 		return -EINVAL;
-	ret = io_rsrc_data_alloc(ctx, IORING_RSRC_BUFFER, tags, nr_args, &data);
+	ret = io_rsrc_data_alloc(ctx, nr_args, &data);
 	if (ret)
 		return ret;
 	ret = io_buffers_map_alloc(ctx, nr_args);
@@ -1050,6 +898,9 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
 		memset(iov, 0, sizeof(*iov));
 
 	for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
+		struct io_rsrc_node *node;
+		u64 tag = 0;
+
 		if (arg) {
 			uvec = (struct iovec __user *) arg;
 			iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat);
@@ -1066,15 +917,24 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
 				arg += sizeof(struct iovec);
 		}
 
-		if (!iov->iov_base && *io_get_tag_slot(data, i)) {
-			ret = -EINVAL;
-			break;
+		if (tags) {
+			if (copy_from_user(&tag, &tags[i], sizeof(tag))) {
+				ret = -EFAULT;
+				break;
+			}
+			if (tag && !iov->iov_base) {
+				ret = -EINVAL;
+				break;
+			}
 		}
 
-		ret = io_sqe_buffer_register(ctx, iov, &ctx->user_bufs[i],
-					     &last_hpage);
-		if (ret)
+		node = io_sqe_buffer_register(ctx, iov, &last_hpage);
+		if (IS_ERR(node)) {
+			ret = PTR_ERR(node);
 			break;
+		}
+		node->tag = tag;
+		ctx->user_bufs[i] = node;
 	}
 
 	WARN_ON_ONCE(ctx->buf_data);
@@ -1148,7 +1008,7 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
 
 static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx)
 {
-	struct io_mapped_ubuf **user_bufs;
+	struct io_rsrc_node **user_bufs;
 	struct io_rsrc_data *data;
 	int i, ret, nbufs;
 
@@ -1163,21 +1023,31 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
 	nbufs = src_ctx->nr_user_bufs;
 	if (!nbufs)
 		goto out_unlock;
-	ret = io_rsrc_data_alloc(ctx, IORING_RSRC_BUFFER, NULL, nbufs, &data);
+	ret = io_rsrc_data_alloc(ctx, nbufs, &data);
 	if (ret)
 		goto out_unlock;
 
 	ret = -ENOMEM;
-	user_bufs = kcalloc(nbufs, sizeof(*ctx->user_bufs), GFP_KERNEL);
+	user_bufs = kvmalloc_array(nbufs, sizeof(struct io_rsrc_node *),
+					GFP_KERNEL | __GFP_ZERO);
 	if (!user_bufs)
 		goto out_free_data;
 
 	for (i = 0; i < nbufs; i++) {
-		struct io_mapped_ubuf *src = src_ctx->user_bufs[i];
+		struct io_mapped_ubuf *imu = src_ctx->user_bufs[i]->buf;
+		struct io_rsrc_node *dst_node;
 
-		if (src != &dummy_ubuf)
-			refcount_inc(&src->refs);
-		user_bufs[i] = src;
+		dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
+		if (!dst_node)
+			goto out_put_free;
+
+		if (imu == &dummy_ubuf) {
+			dst_node->buf = (struct io_mapped_ubuf *) &dummy_ubuf;
+		} else {
+			refcount_inc(&imu->refs);
+			dst_node->buf = imu;
+		}
+		user_bufs[i] = dst_node;
 	}
 
 	/* Have a ref on the bufs now, drop src lock and re-grab our own lock */
@@ -1190,12 +1060,17 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
 		return 0;
 	}
 
+	mutex_unlock(&ctx->uring_lock);
+	mutex_lock(&src_ctx->uring_lock);
 	/* someone raced setting up buffers, dump ours */
-	for (i = 0; i < nbufs; i++)
-		io_buffer_unmap(ctx, &user_bufs[i]);
-	io_rsrc_data_free(data);
-	kfree(user_bufs);
-	return -EBUSY;
+	ret = -EBUSY;
+	i = nbufs;
+out_put_free:
+	while (i--) {
+		io_buffer_unmap(src_ctx, user_bufs[i]);
+		kfree(user_bufs[i]);
+	}
+	kvfree(user_bufs);
 out_free_data:
 	io_rsrc_data_free(data);
 out_unlock:
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index 1589c9740083..20a316854238 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -13,36 +13,21 @@ enum {
 	IORING_RSRC_BUFFER		= 1,
 };
 
-struct io_rsrc_put {
-	u64 tag;
-	union {
-		void *rsrc;
-		struct file *file;
-		struct io_mapped_ubuf *buf;
-	};
-};
-
 struct io_rsrc_data {
-	struct io_ring_ctx		*ctx;
-
-	u64				**tags;
 	unsigned int			nr;
-	u16				rsrc_type;
-	bool				quiesce;
+	struct io_rsrc_node		**nodes;
 };
 
 struct io_rsrc_node {
 	struct io_ring_ctx		*ctx;
 	int				refs;
-	bool				empty;
 	u16				type;
-	struct list_head		node;
-	struct io_rsrc_put		item;
-};
 
-struct io_fixed_file {
-	/* file * with additional FFS_* flags */
-	unsigned long file_ptr;
+	u64 tag;
+	union {
+		unsigned long file_ptr;
+		struct io_mapped_ubuf *buf;
+	};
 };
 
 struct io_mapped_ubuf {
@@ -63,21 +48,17 @@ struct io_imu_folio_data {
 	unsigned int	folio_shift;
 };
 
-void io_rsrc_node_ref_zero(struct io_rsrc_node *node);
-void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *ref_node);
-struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx);
-int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, void *rsrc);
+struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type);
+void io_free_rsrc_node(struct io_rsrc_node *node);
 
 int io_import_fixed(int ddir, struct iov_iter *iter,
 			   struct io_mapped_ubuf *imu,
 			   u64 buf_addr, size_t len);
 
 int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg);
-void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx);
 int io_sqe_buffers_unregister(struct io_ring_ctx *ctx);
 int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
 			    unsigned int nr_args, u64 __user *tags);
-void __io_sqe_files_unregister(struct io_ring_ctx *ctx);
 int io_sqe_files_unregister(struct io_ring_ctx *ctx);
 int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 			  unsigned nr_args, u64 __user *tags);
@@ -89,41 +70,23 @@ int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
 int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
 			unsigned int size, unsigned int type);
 
-static inline void io_put_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
+static inline void io_put_rsrc_node(struct io_rsrc_node *node)
 {
-	lockdep_assert_held(&ctx->uring_lock);
-
 	if (node && !--node->refs)
-		io_rsrc_node_ref_zero(node);
-}
-
-static inline void __io_req_set_rsrc_node(struct io_kiocb *req,
-					  struct io_ring_ctx *ctx)
-{
-	lockdep_assert_held(&ctx->uring_lock);
-	req->rsrc_node = ctx->rsrc_node;
-	ctx->rsrc_node->refs++;
+		io_free_rsrc_node(node);
 }
 
-static inline void io_req_set_rsrc_node(struct io_kiocb *req,
-					struct io_ring_ctx *ctx)
+static inline void io_req_put_rsrc_nodes(struct io_kiocb *req)
 {
-	if (!req->rsrc_node)
-		__io_req_set_rsrc_node(req, ctx);
-}
-
-static inline u64 *io_get_tag_slot(struct io_rsrc_data *data, unsigned int idx)
-{
-	unsigned int off = idx & IO_RSRC_TAG_TABLE_MASK;
-	unsigned int table_idx = idx >> IO_RSRC_TAG_TABLE_SHIFT;
-
-	return &data->tags[table_idx][off];
+	io_put_rsrc_node(req->rsrc_nodes[IORING_RSRC_FILE]);
+	io_put_rsrc_node(req->rsrc_nodes[IORING_RSRC_BUFFER]);
 }
 
-static inline int io_rsrc_init(struct io_ring_ctx *ctx)
+static inline void io_req_assign_rsrc_node(struct io_kiocb *req,
+					   struct io_rsrc_node *node)
 {
-	ctx->rsrc_node = io_rsrc_node_alloc(ctx);
-	return ctx->rsrc_node ? 0 : -ENOMEM;
+	node->refs++;
+	req->rsrc_nodes[node->type] = node;
 }
 
 int io_files_update(struct io_kiocb *req, unsigned int issue_flags);
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 8080ffd6d571..65491f4f2c7e 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -330,7 +330,7 @@ static int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe
 {
 	struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
 	struct io_ring_ctx *ctx = req->ctx;
-	struct io_mapped_ubuf *imu;
+	struct io_rsrc_node *node;
 	struct io_async_rw *io;
 	u16 index;
 	int ret;
@@ -342,11 +342,11 @@ static int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe
 	if (unlikely(req->buf_index >= ctx->nr_user_bufs))
 		return -EFAULT;
 	index = array_index_nospec(req->buf_index, ctx->nr_user_bufs);
-	imu = ctx->user_bufs[index];
-	io_req_set_rsrc_node(req, ctx);
+	node = ctx->user_bufs[index];
+	io_req_assign_rsrc_node(req, node);
 
 	io = req->async_data;
-	ret = io_import_fixed(ddir, &io->iter, imu, rw->addr, rw->len);
+	ret = io_import_fixed(ddir, &io->iter, node->buf, rw->addr, rw->len);
 	iov_iter_save_state(&io->iter, &io->iter_state);
 	return ret;
 }
diff --git a/io_uring/splice.c b/io_uring/splice.c
index e62bc6497a94..a0b4e0435b8b 100644
--- a/io_uring/splice.c
+++ b/io_uring/splice.c
@@ -51,7 +51,7 @@ void io_splice_cleanup(struct io_kiocb *req)
 {
 	struct io_splice *sp = io_kiocb_to_cmd(req, struct io_splice);
 
-	io_put_rsrc_node(req->ctx, sp->rsrc_node);
+	io_put_rsrc_node(sp->rsrc_node);
 }
 
 static struct file *io_splice_get_file(struct io_kiocb *req,
@@ -59,7 +59,7 @@ static struct file *io_splice_get_file(struct io_kiocb *req,
 {
 	struct io_splice *sp = io_kiocb_to_cmd(req, struct io_splice);
 	struct io_ring_ctx *ctx = req->ctx;
-	struct io_fixed_file *slot;
+	struct io_rsrc_node *node;
 	struct file *file = NULL;
 
 	if (!(sp->flags & SPLICE_F_FD_IN_FIXED))
@@ -69,11 +69,13 @@ static struct file *io_splice_get_file(struct io_kiocb *req,
 	if (unlikely(sp->splice_fd_in >= ctx->nr_user_files))
 		goto out;
 	sp->splice_fd_in = array_index_nospec(sp->splice_fd_in, ctx->nr_user_files);
-	slot = &ctx->file_table.files[sp->splice_fd_in];
-	if (!req->rsrc_node)
-		__io_req_set_rsrc_node(req, ctx);
-	file = io_slot_file(slot);
-	req->flags |= REQ_F_NEED_CLEANUP;
+	node = ctx->file_table.nodes[sp->splice_fd_in];
+	if (node) {
+		node->refs++;
+		sp->rsrc_node = node;
+		file = io_slot_file(node);
+		req->flags |= REQ_F_NEED_CLEANUP;
+	}
 out:
 	io_ring_submit_unlock(ctx, issue_flags);
 	return file;
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index 6994f60d7ec7..0899c71008ae 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -220,7 +220,7 @@ int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 		 * being called. This prevents destruction of the mapped buffer
 		 * we'll need at actual import time.
 		 */
-		io_req_set_rsrc_node(req, ctx);
+		io_req_assign_rsrc_node(req, ctx->user_bufs[req->buf_index]);
 	}
 	ioucmd->cmd_op = READ_ONCE(sqe->cmd_op);
 
@@ -276,15 +276,11 @@ int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
 			      struct iov_iter *iter, void *ioucmd)
 {
 	struct io_kiocb *req = cmd_to_io_kiocb(ioucmd);
-	struct io_ring_ctx *ctx = req->ctx;
+	struct io_rsrc_node *node = req->rsrc_nodes[IORING_RSRC_BUFFER];
 
 	/* Must have had rsrc_node assigned at prep time */
-	if (req->rsrc_node) {
-		struct io_mapped_ubuf *imu;
-
-		imu = READ_ONCE(ctx->user_bufs[req->buf_index]);
-		return io_import_fixed(rw, iter, imu, ubuf, len);
-	}
+	if (node)
+		return io_import_fixed(rw, iter, node->buf, ubuf, len);
 
 	return -EFAULT;
 }
-- 
cgit v1.2.3


From fbbb8e991d86bb7539de6161746b6c747f93f533 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Sat, 26 Oct 2024 06:43:44 -0600
Subject: io_uring/rsrc: get rid of io_rsrc_node allocation cache

It's not going to be needed in the fast path going forward, so kill it
off.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h |  3 ---
 io_uring/io_uring.c            |  6 +-----
 io_uring/rsrc.c                | 18 ++++++------------
 3 files changed, 7 insertions(+), 20 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 42c5f2c992c4..696f2a05a98b 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -370,9 +370,6 @@ struct io_ring_ctx {
 	struct io_rsrc_data		*file_data;
 	struct io_rsrc_data		*buf_data;
 
-	/* protected by ->uring_lock */
-	struct io_alloc_cache		rsrc_node_cache;
-
 	u32			pers_next;
 	struct xarray		personalities;
 
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 0876aa74c739..094788cca47f 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -312,9 +312,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	INIT_LIST_HEAD(&ctx->sqd_list);
 	INIT_LIST_HEAD(&ctx->cq_overflow_list);
 	INIT_LIST_HEAD(&ctx->io_buffers_cache);
-	ret = io_alloc_cache_init(&ctx->rsrc_node_cache, IO_NODE_ALLOC_CACHE_MAX,
-			    sizeof(struct io_rsrc_node));
-	ret |= io_alloc_cache_init(&ctx->apoll_cache, IO_POLL_ALLOC_CACHE_MAX,
+	ret = io_alloc_cache_init(&ctx->apoll_cache, IO_POLL_ALLOC_CACHE_MAX,
 			    sizeof(struct async_poll));
 	ret |= io_alloc_cache_init(&ctx->netmsg_cache, IO_ALLOC_CACHE_MAX,
 			    sizeof(struct io_async_msghdr));
@@ -358,7 +356,6 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 free_ref:
 	percpu_ref_exit(&ctx->refs);
 err:
-	io_alloc_cache_free(&ctx->rsrc_node_cache, kfree);
 	io_alloc_cache_free(&ctx->apoll_cache, kfree);
 	io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free);
 	io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free);
@@ -2740,7 +2737,6 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 
 	WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));
 
-	io_alloc_cache_free(&ctx->rsrc_node_cache, kfree);
 	if (ctx->mm_account) {
 		mmdrop(ctx->mm_account);
 		ctx->mm_account = NULL;
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 88d698efd75b..8f8147dd714c 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -13,7 +13,6 @@
 #include <uapi/linux/io_uring.h>
 
 #include "io_uring.h"
-#include "alloc_cache.h"
 #include "openclose.h"
 #include "rsrc.h"
 #include "memmap.h"
@@ -129,16 +128,12 @@ struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type)
 {
 	struct io_rsrc_node *node;
 
-	node = io_alloc_cache_get(&ctx->rsrc_node_cache);
-	if (!node) {
-		node = kzalloc(sizeof(*node), GFP_KERNEL);
-		if (!node)
-			return NULL;
+	node = kzalloc(sizeof(*node), GFP_KERNEL);
+	if (node) {
+		node->ctx = ctx;
+		node->refs = 1;
+		node->type = type;
 	}
-
-	node->ctx = ctx;
-	node->refs = 1;
-	node->type = type;
 	return node;
 }
 
@@ -487,8 +482,7 @@ void io_free_rsrc_node(struct io_rsrc_node *node)
 		break;
 	}
 
-	if (!io_alloc_cache_put(&ctx->rsrc_node_cache, node))
-		kfree(node);
+	kfree(node);
 }
 
 static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
-- 
cgit v1.2.3


From 3597f2786b687a7f26361ce00a805ea0af41b65f Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Sat, 26 Oct 2024 14:50:13 -0600
Subject: io_uring/rsrc: unify file and buffer resource tables

For files, there's nr_user_files/file_table/file_data, and buffers have
nr_user_bufs/user_bufs/buf_data. There's no reason why file_table and
file_data can't be the same thing, and ditto for the buffer side. That
gets rid of more io_ring_ctx state that's in two spots rather than just
being in one spot, as it should be. Put all the registered file data in
one locations, and ditto on the buffer front.

This also avoids having both io_rsrc_data->nodes being an allocated
array, and ->user_bufs[] or ->file_table.nodes. There's no reason to
have this information duplicated. Keep it in one spot, io_rsrc_data,
along with how many resources are available.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h |  15 ++-
 io_uring/cancel.c              |   4 +-
 io_uring/fdinfo.c              |  10 +-
 io_uring/filetable.c           |  44 ++++-----
 io_uring/filetable.h           |   2 +-
 io_uring/io_uring.c            |   7 +-
 io_uring/msg_ring.c            |   4 +-
 io_uring/net.c                 |   6 +-
 io_uring/nop.c                 |   6 +-
 io_uring/register.c            |   3 +-
 io_uring/rsrc.c                | 209 +++++++++++++----------------------------
 io_uring/rsrc.h                |   7 +-
 io_uring/rw.c                  |   6 +-
 io_uring/splice.c              |   6 +-
 io_uring/uring_cmd.c           |   6 +-
 15 files changed, 123 insertions(+), 212 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 696f2a05a98b..77fd508d043a 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -55,8 +55,13 @@ struct io_wq_work {
 	int cancel_seq;
 };
 
+struct io_rsrc_data {
+	unsigned int			nr;
+	struct io_rsrc_node		**nodes;
+};
+
 struct io_file_table {
-	struct io_rsrc_node **nodes;
+	struct io_rsrc_data data;
 	unsigned long *bitmap;
 	unsigned int alloc_hint;
 };
@@ -276,9 +281,7 @@ struct io_ring_ctx {
 		struct io_wq_work_list	iopoll_list;
 
 		struct io_file_table	file_table;
-		struct io_rsrc_node	**user_bufs;
-		unsigned		nr_user_files;
-		unsigned		nr_user_bufs;
+		struct io_rsrc_data	buf_table;
 
 		struct io_submit_state	submit_state;
 
@@ -366,10 +369,6 @@ struct io_ring_ctx {
 	struct wait_queue_head		poll_wq;
 	struct io_restriction		restrictions;
 
-	/* slow path rsrc auxilary data, used by update/register */
-	struct io_rsrc_data		*file_data;
-	struct io_rsrc_data		*buf_data;
-
 	u32			pers_next;
 	struct xarray		personalities;
 
diff --git a/io_uring/cancel.c b/io_uring/cancel.c
index cc3475b22ae5..3a2996307025 100644
--- a/io_uring/cancel.c
+++ b/io_uring/cancel.c
@@ -240,9 +240,9 @@ static int __io_sync_cancel(struct io_uring_task *tctx,
 	/* fixed must be grabbed every time since we drop the uring_lock */
 	if ((cd->flags & IORING_ASYNC_CANCEL_FD) &&
 	    (cd->flags & IORING_ASYNC_CANCEL_FD_FIXED)) {
-		if (unlikely(fd >= ctx->nr_user_files))
+		if (unlikely(fd >= ctx->file_table.data.nr))
 			return -EBADF;
-		fd = array_index_nospec(fd, ctx->nr_user_files);
+		fd = array_index_nospec(fd, ctx->file_table.data.nr);
 		cd->file = io_file_from_index(&ctx->file_table, fd);
 		if (!cd->file)
 			return -EBADF;
diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
index 064a79475c5f..e3f5e9fe5562 100644
--- a/io_uring/fdinfo.c
+++ b/io_uring/fdinfo.c
@@ -165,8 +165,8 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
 	seq_printf(m, "SqThreadCpu:\t%d\n", sq_cpu);
 	seq_printf(m, "SqTotalTime:\t%llu\n", sq_total_time);
 	seq_printf(m, "SqWorkTime:\t%llu\n", sq_work_time);
-	seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
-	for (i = 0; has_lock && i < ctx->nr_user_files; i++) {
+	seq_printf(m, "UserFiles:\t%u\n", ctx->file_table.data.nr);
+	for (i = 0; has_lock && i < ctx->file_table.data.nr; i++) {
 		struct file *f = io_file_from_index(&ctx->file_table, i);
 
 		if (f)
@@ -174,9 +174,9 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
 		else
 			seq_printf(m, "%5u: <none>\n", i);
 	}
-	seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
-	for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) {
-		struct io_mapped_ubuf *buf = ctx->user_bufs[i]->buf;
+	seq_printf(m, "UserBufs:\t%u\n", ctx->buf_table.nr);
+	for (i = 0; has_lock && i < ctx->buf_table.nr; i++) {
+		struct io_mapped_ubuf *buf = ctx->buf_table.nodes[i]->buf;
 
 		seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, buf->len);
 	}
diff --git a/io_uring/filetable.c b/io_uring/filetable.c
index 1b12a9a1cc16..c1f9f9550446 100644
--- a/io_uring/filetable.c
+++ b/io_uring/filetable.c
@@ -38,25 +38,19 @@ static int io_file_bitmap_get(struct io_ring_ctx *ctx)
 
 bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files)
 {
-	table->nodes = kvmalloc_array(nr_files, sizeof(struct io_src_node *),
-					GFP_KERNEL_ACCOUNT | __GFP_ZERO);
-	if (unlikely(!table->nodes))
+	if (io_rsrc_data_alloc(&table->data, nr_files))
 		return false;
-
 	table->bitmap = bitmap_zalloc(nr_files, GFP_KERNEL_ACCOUNT);
-	if (unlikely(!table->bitmap)) {
-		kvfree(table->nodes);
-		return false;
-	}
-
-	return true;
+	if (table->bitmap)
+		return true;
+	io_rsrc_data_free(&table->data);
+	return false;
 }
 
 void io_free_file_tables(struct io_file_table *table)
 {
-	kvfree(table->nodes);
+	io_rsrc_data_free(&table->data);
 	bitmap_free(table->bitmap);
-	table->nodes = NULL;
 	table->bitmap = NULL;
 }
 
@@ -68,22 +62,22 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file,
 
 	if (io_is_uring_fops(file))
 		return -EBADF;
-	if (!ctx->file_data)
+	if (!ctx->file_table.data.nr)
 		return -ENXIO;
-	if (slot_index >= ctx->nr_user_files)
+	if (slot_index >= ctx->file_table.data.nr)
 		return -EINVAL;
 
 	node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
 	if (!node)
 		return -ENOMEM;
 
-	slot_index = array_index_nospec(slot_index, ctx->nr_user_files);
-	if (ctx->file_table.nodes[slot_index])
-		io_put_rsrc_node(ctx->file_table.nodes[slot_index]);
+	slot_index = array_index_nospec(slot_index, ctx->file_table.data.nr);
+	if (ctx->file_table.data.nodes[slot_index])
+		io_put_rsrc_node(ctx->file_table.data.nodes[slot_index]);
 	else
 		io_file_bitmap_set(&ctx->file_table, slot_index);
 
-	ctx->file_table.nodes[slot_index] = node;
+	ctx->file_table.data.nodes[slot_index] = node;
 	io_fixed_file_set(node, file);
 	return 0;
 }
@@ -129,16 +123,16 @@ int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags,
 
 int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset)
 {
-	if (unlikely(!ctx->file_data))
+	if (unlikely(!ctx->file_table.data.nr))
 		return -ENXIO;
-	if (offset >= ctx->nr_user_files)
+	if (offset >= ctx->file_table.data.nr)
 		return -EINVAL;
 
-	offset = array_index_nospec(offset, ctx->nr_user_files);
-	if (!ctx->file_table.nodes[offset])
+	offset = array_index_nospec(offset, ctx->file_table.data.nr);
+	if (!ctx->file_table.data.nodes[offset])
 		return -EBADF;
-	io_put_rsrc_node(ctx->file_table.nodes[offset]);
-	ctx->file_table.nodes[offset] = NULL;
+	io_put_rsrc_node(ctx->file_table.data.nodes[offset]);
+	ctx->file_table.data.nodes[offset] = NULL;
 	io_file_bitmap_clear(&ctx->file_table, offset);
 	return 0;
 }
@@ -153,7 +147,7 @@ int io_register_file_alloc_range(struct io_ring_ctx *ctx,
 		return -EFAULT;
 	if (check_add_overflow(range.off, range.len, &end))
 		return -EOVERFLOW;
-	if (range.resv || end > ctx->nr_user_files)
+	if (range.resv || end > ctx->file_table.data.nr)
 		return -EINVAL;
 
 	io_file_table_set_alloc_range(ctx, range.off, range.len);
diff --git a/io_uring/filetable.h b/io_uring/filetable.h
index 47616079abaa..664c31502dbb 100644
--- a/io_uring/filetable.h
+++ b/io_uring/filetable.h
@@ -52,7 +52,7 @@ static inline struct file *io_slot_file(struct io_rsrc_node *node)
 static inline struct file *io_file_from_index(struct io_file_table *table,
 					      int index)
 {
-	struct io_rsrc_node *node = table->nodes[index];
+	struct io_rsrc_node *node = table->data.nodes[index];
 
 	if (node)
 		return io_slot_file(node);
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 60c947114fa3..78df515fb3a7 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -1879,11 +1879,10 @@ inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
 	struct file *file = NULL;
 
 	io_ring_submit_lock(ctx, issue_flags);
-
-	if (unlikely((unsigned int)fd >= ctx->nr_user_files))
+	if (unlikely((unsigned int)fd >= ctx->file_table.data.nr))
 		goto out;
-	fd = array_index_nospec(fd, ctx->nr_user_files);
-	node = ctx->file_table.nodes[fd];
+	fd = array_index_nospec(fd, ctx->file_table.data.nr);
+	node = ctx->file_table.data.nodes[fd];
 	if (node) {
 		io_req_assign_rsrc_node(req, node);
 		req->flags |= io_slot_flags(node);
diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c
index edea1ffd501c..b90ab3b8f5e0 100644
--- a/io_uring/msg_ring.c
+++ b/io_uring/msg_ring.c
@@ -180,8 +180,8 @@ static struct file *io_msg_grab_file(struct io_kiocb *req, unsigned int issue_fl
 	int idx = msg->src_fd;
 
 	io_ring_submit_lock(ctx, issue_flags);
-	if (likely(idx < ctx->nr_user_files)) {
-		idx = array_index_nospec(idx, ctx->nr_user_files);
+	if (likely(idx < ctx->file_table.data.nr)) {
+		idx = array_index_nospec(idx, ctx->file_table.data.nr);
 		file = io_file_from_index(&ctx->file_table, idx);
 		if (file)
 			get_file(file);
diff --git a/io_uring/net.c b/io_uring/net.c
index ce1156551d10..3e1f31574abb 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -1347,9 +1347,9 @@ static int io_send_zc_import(struct io_kiocb *req, unsigned int issue_flags)
 
 		ret = -EFAULT;
 		io_ring_submit_lock(ctx, issue_flags);
-		if (sr->buf_index < ctx->nr_user_bufs) {
-			idx = array_index_nospec(sr->buf_index, ctx->nr_user_bufs);
-			node = ctx->user_bufs[idx];
+		if (sr->buf_index < ctx->buf_table.nr) {
+			idx = array_index_nospec(sr->buf_index, ctx->buf_table.nr);
+			node = ctx->buf_table.nodes[idx];
 			io_req_assign_rsrc_node(sr->notif, node);
 			ret = 0;
 		}
diff --git a/io_uring/nop.c b/io_uring/nop.c
index de91600a3bc6..0dac01127de5 100644
--- a/io_uring/nop.c
+++ b/io_uring/nop.c
@@ -66,9 +66,9 @@ int io_nop(struct io_kiocb *req, unsigned int issue_flags)
 
 		ret = -EFAULT;
 		io_ring_submit_lock(ctx, issue_flags);
-		if (nop->buffer < ctx->nr_user_bufs) {
-			idx = array_index_nospec(nop->buffer, ctx->nr_user_bufs);
-			node = READ_ONCE(ctx->user_bufs[idx]);
+		if (nop->buffer < ctx->buf_table.nr) {
+			idx = array_index_nospec(nop->buffer, ctx->buf_table.nr);
+			node = READ_ONCE(ctx->buf_table.nodes[idx]);
 			io_req_assign_rsrc_node(req, node);
 			ret = 0;
 		}
diff --git a/io_uring/register.c b/io_uring/register.c
index 1eb686eaa310..45edfc57963a 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -937,7 +937,8 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
 	mutex_lock(&ctx->uring_lock);
 	ret = __io_uring_register(ctx, opcode, arg, nr_args);
 	mutex_unlock(&ctx->uring_lock);
-	trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret);
+	trace_io_uring_register(ctx, opcode, ctx->file_table.data.nr,
+				ctx->buf_table.nr, ret);
 	if (!use_registered_ring)
 		fput(file);
 	return ret;
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 69a9cd82460d..49a6ab5f3ae9 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -142,39 +142,28 @@ struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type)
 	return node;
 }
 
-static void io_rsrc_data_free(struct io_rsrc_data *data)
+__cold void io_rsrc_data_free(struct io_rsrc_data *data)
 {
-	int i;
-
-	for (i = 0; i < data->nr; i++) {
-		struct io_rsrc_node *node = data->nodes[i];
-
-		if (node)
-			io_put_rsrc_node(node);
+	if (!data->nr)
+		return;
+	while (data->nr--) {
+		if (data->nodes[data->nr])
+			io_put_rsrc_node(data->nodes[data->nr]);
 	}
 	kvfree(data->nodes);
-	kfree(data);
+	data->nodes = NULL;
+	data->nr = 0;
 }
 
-__cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, unsigned nr,
-				     struct io_rsrc_data **pdata)
+__cold int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr)
 {
-	struct io_rsrc_data *data;
-
-	data = kzalloc(sizeof(*data), GFP_KERNEL);
-	if (!data)
-		return -ENOMEM;
-
 	data->nodes = kvmalloc_array(nr, sizeof(struct io_rsrc_node *),
-					GFP_KERNEL | __GFP_ZERO);
-	if (!data->nodes) {
-		kfree(data);
-		return -ENOMEM;
+					GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+	if (data->nodes) {
+		data->nr = nr;
+		return 0;
 	}
-
-	data->nr = nr;
-	*pdata = data;
-	return 0;
+	return -ENOMEM;
 }
 
 static int __io_sqe_files_update(struct io_ring_ctx *ctx,
@@ -186,9 +175,9 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 	int fd, i, err = 0;
 	unsigned int done;
 
-	if (!ctx->file_data)
+	if (!ctx->file_table.data.nr)
 		return -ENXIO;
-	if (up->offset + nr_args > ctx->nr_user_files)
+	if (up->offset + nr_args > ctx->file_table.data.nr)
 		return -EINVAL;
 
 	for (done = 0; done < nr_args; done++) {
@@ -206,10 +195,10 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 		if (fd == IORING_REGISTER_FILES_SKIP)
 			continue;
 
-		i = array_index_nospec(up->offset + done, ctx->nr_user_files);
-		if (ctx->file_table.nodes[i]) {
-			io_put_rsrc_node(ctx->file_table.nodes[i]);
-			ctx->file_table.nodes[i] = NULL;
+		i = array_index_nospec(up->offset + done, ctx->file_table.data.nr);
+		if (ctx->file_table.data.nodes[i]) {
+			io_put_rsrc_node(ctx->file_table.data.nodes[i]);
+			ctx->file_table.data.nodes[i] = NULL;
 			io_file_bitmap_clear(&ctx->file_table, i);
 		}
 		if (fd != -1) {
@@ -234,7 +223,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 				fput(file);
 				break;
 			}
-			ctx->file_table.nodes[i] = node;
+			ctx->file_table.data.nodes[i] = node;
 			if (tag)
 				node->tag = tag;
 			io_fixed_file_set(node, file);
@@ -256,9 +245,9 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
 	__u32 done;
 	int i, err;
 
-	if (!ctx->buf_data)
+	if (!ctx->buf_table.nr)
 		return -ENXIO;
-	if (up->offset + nr_args > ctx->nr_user_bufs)
+	if (up->offset + nr_args > ctx->buf_table.nr)
 		return -EINVAL;
 
 	for (done = 0; done < nr_args; done++) {
@@ -282,16 +271,16 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx,
 			err = -EINVAL;
 			break;
 		}
-		i = array_index_nospec(up->offset + done, ctx->nr_user_bufs);
 		node = io_sqe_buffer_register(ctx, iov, &last_hpage);
 		if (IS_ERR(node)) {
 			err = PTR_ERR(node);
 			break;
 		}
-		if (ctx->user_bufs[i])
-			io_put_rsrc_node(ctx->user_bufs[i]);
+		i = array_index_nospec(up->offset + done, ctx->buf_table.nr);
+		if (ctx->buf_table.nodes[i])
+			io_put_rsrc_node(ctx->buf_table.nodes[i]);
 
-		ctx->user_bufs[i] = node;
+		ctx->buf_table.nodes[i] = node;
 		if (tag)
 			node->tag = tag;
 		if (ctx->compat)
@@ -409,7 +398,7 @@ static int io_files_update_with_index_alloc(struct io_kiocb *req,
 	struct file *file;
 	int ret, fd;
 
-	if (!req->ctx->file_data)
+	if (!req->ctx->file_table.data.nr)
 		return -ENXIO;
 
 	for (done = 0; done < up->nr_args; done++) {
@@ -494,35 +483,13 @@ void io_free_rsrc_node(struct io_rsrc_node *node)
 	kfree(node);
 }
 
-static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
-{
-	int i;
-
-	lockdep_assert_held(&ctx->uring_lock);
-
-	for (i = 0; i < ctx->nr_user_files; i++) {
-		struct io_rsrc_node *node = ctx->file_table.nodes[i];
-
-		if (node) {
-			io_put_rsrc_node(node);
-			io_file_bitmap_clear(&ctx->file_table, i);
-			ctx->file_table.nodes[i] = NULL;
-		}
-	}
-
-	io_free_file_tables(&ctx->file_table);
-	io_file_table_set_alloc_range(ctx, 0, 0);
-	io_rsrc_data_free(ctx->file_data);
-	ctx->file_data = NULL;
-	ctx->nr_user_files = 0;
-}
-
 int io_sqe_files_unregister(struct io_ring_ctx *ctx)
 {
-	if (!ctx->file_data)
+	if (!ctx->file_table.data.nr)
 		return -ENXIO;
 
-	__io_sqe_files_unregister(ctx);
+	io_free_file_tables(&ctx->file_table);
+	io_file_table_set_alloc_range(ctx, 0, 0);
 	return 0;
 }
 
@@ -534,7 +501,7 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 	int fd, ret;
 	unsigned i;
 
-	if (ctx->file_data)
+	if (ctx->file_table.data.nr)
 		return -EBUSY;
 	if (!nr_args)
 		return -EINVAL;
@@ -542,17 +509,10 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 		return -EMFILE;
 	if (nr_args > rlimit(RLIMIT_NOFILE))
 		return -EMFILE;
-	ret = io_rsrc_data_alloc(ctx, nr_args, &ctx->file_data);
-	if (ret)
-		return ret;
-
-	if (!io_alloc_file_tables(&ctx->file_table, nr_args)) {
-		io_rsrc_data_free(ctx->file_data);
-		ctx->file_data = NULL;
+	if (!io_alloc_file_tables(&ctx->file_table, nr_args))
 		return -ENOMEM;
-	}
 
-	for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
+	for (i = 0; i < nr_args; i++) {
 		struct io_rsrc_node *node;
 		u64 tag = 0;
 
@@ -589,44 +549,24 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 		}
 		if (tag)
 			node->tag = tag;
-		ctx->file_table.nodes[i] = node;
+		ctx->file_table.data.nodes[i] = node;
 		io_fixed_file_set(node, file);
 		io_file_bitmap_set(&ctx->file_table, i);
 	}
 
 	/* default it to the whole table */
-	io_file_table_set_alloc_range(ctx, 0, ctx->nr_user_files);
+	io_file_table_set_alloc_range(ctx, 0, ctx->file_table.data.nr);
 	return 0;
 fail:
-	__io_sqe_files_unregister(ctx);
+	io_sqe_files_unregister(ctx);
 	return ret;
 }
 
-static void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
-{
-	unsigned int i;
-
-	lockdep_assert_held(&ctx->uring_lock);
-
-	for (i = 0; i < ctx->nr_user_bufs; i++) {
-		if (ctx->user_bufs[i]) {
-			io_put_rsrc_node(ctx->user_bufs[i]);
-			ctx->user_bufs[i] = NULL;
-		}
-	}
-	kvfree(ctx->user_bufs);
-	ctx->user_bufs = NULL;
-	io_rsrc_data_free(ctx->buf_data);
-	ctx->buf_data = NULL;
-	ctx->nr_user_bufs = 0;
-}
-
 int io_sqe_buffers_unregister(struct io_ring_ctx *ctx)
 {
-	if (!ctx->buf_data)
+	if (!ctx->buf_table.nr)
 		return -ENXIO;
-
-	__io_sqe_buffers_unregister(ctx);
+	io_rsrc_data_free(&ctx->buf_table);
 	return 0;
 }
 
@@ -653,8 +593,8 @@ static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages,
 	}
 
 	/* check previously registered pages */
-	for (i = 0; i < ctx->nr_user_bufs; i++) {
-		struct io_rsrc_node *node = ctx->user_bufs[i];
+	for (i = 0; i < ctx->buf_table.nr; i++) {
+		struct io_rsrc_node *node = ctx->buf_table.nodes[i];
 		struct io_mapped_ubuf *imu = node->buf;
 
 		for (j = 0; j < imu->nr_bvecs; j++) {
@@ -805,6 +745,9 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
 	struct io_imu_folio_data data;
 	bool coalesced;
 
+	if (!iov->iov_base)
+		return rsrc_empty_node;
+
 	node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
 	if (!node)
 		return ERR_PTR(-ENOMEM);
@@ -864,40 +807,29 @@ done:
 	return node;
 }
 
-static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args)
-{
-	ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL);
-	return ctx->user_bufs ? 0 : -ENOMEM;
-}
-
 int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
 			    unsigned int nr_args, u64 __user *tags)
 {
 	struct page *last_hpage = NULL;
-	struct io_rsrc_data *data;
+	struct io_rsrc_data data;
 	struct iovec fast_iov, *iov = &fast_iov;
 	const struct iovec __user *uvec;
 	int i, ret;
 
 	BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16));
 
-	if (ctx->user_bufs)
+	if (ctx->buf_table.nr)
 		return -EBUSY;
 	if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS)
 		return -EINVAL;
-	ret = io_rsrc_data_alloc(ctx, nr_args, &data);
+	ret = io_rsrc_data_alloc(&data, nr_args);
 	if (ret)
 		return ret;
-	ret = io_buffers_map_alloc(ctx, nr_args);
-	if (ret) {
-		io_rsrc_data_free(data);
-		return ret;
-	}
 
 	if (!arg)
 		memset(iov, 0, sizeof(*iov));
 
-	for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) {
+	for (i = 0; i < nr_args; i++) {
 		struct io_rsrc_node *node;
 		u64 tag = 0;
 
@@ -935,14 +867,12 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
 		}
 		if (tag)
 			node->tag = tag;
-		ctx->user_bufs[i] = node;
+		data.nodes[i] = node;
 	}
 
-	WARN_ON_ONCE(ctx->buf_data);
-
-	ctx->buf_data = data;
+	ctx->buf_table = data;
 	if (ret)
-		__io_sqe_buffers_unregister(ctx);
+		io_sqe_buffers_unregister(ctx);
 	return ret;
 }
 
@@ -1009,8 +939,7 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
 
 static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx)
 {
-	struct io_rsrc_node **user_bufs;
-	struct io_rsrc_data *data;
+	struct io_rsrc_data data;
 	int i, ret, nbufs;
 
 	/*
@@ -1021,43 +950,37 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
 
 	mutex_lock(&src_ctx->uring_lock);
 	ret = -ENXIO;
-	nbufs = src_ctx->nr_user_bufs;
+	nbufs = src_ctx->buf_table.nr;
 	if (!nbufs)
 		goto out_unlock;
-	ret = io_rsrc_data_alloc(ctx, nbufs, &data);
+	ret = io_rsrc_data_alloc(&data, nbufs);
 	if (ret)
 		goto out_unlock;
 
-	ret = -ENOMEM;
-	user_bufs = kvmalloc_array(nbufs, sizeof(struct io_rsrc_node *),
-					GFP_KERNEL | __GFP_ZERO);
-	if (!user_bufs)
-		goto out_free_data;
-
 	for (i = 0; i < nbufs; i++) {
-		struct io_rsrc_node *src_node = src_ctx->user_bufs[i];
+		struct io_rsrc_node *src_node = src_ctx->buf_table.nodes[i];
 		struct io_rsrc_node *dst_node;
 
 		if (src_node == rsrc_empty_node) {
 			dst_node = rsrc_empty_node;
 		} else {
 			dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
-			if (!dst_node)
+			if (!dst_node) {
+				ret = -ENOMEM;
 				goto out_put_free;
+			}
 
 			refcount_inc(&src_node->buf->refs);
 			dst_node->buf = src_node->buf;
 		}
-		user_bufs[i] = dst_node;
+		data.nodes[i] = dst_node;
 	}
 
 	/* Have a ref on the bufs now, drop src lock and re-grab our own lock */
 	mutex_unlock(&src_ctx->uring_lock);
 	mutex_lock(&ctx->uring_lock);
-	if (!ctx->user_bufs) {
-		ctx->user_bufs = user_bufs;
-		ctx->buf_data = data;
-		ctx->nr_user_bufs = nbufs;
+	if (!ctx->buf_table.nr) {
+		ctx->buf_table = data;
 		return 0;
 	}
 
@@ -1068,12 +991,10 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
 	i = nbufs;
 out_put_free:
 	while (i--) {
-		io_buffer_unmap(src_ctx, user_bufs[i]);
-		kfree(user_bufs[i]);
+		io_buffer_unmap(src_ctx, data.nodes[i]);
+		kfree(data.nodes[i]);
 	}
-	kvfree(user_bufs);
-out_free_data:
-	io_rsrc_data_free(data);
+	io_rsrc_data_free(&data);
 out_unlock:
 	mutex_unlock(&src_ctx->uring_lock);
 	mutex_lock(&ctx->uring_lock);
@@ -1094,7 +1015,7 @@ int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
 	struct file *file;
 	int ret;
 
-	if (ctx->user_bufs || ctx->nr_user_bufs)
+	if (ctx->buf_table.nr)
 		return -EBUSY;
 	if (copy_from_user(&buf, arg, sizeof(buf)))
 		return -EFAULT;
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index 48f712488f6b..569ea9ce1405 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -13,11 +13,6 @@ enum {
 	IORING_RSRC_BUFFER		= 1,
 };
 
-struct io_rsrc_data {
-	unsigned int			nr;
-	struct io_rsrc_node		**nodes;
-};
-
 struct io_rsrc_node {
 	struct io_ring_ctx		*ctx;
 	int				refs;
@@ -50,6 +45,8 @@ struct io_imu_folio_data {
 
 struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type);
 void io_free_rsrc_node(struct io_rsrc_node *node);
+void io_rsrc_data_free(struct io_rsrc_data *data);
+int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr);
 
 int io_import_fixed(int ddir, struct iov_iter *iter,
 			   struct io_mapped_ubuf *imu,
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 65491f4f2c7e..28fff18ebb19 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -339,10 +339,10 @@ static int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe
 	if (unlikely(ret))
 		return ret;
 
-	if (unlikely(req->buf_index >= ctx->nr_user_bufs))
+	if (unlikely(req->buf_index >= ctx->buf_table.nr))
 		return -EFAULT;
-	index = array_index_nospec(req->buf_index, ctx->nr_user_bufs);
-	node = ctx->user_bufs[index];
+	index = array_index_nospec(req->buf_index, ctx->buf_table.nr);
+	node = ctx->buf_table.nodes[index];
 	io_req_assign_rsrc_node(req, node);
 
 	io = req->async_data;
diff --git a/io_uring/splice.c b/io_uring/splice.c
index f78afb575ae6..aaaddb66e90a 100644
--- a/io_uring/splice.c
+++ b/io_uring/splice.c
@@ -66,10 +66,10 @@ static struct file *io_splice_get_file(struct io_kiocb *req,
 		return io_file_get_normal(req, sp->splice_fd_in);
 
 	io_ring_submit_lock(ctx, issue_flags);
-	if (unlikely(sp->splice_fd_in >= ctx->nr_user_files))
+	if (unlikely(sp->splice_fd_in >= ctx->file_table.data.nr))
 		goto out;
-	sp->splice_fd_in = array_index_nospec(sp->splice_fd_in, ctx->nr_user_files);
-	node = ctx->file_table.nodes[sp->splice_fd_in];
+	sp->splice_fd_in = array_index_nospec(sp->splice_fd_in, ctx->file_table.data.nr);
+	node = ctx->file_table.data.nodes[sp->splice_fd_in];
 	if (node) {
 		node->refs++;
 		sp->rsrc_node = node;
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index 0899c71008ae..17d5f5004702 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -212,15 +212,15 @@ int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 		u16 index;
 
 		index = READ_ONCE(sqe->buf_index);
-		if (unlikely(index >= ctx->nr_user_bufs))
+		if (unlikely(index >= ctx->buf_table.nr))
 			return -EFAULT;
-		req->buf_index = array_index_nospec(index, ctx->nr_user_bufs);
+		req->buf_index = array_index_nospec(index, ctx->buf_table.nr);
 		/*
 		 * Pi node upfront, prior to io_uring_cmd_import_fixed()
 		 * being called. This prevents destruction of the mapped buffer
 		 * we'll need at actual import time.
 		 */
-		io_req_assign_rsrc_node(req, ctx->user_bufs[req->buf_index]);
+		io_req_assign_rsrc_node(req, ctx->buf_table.nodes[req->buf_index]);
 	}
 	ioucmd->cmd_op = READ_ONCE(sqe->cmd_op);
 
-- 
cgit v1.2.3


From 01ee194d1aba1202f0926d5047a2a4cf84d0e45d Mon Sep 17 00:00:00 2001
From: hexue <xue01.he@samsung.com>
Date: Fri, 1 Nov 2024 17:19:57 +0800
Subject: io_uring: add support for hybrid IOPOLL

A new hybrid poll is implemented on the io_uring layer. Once an IO is
issued, it will not poll immediately, but rather block first and re-run
before IO complete, then poll to reap IO. While this poll method could
be a suboptimal solution when running on a single thread, it offers
performance lower than regular polling but higher than IRQ, and CPU
utilization is also lower than polling.

To use hybrid polling, the ring must be setup with both the
IORING_SETUP_IOPOLL and IORING_SETUP_HYBRID)IOPOLL flags set. Hybrid
polling has the same restrictions as IOPOLL, in that commands must
explicitly support it.

Signed-off-by: hexue <xue01.he@samsung.com>
Link: https://lore.kernel.org/r/20241101091957.564220-2-xue01.he@samsung.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h | 19 ++++++++-
 include/uapi/linux/io_uring.h  |  3 ++
 io_uring/io_uring.c            |  8 +++-
 io_uring/rw.c                  | 92 +++++++++++++++++++++++++++++++++++++-----
 4 files changed, 108 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 77fd508d043a..d52fec533c51 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -298,6 +298,11 @@ struct io_ring_ctx {
 		 * ->uring_cmd() by io_uring_cmd_insert_cancelable()
 		 */
 		struct hlist_head	cancelable_uring_cmd;
+		/*
+		 * For Hybrid IOPOLL, runtime in hybrid polling, without
+		 * scheduling time
+		 */
+		u64					hybrid_poll_time;
 	} ____cacheline_aligned_in_smp;
 
 	struct {
@@ -449,6 +454,7 @@ enum {
 	REQ_F_LINK_TIMEOUT_BIT,
 	REQ_F_NEED_CLEANUP_BIT,
 	REQ_F_POLLED_BIT,
+	REQ_F_HYBRID_IOPOLL_STATE_BIT,
 	REQ_F_BUFFER_SELECTED_BIT,
 	REQ_F_BUFFER_RING_BIT,
 	REQ_F_REISSUE_BIT,
@@ -507,6 +513,8 @@ enum {
 	REQ_F_NEED_CLEANUP	= IO_REQ_FLAG(REQ_F_NEED_CLEANUP_BIT),
 	/* already went through poll handler */
 	REQ_F_POLLED		= IO_REQ_FLAG(REQ_F_POLLED_BIT),
+	/* every req only blocks once in hybrid poll */
+	REQ_F_IOPOLL_STATE        = IO_REQ_FLAG(REQ_F_HYBRID_IOPOLL_STATE_BIT),
 	/* buffer already selected */
 	REQ_F_BUFFER_SELECTED	= IO_REQ_FLAG(REQ_F_BUFFER_SELECTED_BIT),
 	/* buffer selected from ring, needs commit */
@@ -639,8 +647,15 @@ struct io_kiocb {
 	atomic_t			refs;
 	bool				cancel_seq_set;
 	struct io_task_work		io_task_work;
-	/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
-	struct hlist_node		hash_node;
+	union {
+		/*
+		 * for polled requests, i.e. IORING_OP_POLL_ADD and async armed
+		 * poll
+		 */
+		struct hlist_node	hash_node;
+		/* For IOPOLL setup queues, with hybrid polling */
+		u64                     iopoll_start;
+	};
 	/* internal polling, see IORING_FEAT_FAST_POLL */
 	struct async_poll		*apoll;
 	/* opcode allocated if it needs to store data for async defer */
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index ce58c4590de6..47977a5c65f5 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -200,6 +200,9 @@ enum io_uring_sqe_flags_bit {
  */
 #define IORING_SETUP_NO_SQARRAY		(1U << 16)
 
+/* Use hybrid poll in iopoll process */
+#define IORING_SETUP_HYBRID_IOPOLL	(1U << 17)
+
 enum io_uring_op {
 	IORING_OP_NOP,
 	IORING_OP_READV,
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 44a772013c09..f08ea7fd5998 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -307,6 +307,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 		goto err;
 
 	ctx->flags = p->flags;
+	ctx->hybrid_poll_time = LLONG_MAX;
 	atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT);
 	init_waitqueue_head(&ctx->sqo_sq_wait);
 	INIT_LIST_HEAD(&ctx->sqd_list);
@@ -3630,6 +3631,11 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
 	if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
 		static_branch_inc(&io_key_has_sqarray);
 
+	/* HYBRID_IOPOLL only valid with IOPOLL */
+	if ((ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_HYBRID_IOPOLL)) ==
+			IORING_SETUP_HYBRID_IOPOLL)
+		return -EINVAL;
+
 	if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) &&
 	    !(ctx->flags & IORING_SETUP_IOPOLL) &&
 	    !(ctx->flags & IORING_SETUP_SQPOLL))
@@ -3785,7 +3791,7 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
 			IORING_SETUP_SQE128 | IORING_SETUP_CQE32 |
 			IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN |
 			IORING_SETUP_NO_MMAP | IORING_SETUP_REGISTERED_FD_ONLY |
-			IORING_SETUP_NO_SQARRAY))
+			IORING_SETUP_NO_SQARRAY | IORING_SETUP_HYBRID_IOPOLL))
 		return -EINVAL;
 
 	return io_uring_create(entries, &p, params);
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 30448f343c7f..1ea6be2ccc90 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -817,6 +817,11 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type)
 		kiocb->ki_flags |= IOCB_HIPRI;
 		kiocb->ki_complete = io_complete_rw_iopoll;
 		req->iopoll_completed = 0;
+		if (ctx->flags & IORING_SETUP_HYBRID_IOPOLL) {
+			/* make sure every req only blocks once*/
+			req->flags &= ~REQ_F_IOPOLL_STATE;
+			req->iopoll_start = ktime_get_ns();
+		}
 	} else {
 		if (kiocb->ki_flags & IOCB_HIPRI)
 			return -EINVAL;
@@ -1115,6 +1120,78 @@ void io_rw_fail(struct io_kiocb *req)
 	io_req_set_res(req, res, req->cqe.flags);
 }
 
+static int io_uring_classic_poll(struct io_kiocb *req, struct io_comp_batch *iob,
+				unsigned int poll_flags)
+{
+	struct file *file = req->file;
+
+	if (req->opcode == IORING_OP_URING_CMD) {
+		struct io_uring_cmd *ioucmd;
+
+		ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
+		return file->f_op->uring_cmd_iopoll(ioucmd, iob, poll_flags);
+	} else {
+		struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
+
+		return file->f_op->iopoll(&rw->kiocb, iob, poll_flags);
+	}
+}
+
+static u64 io_hybrid_iopoll_delay(struct io_ring_ctx *ctx, struct io_kiocb *req)
+{
+	struct hrtimer_sleeper timer;
+	enum hrtimer_mode mode;
+	ktime_t kt;
+	u64 sleep_time;
+
+	if (req->flags & REQ_F_IOPOLL_STATE)
+		return 0;
+
+	if (ctx->hybrid_poll_time == LLONG_MAX)
+		return 0;
+
+	/* Using half the running time to do schedule */
+	sleep_time = ctx->hybrid_poll_time / 2;
+
+	kt = ktime_set(0, sleep_time);
+	req->flags |= REQ_F_IOPOLL_STATE;
+
+	mode = HRTIMER_MODE_REL;
+	hrtimer_init_sleeper_on_stack(&timer, CLOCK_MONOTONIC, mode);
+	hrtimer_set_expires(&timer.timer, kt);
+	set_current_state(TASK_INTERRUPTIBLE);
+	hrtimer_sleeper_start_expires(&timer, mode);
+
+	if (timer.task)
+		io_schedule();
+
+	hrtimer_cancel(&timer.timer);
+	__set_current_state(TASK_RUNNING);
+	destroy_hrtimer_on_stack(&timer.timer);
+	return sleep_time;
+}
+
+static int io_uring_hybrid_poll(struct io_kiocb *req,
+				struct io_comp_batch *iob, unsigned int poll_flags)
+{
+	struct io_ring_ctx *ctx = req->ctx;
+	u64 runtime, sleep_time;
+	int ret;
+
+	sleep_time = io_hybrid_iopoll_delay(ctx, req);
+	ret = io_uring_classic_poll(req, iob, poll_flags);
+	runtime = ktime_get_ns() - req->iopoll_start - sleep_time;
+
+	/*
+	 * Use minimum sleep time if we're polling devices with different
+	 * latencies. We could get more completions from the faster ones.
+	 */
+	if (ctx->hybrid_poll_time > runtime)
+		ctx->hybrid_poll_time = runtime;
+
+	return ret;
+}
+
 int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
 {
 	struct io_wq_work_node *pos, *start, *prev;
@@ -1131,7 +1208,6 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
 
 	wq_list_for_each(pos, start, &ctx->iopoll_list) {
 		struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list);
-		struct file *file = req->file;
 		int ret;
 
 		/*
@@ -1142,17 +1218,11 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
 		if (READ_ONCE(req->iopoll_completed))
 			break;
 
-		if (req->opcode == IORING_OP_URING_CMD) {
-			struct io_uring_cmd *ioucmd;
-
-			ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd);
-			ret = file->f_op->uring_cmd_iopoll(ioucmd, &iob,
-								poll_flags);
-		} else {
-			struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
+		if (ctx->flags & IORING_SETUP_HYBRID_IOPOLL)
+			ret = io_uring_hybrid_poll(req, &iob, poll_flags);
+		else
+			ret = io_uring_classic_poll(req, &iob, poll_flags);
 
-			ret = file->f_op->iopoll(&rw->kiocb, &iob, poll_flags);
-		}
 		if (unlikely(ret < 0))
 			return ret;
 		else if (ret)
-- 
cgit v1.2.3


From 6f94cbc29adacc15007c5a16295052e674099282 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Sun, 3 Nov 2024 08:46:07 -0700
Subject: io_uring/rsrc: split io_kiocb node type assignments

Currently the io_rsrc_node assignment in io_kiocb is an array of two
pointers, as two nodes may be assigned to a request - one file node,
and one buffer node. However, the buffer node can co-exist with the
provided buffers, as currently it's not supported to use both provided
and registered buffers at the same time.

This crucially brings struct io_kiocb down to 4 cache lines again, as
before it spilled into the 5th cacheline.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h |  7 ++++++-
 io_uring/io_uring.c            |  6 +++---
 io_uring/net.c                 |  3 ++-
 io_uring/nop.c                 |  3 ++-
 io_uring/notif.c               |  4 ++--
 io_uring/rsrc.h                | 16 ++++++++++------
 io_uring/rw.c                  |  3 ++-
 io_uring/uring_cmd.c           |  4 ++--
 8 files changed, 29 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index d52fec533c51..01e7fb9fcfe2 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -475,6 +475,7 @@ enum {
 	REQ_F_BL_EMPTY_BIT,
 	REQ_F_BL_NO_RECYCLE_BIT,
 	REQ_F_BUFFERS_COMMIT_BIT,
+	REQ_F_BUF_NODE_BIT,
 
 	/* not a real bit, just to check we're not overflowing the space */
 	__REQ_F_LAST_BIT,
@@ -553,6 +554,8 @@ enum {
 	REQ_F_BL_NO_RECYCLE	= IO_REQ_FLAG(REQ_F_BL_NO_RECYCLE_BIT),
 	/* buffer ring head needs incrementing on put */
 	REQ_F_BUFFERS_COMMIT	= IO_REQ_FLAG(REQ_F_BUFFERS_COMMIT_BIT),
+	/* buf node is valid */
+	REQ_F_BUF_NODE		= IO_REQ_FLAG(REQ_F_BUF_NODE_BIT),
 };
 
 typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts);
@@ -633,6 +636,8 @@ struct io_kiocb {
 		 * REQ_F_BUFFER_RING is set.
 		 */
 		struct io_buffer_list	*buf_list;
+
+		struct io_rsrc_node	*buf_node;
 	};
 
 	union {
@@ -642,7 +647,7 @@ struct io_kiocb {
 		__poll_t apoll_events;
 	};
 
-	struct io_rsrc_node		*rsrc_nodes[2];
+	struct io_rsrc_node		*file_node;
 
 	atomic_t			refs;
 	bool				cancel_seq_set;
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index f08ea7fd5998..5bab8a3b0456 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -948,8 +948,8 @@ void io_req_defer_failed(struct io_kiocb *req, s32 res)
 static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx)
 {
 	req->ctx = ctx;
-	req->rsrc_nodes[IORING_RSRC_FILE] = NULL;
-	req->rsrc_nodes[IORING_RSRC_BUFFER] = NULL;
+	req->buf_node = NULL;
+	req->file_node = NULL;
 	req->link = NULL;
 	req->async_data = NULL;
 	/* not necessary, but safer to zero */
@@ -1882,7 +1882,7 @@ inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
 	io_ring_submit_lock(ctx, issue_flags);
 	node = io_rsrc_node_lookup(&ctx->file_table.data, fd);
 	if (node) {
-		io_req_assign_rsrc_node(req, node);
+		io_req_assign_rsrc_node(&req->file_node, node);
 		req->flags |= io_slot_flags(node);
 		file = io_slot_file(node);
 	}
diff --git a/io_uring/net.c b/io_uring/net.c
index 2f7b334ed708..2ccc2b409431 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -1348,7 +1348,8 @@ static int io_send_zc_import(struct io_kiocb *req, unsigned int issue_flags)
 		io_ring_submit_lock(ctx, issue_flags);
 		node = io_rsrc_node_lookup(&ctx->buf_table, sr->buf_index);
 		if (node) {
-			io_req_assign_rsrc_node(sr->notif, node);
+			io_req_assign_rsrc_node(&sr->notif->buf_node, node);
+			sr->notif->flags |= REQ_F_BUF_NODE;
 			ret = 0;
 		}
 		io_ring_submit_unlock(ctx, issue_flags);
diff --git a/io_uring/nop.c b/io_uring/nop.c
index 149dbdc53607..bc22bcc739f3 100644
--- a/io_uring/nop.c
+++ b/io_uring/nop.c
@@ -67,7 +67,8 @@ int io_nop(struct io_kiocb *req, unsigned int issue_flags)
 		io_ring_submit_lock(ctx, issue_flags);
 		node = io_rsrc_node_lookup(&ctx->buf_table, nop->buffer);
 		if (node) {
-			io_req_assign_rsrc_node(req, node);
+			io_req_assign_rsrc_node(&req->buf_node, node);
+			req->flags |= REQ_F_BUF_NODE;
 			ret = 0;
 		}
 		io_ring_submit_unlock(ctx, issue_flags);
diff --git a/io_uring/notif.c b/io_uring/notif.c
index 4f02e969cf08..8dfbb0bd8e4d 100644
--- a/io_uring/notif.c
+++ b/io_uring/notif.c
@@ -117,8 +117,8 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx)
 	notif->file = NULL;
 	notif->task = current;
 	io_get_task_refs(1);
-	notif->rsrc_nodes[IORING_RSRC_FILE] = NULL;
-	notif->rsrc_nodes[IORING_RSRC_BUFFER] = NULL;
+	notif->file_node = NULL;
+	notif->buf_node = NULL;
 
 	nd = io_notif_to_data(notif);
 	nd->zc_report = false;
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index 9a8fac31fa49..bc3a863b14bb 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -95,10 +95,14 @@ static inline bool io_reset_rsrc_node(struct io_rsrc_data *data, int index)
 
 static inline void io_req_put_rsrc_nodes(struct io_kiocb *req)
 {
-	io_put_rsrc_node(req->rsrc_nodes[IORING_RSRC_FILE]);
-	io_put_rsrc_node(req->rsrc_nodes[IORING_RSRC_BUFFER]);
-	req->rsrc_nodes[IORING_RSRC_FILE] = NULL;
-	req->rsrc_nodes[IORING_RSRC_BUFFER] = NULL;
+	if (req->file_node) {
+		io_put_rsrc_node(req->file_node);
+		req->file_node = NULL;
+	}
+	if (req->flags & REQ_F_BUF_NODE) {
+		io_put_rsrc_node(req->buf_node);
+		req->buf_node = NULL;
+	}
 }
 
 static inline struct io_ring_ctx *io_rsrc_node_ctx(struct io_rsrc_node *node)
@@ -111,11 +115,11 @@ static inline int io_rsrc_node_type(struct io_rsrc_node *node)
 	return node->ctx_ptr & IORING_RSRC_TYPE_MASK;
 }
 
-static inline void io_req_assign_rsrc_node(struct io_kiocb *req,
+static inline void io_req_assign_rsrc_node(struct io_rsrc_node **dst_node,
 					   struct io_rsrc_node *node)
 {
 	node->refs++;
-	req->rsrc_nodes[io_rsrc_node_type(node)] = node;
+	*dst_node = node;
 }
 
 int io_files_update(struct io_kiocb *req, unsigned int issue_flags);
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 1ea6be2ccc90..144730344c0f 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -341,7 +341,8 @@ static int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe
 	node = io_rsrc_node_lookup(&ctx->buf_table, req->buf_index);
 	if (!node)
 		return -EFAULT;
-	io_req_assign_rsrc_node(req, node);
+	io_req_assign_rsrc_node(&req->buf_node, node);
+	req->flags |= REQ_F_BUF_NODE;
 
 	io = req->async_data;
 	ret = io_import_fixed(ddir, &io->iter, node->buf, rw->addr, rw->len);
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index 535909a38e76..88a73d21fc0b 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -219,7 +219,7 @@ int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 		 * being called. This prevents destruction of the mapped buffer
 		 * we'll need at actual import time.
 		 */
-		io_req_assign_rsrc_node(req, node);
+		io_req_assign_rsrc_node(&req->buf_node, node);
 	}
 	ioucmd->cmd_op = READ_ONCE(sqe->cmd_op);
 
@@ -275,7 +275,7 @@ int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
 			      struct iov_iter *iter, void *ioucmd)
 {
 	struct io_kiocb *req = cmd_to_io_kiocb(ioucmd);
-	struct io_rsrc_node *node = req->rsrc_nodes[IORING_RSRC_BUFFER];
+	struct io_rsrc_node *node = req->buf_node;
 
 	/* Must have had rsrc_node assigned at prep time */
 	if (node)
-- 
cgit v1.2.3


From b6f58a3f4aa8dba424356c7a69388a81f4459300 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Sun, 3 Nov 2024 10:23:38 -0700
Subject: io_uring: move struct io_kiocb from task_struct to io_uring_task

Rather than store the task_struct itself in struct io_kiocb, store
the io_uring specific task_struct. The life times are the same in terms
of io_uring, and this avoids doing some dereferences through the
task_struct. For the hot path of putting local task references, we can
deref req->tctx instead, which we'll need anyway in that function
regardless of whether it's local or remote references.

This is mostly straight forward, except the original task PF_EXITING
check needs a bit of tweaking. task_work is _always_ run from the
originating task, except in the fallback case, where it's run from a
kernel thread. Replace the potentially racy (in case of fallback work)
checks for req->task->flags with current->flags. It's either the still
the original task, in which case PF_EXITING will be sane, or it has
PF_KTHREAD set, in which case it's fallback work. Both cases should
prevent moving forward with the given request.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring/cmd.h   |  2 +-
 include/linux/io_uring_types.h |  3 ++-
 io_uring/cancel.c              |  2 +-
 io_uring/fdinfo.c              |  2 +-
 io_uring/io_uring.c            | 34 +++++++++++++++-------------------
 io_uring/io_uring.h            | 13 +++++++++++++
 io_uring/msg_ring.c            |  4 ++--
 io_uring/notif.c               |  4 ++--
 io_uring/poll.c                |  3 +--
 io_uring/rw.c                  |  2 +-
 io_uring/tctx.c                |  1 +
 io_uring/timeout.c             | 10 ++++++----
 io_uring/uring_cmd.c           |  2 +-
 io_uring/waitid.c              |  2 +-
 14 files changed, 48 insertions(+), 36 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/io_uring/cmd.h b/include/linux/io_uring/cmd.h
index c189d36ad55e..578a3fdf5c71 100644
--- a/include/linux/io_uring/cmd.h
+++ b/include/linux/io_uring/cmd.h
@@ -110,7 +110,7 @@ static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
 
 static inline struct task_struct *io_uring_cmd_get_task(struct io_uring_cmd *cmd)
 {
-	return cmd_to_io_kiocb(cmd)->task;
+	return cmd_to_io_kiocb(cmd)->tctx->task;
 }
 
 #endif /* _LINUX_IO_URING_CMD_H */
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 01e7fb9fcfe2..fba2988accc3 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -84,6 +84,7 @@ struct io_uring_task {
 	/* submission side */
 	int				cached_refs;
 	const struct io_ring_ctx 	*last;
+	struct task_struct		*task;
 	struct io_wq			*io_wq;
 	struct file			*registered_rings[IO_RINGFD_REG_MAX];
 
@@ -625,7 +626,7 @@ struct io_kiocb {
 	struct io_cqe			cqe;
 
 	struct io_ring_ctx		*ctx;
-	struct task_struct		*task;
+	struct io_uring_task		*tctx;
 
 	union {
 		/* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
diff --git a/io_uring/cancel.c b/io_uring/cancel.c
index bbca5cb69cb5..484193567839 100644
--- a/io_uring/cancel.c
+++ b/io_uring/cancel.c
@@ -205,7 +205,7 @@ int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags)
 		.opcode	= cancel->opcode,
 		.seq	= atomic_inc_return(&req->ctx->cancel_seq),
 	};
-	struct io_uring_task *tctx = req->task->io_uring;
+	struct io_uring_task *tctx = req->tctx;
 	int ret;
 
 	if (cd.flags & IORING_ASYNC_CANCEL_FD) {
diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
index 8da0d9e4533a..efbec34ccb18 100644
--- a/io_uring/fdinfo.c
+++ b/io_uring/fdinfo.c
@@ -203,7 +203,7 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
 
 		hlist_for_each_entry(req, &hb->list, hash_node)
 			seq_printf(m, "  op=%d, task_works=%d\n", req->opcode,
-					task_work_pending(req->task));
+					task_work_pending(req->tctx->task));
 	}
 
 	if (has_lock)
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 43afd9da7d07..7c1ca36b117b 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -206,7 +206,7 @@ bool io_match_task_safe(struct io_kiocb *head, struct io_uring_task *tctx,
 {
 	bool matched;
 
-	if (tctx && head->task->io_uring != tctx)
+	if (tctx && head->tctx != tctx)
 		return false;
 	if (cancel_all)
 		return true;
@@ -407,11 +407,8 @@ static void io_clean_op(struct io_kiocb *req)
 		kfree(req->apoll);
 		req->apoll = NULL;
 	}
-	if (req->flags & REQ_F_INFLIGHT) {
-		struct io_uring_task *tctx = req->task->io_uring;
-
-		atomic_dec(&tctx->inflight_tracked);
-	}
+	if (req->flags & REQ_F_INFLIGHT)
+		atomic_dec(&req->tctx->inflight_tracked);
 	if (req->flags & REQ_F_CREDS)
 		put_cred(req->creds);
 	if (req->flags & REQ_F_ASYNC_DATA) {
@@ -425,7 +422,7 @@ static inline void io_req_track_inflight(struct io_kiocb *req)
 {
 	if (!(req->flags & REQ_F_INFLIGHT)) {
 		req->flags |= REQ_F_INFLIGHT;
-		atomic_inc(&req->task->io_uring->inflight_tracked);
+		atomic_inc(&req->tctx->inflight_tracked);
 	}
 }
 
@@ -514,7 +511,7 @@ static void io_prep_async_link(struct io_kiocb *req)
 static void io_queue_iowq(struct io_kiocb *req)
 {
 	struct io_kiocb *link = io_prep_linked_timeout(req);
-	struct io_uring_task *tctx = req->task->io_uring;
+	struct io_uring_task *tctx = req->tctx;
 
 	BUG_ON(!tctx);
 	BUG_ON(!tctx->io_wq);
@@ -529,7 +526,7 @@ static void io_queue_iowq(struct io_kiocb *req)
 	 * procedure rather than attempt to run this request (or create a new
 	 * worker for it).
 	 */
-	if (WARN_ON_ONCE(!same_thread_group(req->task, current)))
+	if (WARN_ON_ONCE(!same_thread_group(tctx->task, current)))
 		atomic_or(IO_WQ_WORK_CANCEL, &req->work.flags);
 
 	trace_io_uring_queue_async_work(req, io_wq_is_hashed(&req->work));
@@ -678,17 +675,17 @@ static void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx)
 }
 
 /* must to be called somewhat shortly after putting a request */
-static inline void io_put_task(struct task_struct *task)
+static inline void io_put_task(struct io_kiocb *req)
 {
-	struct io_uring_task *tctx = task->io_uring;
+	struct io_uring_task *tctx = req->tctx;
 
-	if (likely(task == current)) {
+	if (likely(tctx->task == current)) {
 		tctx->cached_refs++;
 	} else {
 		percpu_counter_sub(&tctx->inflight, 1);
 		if (unlikely(atomic_read(&tctx->in_cancel)))
 			wake_up(&tctx->wait);
-		put_task_struct(task);
+		put_task_struct(tctx->task);
 	}
 }
 
@@ -1207,7 +1204,7 @@ static inline void io_req_local_work_add(struct io_kiocb *req,
 
 static void io_req_normal_work_add(struct io_kiocb *req)
 {
-	struct io_uring_task *tctx = req->task->io_uring;
+	struct io_uring_task *tctx = req->tctx;
 	struct io_ring_ctx *ctx = req->ctx;
 
 	/* task_work already pending, we're done */
@@ -1226,7 +1223,7 @@ static void io_req_normal_work_add(struct io_kiocb *req)
 		return;
 	}
 
-	if (likely(!task_work_add(req->task, &tctx->task_work, ctx->notify_method)))
+	if (likely(!task_work_add(tctx->task, &tctx->task_work, ctx->notify_method)))
 		return;
 
 	io_fallback_tw(tctx, false);
@@ -1343,8 +1340,7 @@ static void io_req_task_cancel(struct io_kiocb *req, struct io_tw_state *ts)
 void io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts)
 {
 	io_tw_lock(req->ctx, ts);
-	/* req->task == current here, checking PF_EXITING is safe */
-	if (unlikely(req->task->flags & PF_EXITING))
+	if (unlikely(io_should_terminate_tw()))
 		io_req_defer_failed(req, -EFAULT);
 	else if (req->flags & REQ_F_FORCE_ASYNC)
 		io_queue_iowq(req);
@@ -1403,7 +1399,7 @@ static void io_free_batch_list(struct io_ring_ctx *ctx,
 		}
 		io_put_file(req);
 		io_req_put_rsrc_nodes(req);
-		io_put_task(req->task);
+		io_put_task(req);
 
 		node = req->comp_list.next;
 		io_req_add_to_cache(req, ctx);
@@ -2019,7 +2015,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	req->flags = (__force io_req_flags_t) sqe_flags;
 	req->cqe.user_data = READ_ONCE(sqe->user_data);
 	req->file = NULL;
-	req->task = current;
+	req->tctx = current->io_uring;
 	req->cancel_seq_set = false;
 
 	if (unlikely(opcode >= IORING_OP_LAST)) {
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 17ffdb1e41c5..702c8e987430 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -426,6 +426,19 @@ static inline bool io_allowed_run_tw(struct io_ring_ctx *ctx)
 		      ctx->submitter_task == current);
 }
 
+/*
+ * Terminate the request if either of these conditions are true:
+ *
+ * 1) It's being executed by the original task, but that task is marked
+ *    with PF_EXITING as it's exiting.
+ * 2) PF_KTHREAD is set, in which case the invoker of the task_work is
+ *    our fallback task_work.
+ */
+static inline bool io_should_terminate_tw(void)
+{
+	return current->flags & (PF_KTHREAD | PF_EXITING);
+}
+
 static inline void io_req_queue_tw_complete(struct io_kiocb *req, s32 res)
 {
 	io_req_set_res(req, res, 0);
diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c
index 99af39e1d0fb..e63af34004b7 100644
--- a/io_uring/msg_ring.c
+++ b/io_uring/msg_ring.c
@@ -89,8 +89,8 @@ static void io_msg_tw_complete(struct io_kiocb *req, struct io_tw_state *ts)
 static int io_msg_remote_post(struct io_ring_ctx *ctx, struct io_kiocb *req,
 			      int res, u32 cflags, u64 user_data)
 {
-	req->task = READ_ONCE(ctx->submitter_task);
-	if (!req->task) {
+	req->tctx = READ_ONCE(ctx->submitter_task->io_uring);
+	if (!req->tctx) {
 		kmem_cache_free(req_cachep, req);
 		return -EOWNERDEAD;
 	}
diff --git a/io_uring/notif.c b/io_uring/notif.c
index 8dfbb0bd8e4d..ee3a33510b3c 100644
--- a/io_uring/notif.c
+++ b/io_uring/notif.c
@@ -89,7 +89,7 @@ static int io_link_skb(struct sk_buff *skb, struct ubuf_info *uarg)
 
 	/* make sure all noifications can be finished in the same task_work */
 	if (unlikely(notif->ctx != prev_notif->ctx ||
-		     notif->task != prev_notif->task))
+		     notif->tctx != prev_notif->tctx))
 		return -EEXIST;
 
 	nd->head = prev_nd->head;
@@ -115,7 +115,7 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx)
 	notif->opcode = IORING_OP_NOP;
 	notif->flags = 0;
 	notif->file = NULL;
-	notif->task = current;
+	notif->tctx = current->io_uring;
 	io_get_task_refs(1);
 	notif->file_node = NULL;
 	notif->buf_node = NULL;
diff --git a/io_uring/poll.c b/io_uring/poll.c
index 7db3010b5733..bced9edd5233 100644
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -224,8 +224,7 @@ static int io_poll_check_events(struct io_kiocb *req, struct io_tw_state *ts)
 {
 	int v;
 
-	/* req->task == current here, checking PF_EXITING is safe */
-	if (unlikely(req->task->flags & PF_EXITING))
+	if (unlikely(io_should_terminate_tw()))
 		return -ECANCELED;
 
 	do {
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 144730344c0f..e368b9afde03 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -435,7 +435,7 @@ static bool io_rw_should_reissue(struct io_kiocb *req)
 	 * Play it safe and assume not safe to re-import and reissue if we're
 	 * not in the original thread group (or in task context).
 	 */
-	if (!same_thread_group(req->task, current) || !in_task())
+	if (!same_thread_group(req->tctx->task, current) || !in_task())
 		return false;
 	return true;
 }
diff --git a/io_uring/tctx.c b/io_uring/tctx.c
index c043fe93a3f2..503f3ff8bc4f 100644
--- a/io_uring/tctx.c
+++ b/io_uring/tctx.c
@@ -81,6 +81,7 @@ __cold int io_uring_alloc_task_context(struct task_struct *task,
 		return ret;
 	}
 
+	tctx->task = task;
 	xa_init(&tctx->xa);
 	init_waitqueue_head(&tctx->wait);
 	atomic_set(&tctx->in_cancel, 0);
diff --git a/io_uring/timeout.c b/io_uring/timeout.c
index 18286cb53a69..5b12bd6a804c 100644
--- a/io_uring/timeout.c
+++ b/io_uring/timeout.c
@@ -300,16 +300,18 @@ static void io_req_task_link_timeout(struct io_kiocb *req, struct io_tw_state *t
 {
 	struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout);
 	struct io_kiocb *prev = timeout->prev;
-	int ret = -ENOENT;
+	int ret;
 
 	if (prev) {
-		if (!(req->task->flags & PF_EXITING)) {
+		if (!io_should_terminate_tw()) {
 			struct io_cancel_data cd = {
 				.ctx		= req->ctx,
 				.data		= prev->cqe.user_data,
 			};
 
-			ret = io_try_cancel(req->task->io_uring, &cd, 0);
+			ret = io_try_cancel(req->tctx, &cd, 0);
+		} else {
+			ret = -ECANCELED;
 		}
 		io_req_set_res(req, ret ?: -ETIME, 0);
 		io_req_task_complete(req, ts);
@@ -643,7 +645,7 @@ static bool io_match_task(struct io_kiocb *head, struct io_uring_task *tctx,
 {
 	struct io_kiocb *req;
 
-	if (tctx && head->task->io_uring != tctx)
+	if (tctx && head->tctx != tctx)
 		return false;
 	if (cancel_all)
 		return true;
diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c
index f88fbc9869d0..40b8b777ba12 100644
--- a/io_uring/uring_cmd.c
+++ b/io_uring/uring_cmd.c
@@ -61,7 +61,7 @@ bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx,
 				struct io_uring_cmd);
 		struct file *file = req->file;
 
-		if (!cancel_all && req->task->io_uring != tctx)
+		if (!cancel_all && req->tctx != tctx)
 			continue;
 
 		if (cmd->flags & IORING_URING_CMD_CANCELABLE) {
diff --git a/io_uring/waitid.c b/io_uring/waitid.c
index 9b7c23f96c47..daef5dd644f0 100644
--- a/io_uring/waitid.c
+++ b/io_uring/waitid.c
@@ -331,7 +331,7 @@ int io_waitid(struct io_kiocb *req, unsigned int issue_flags)
 	hlist_add_head(&req->hash_node, &ctx->waitid_list);
 
 	init_waitqueue_func_entry(&iwa->wo.child_wait, io_waitid_wait);
-	iwa->wo.child_wait.private = req->task;
+	iwa->wo.child_wait.private = req->tctx->task;
 	iw->head = &current->signal->wait_chldexit;
 	add_wait_queue(iw->head, &iwa->wo.child_wait);
 
-- 
cgit v1.2.3


From 6bf90bd8c58a305994948eb3409d91a7d8f2edae Mon Sep 17 00:00:00 2001
From: Olivier Langlois <olivier@trillion01.com>
Date: Sun, 13 Oct 2024 14:29:24 -0400
Subject: io_uring/napi: add static napi tracking strategy

Add the static napi tracking strategy. That allows the user to manually
manage the napi ids list for busy polling, and eliminate the overhead of
dynamically updating the list from the fast path.

Signed-off-by: Olivier Langlois <olivier@trillion01.com>
Link: https://lore.kernel.org/r/96943de14968c35a5c599352259ad98f3c0770ba.1728828877.git.olivier@trillion01.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h |  2 +-
 include/uapi/linux/io_uring.h  | 32 +++++++++++++-
 io_uring/fdinfo.c              | 54 +++++++++++++++++------
 io_uring/napi.c                | 97 +++++++++++++++++++++++++++++++++++++-----
 io_uring/napi.h                |  2 +-
 5 files changed, 160 insertions(+), 27 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index fba2988accc3..072e65e93105 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -408,7 +408,7 @@ struct io_ring_ctx {
 	/* napi busy poll default timeout */
 	ktime_t			napi_busy_poll_dt;
 	bool			napi_prefer_busy_poll;
-	bool			napi_enabled;
+	u8			napi_track_mode;
 
 	DECLARE_HASHTABLE(napi_ht, 4);
 #endif
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 47977a5c65f5..5d08435b95a8 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -790,12 +790,40 @@ struct io_uring_buf_status {
 	__u32	resv[8];
 };
 
+enum io_uring_napi_op {
+	/* register/ungister backward compatible opcode */
+	IO_URING_NAPI_REGISTER_OP = 0,
+
+	/* opcodes to update napi_list when static tracking is used */
+	IO_URING_NAPI_STATIC_ADD_ID = 1,
+	IO_URING_NAPI_STATIC_DEL_ID = 2
+};
+
+enum io_uring_napi_tracking_strategy {
+	/* value must be 0 for backward compatibility */
+	IO_URING_NAPI_TRACKING_DYNAMIC = 0,
+	IO_URING_NAPI_TRACKING_STATIC = 1,
+	IO_URING_NAPI_TRACKING_INACTIVE = 255
+};
+
 /* argument for IORING_(UN)REGISTER_NAPI */
 struct io_uring_napi {
 	__u32	busy_poll_to;
 	__u8	prefer_busy_poll;
-	__u8	pad[3];
-	__u64	resv;
+
+	/* a io_uring_napi_op value */
+	__u8	opcode;
+	__u8	pad[2];
+
+	/*
+	 * for IO_URING_NAPI_REGISTER_OP, it is a
+	 * io_uring_napi_tracking_strategy value.
+	 *
+	 * for IO_URING_NAPI_STATIC_ADD_ID/IO_URING_NAPI_STATIC_DEL_ID
+	 * it is the napi id to add/del from napi_list.
+	 */
+	__u32	op_param;
+	__u32	resv;
 };
 
 /*
diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
index efbec34ccb18..b214e5a407b5 100644
--- a/io_uring/fdinfo.c
+++ b/io_uring/fdinfo.c
@@ -46,6 +46,46 @@ static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id,
 	return 0;
 }
 
+#ifdef CONFIG_NET_RX_BUSY_POLL
+static __cold void common_tracking_show_fdinfo(struct io_ring_ctx *ctx,
+					       struct seq_file *m,
+					       const char *tracking_strategy)
+{
+	seq_puts(m, "NAPI:\tenabled\n");
+	seq_printf(m, "napi tracking:\t%s\n", tracking_strategy);
+	seq_printf(m, "napi_busy_poll_dt:\t%llu\n", ctx->napi_busy_poll_dt);
+	if (ctx->napi_prefer_busy_poll)
+		seq_puts(m, "napi_prefer_busy_poll:\ttrue\n");
+	else
+		seq_puts(m, "napi_prefer_busy_poll:\tfalse\n");
+}
+
+static __cold void napi_show_fdinfo(struct io_ring_ctx *ctx,
+				    struct seq_file *m)
+{
+	unsigned int mode = READ_ONCE(ctx->napi_track_mode);
+
+	switch (mode) {
+	case IO_URING_NAPI_TRACKING_INACTIVE:
+		seq_puts(m, "NAPI:\tdisabled\n");
+		break;
+	case IO_URING_NAPI_TRACKING_DYNAMIC:
+		common_tracking_show_fdinfo(ctx, m, "dynamic");
+		break;
+	case IO_URING_NAPI_TRACKING_STATIC:
+		common_tracking_show_fdinfo(ctx, m, "static");
+		break;
+	default:
+		seq_printf(m, "NAPI:\tunknown mode (%u)\n", mode);
+	}
+}
+#else
+static inline void napi_show_fdinfo(struct io_ring_ctx *ctx,
+				    struct seq_file *m)
+{
+}
+#endif
+
 /*
  * Caller holds a reference to the file already, we don't need to do
  * anything else to get an extra reference.
@@ -219,18 +259,6 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
 
 	}
 	spin_unlock(&ctx->completion_lock);
-
-#ifdef CONFIG_NET_RX_BUSY_POLL
-	if (ctx->napi_enabled) {
-		seq_puts(m, "NAPI:\tenabled\n");
-		seq_printf(m, "napi_busy_poll_dt:\t%llu\n", ctx->napi_busy_poll_dt);
-		if (ctx->napi_prefer_busy_poll)
-			seq_puts(m, "napi_prefer_busy_poll:\ttrue\n");
-		else
-			seq_puts(m, "napi_prefer_busy_poll:\tfalse\n");
-	} else {
-		seq_puts(m, "NAPI:\tdisabled\n");
-	}
-#endif
+	napi_show_fdinfo(ctx, m);
 }
 #endif
diff --git a/io_uring/napi.c b/io_uring/napi.c
index 1de1543d8034..b1ade3fda30f 100644
--- a/io_uring/napi.c
+++ b/io_uring/napi.c
@@ -81,6 +81,27 @@ int __io_napi_add_id(struct io_ring_ctx *ctx, unsigned int napi_id)
 	return 0;
 }
 
+static int __io_napi_del_id(struct io_ring_ctx *ctx, unsigned int napi_id)
+{
+	struct hlist_head *hash_list;
+	struct io_napi_entry *e;
+
+	/* Non-NAPI IDs can be rejected. */
+	if (napi_id < MIN_NAPI_ID)
+		return -EINVAL;
+
+	hash_list = &ctx->napi_ht[hash_min(napi_id, HASH_BITS(ctx->napi_ht))];
+	guard(spinlock)(&ctx->napi_lock);
+	e = io_napi_hash_find(hash_list, napi_id);
+	if (!e)
+		return -ENOENT;
+
+	list_del_rcu(&e->list);
+	hash_del_rcu(&e->node);
+	kfree_rcu(e, rcu);
+	return 0;
+}
+
 static void __io_napi_remove_stale(struct io_ring_ctx *ctx)
 {
 	struct io_napi_entry *e;
@@ -136,9 +157,25 @@ static bool io_napi_busy_loop_should_end(void *data,
 	return false;
 }
 
-static bool __io_napi_do_busy_loop(struct io_ring_ctx *ctx,
-				   bool (*loop_end)(void *, unsigned long),
-				   void *loop_end_arg)
+/*
+ * never report stale entries
+ */
+static bool static_tracking_do_busy_loop(struct io_ring_ctx *ctx,
+					 bool (*loop_end)(void *, unsigned long),
+					 void *loop_end_arg)
+{
+	struct io_napi_entry *e;
+
+	list_for_each_entry_rcu(e, &ctx->napi_list, list)
+		napi_busy_loop_rcu(e->napi_id, loop_end, loop_end_arg,
+				   ctx->napi_prefer_busy_poll, BUSY_POLL_BUDGET);
+	return false;
+}
+
+static bool
+dynamic_tracking_do_busy_loop(struct io_ring_ctx *ctx,
+			      bool (*loop_end)(void *, unsigned long),
+			      void *loop_end_arg)
 {
 	struct io_napi_entry *e;
 	bool is_stale = false;
@@ -154,6 +191,16 @@ static bool __io_napi_do_busy_loop(struct io_ring_ctx *ctx,
 	return is_stale;
 }
 
+static inline bool
+__io_napi_do_busy_loop(struct io_ring_ctx *ctx,
+		       bool (*loop_end)(void *, unsigned long),
+		       void *loop_end_arg)
+{
+	if (READ_ONCE(ctx->napi_track_mode) == IO_URING_NAPI_TRACKING_STATIC)
+		return static_tracking_do_busy_loop(ctx, loop_end, loop_end_arg);
+	return dynamic_tracking_do_busy_loop(ctx, loop_end, loop_end_arg);
+}
+
 static void io_napi_blocking_busy_loop(struct io_ring_ctx *ctx,
 				       struct io_wait_queue *iowq)
 {
@@ -195,6 +242,7 @@ void io_napi_init(struct io_ring_ctx *ctx)
 	spin_lock_init(&ctx->napi_lock);
 	ctx->napi_prefer_busy_poll = false;
 	ctx->napi_busy_poll_dt = ns_to_ktime(sys_dt);
+	ctx->napi_track_mode = IO_URING_NAPI_TRACKING_INACTIVE;
 }
 
 /*
@@ -215,6 +263,24 @@ void io_napi_free(struct io_ring_ctx *ctx)
 	INIT_LIST_HEAD_RCU(&ctx->napi_list);
 }
 
+static int io_napi_register_napi(struct io_ring_ctx *ctx,
+				 struct io_uring_napi *napi)
+{
+	switch (napi->op_param) {
+	case IO_URING_NAPI_TRACKING_DYNAMIC:
+	case IO_URING_NAPI_TRACKING_STATIC:
+		break;
+	default:
+		return -EINVAL;
+	}
+	/* clean the napi list for new settings */
+	io_napi_free(ctx);
+	WRITE_ONCE(ctx->napi_track_mode, napi->op_param);
+	WRITE_ONCE(ctx->napi_busy_poll_dt, napi->busy_poll_to * NSEC_PER_USEC);
+	WRITE_ONCE(ctx->napi_prefer_busy_poll, !!napi->prefer_busy_poll);
+	return 0;
+}
+
 /*
  * io_napi_register() - Register napi with io-uring
  * @ctx: pointer to io-uring context structure
@@ -226,7 +292,8 @@ int io_register_napi(struct io_ring_ctx *ctx, void __user *arg)
 {
 	const struct io_uring_napi curr = {
 		.busy_poll_to 	  = ktime_to_us(ctx->napi_busy_poll_dt),
-		.prefer_busy_poll = ctx->napi_prefer_busy_poll
+		.prefer_busy_poll = ctx->napi_prefer_busy_poll,
+		.op_param	  = ctx->napi_track_mode
 	};
 	struct io_uring_napi napi;
 
@@ -234,16 +301,26 @@ int io_register_napi(struct io_ring_ctx *ctx, void __user *arg)
 		return -EINVAL;
 	if (copy_from_user(&napi, arg, sizeof(napi)))
 		return -EFAULT;
-	if (napi.pad[0] || napi.pad[1] || napi.pad[2] || napi.resv)
+	if (napi.pad[0] || napi.pad[1] || napi.resv)
 		return -EINVAL;
 
 	if (copy_to_user(arg, &curr, sizeof(curr)))
 		return -EFAULT;
 
-	WRITE_ONCE(ctx->napi_busy_poll_dt, napi.busy_poll_to * NSEC_PER_USEC);
-	WRITE_ONCE(ctx->napi_prefer_busy_poll, !!napi.prefer_busy_poll);
-	WRITE_ONCE(ctx->napi_enabled, true);
-	return 0;
+	switch (napi.opcode) {
+	case IO_URING_NAPI_REGISTER_OP:
+		return io_napi_register_napi(ctx, &napi);
+	case IO_URING_NAPI_STATIC_ADD_ID:
+		if (curr.op_param != IO_URING_NAPI_TRACKING_STATIC)
+			return -EINVAL;
+		return __io_napi_add_id(ctx, napi.op_param);
+	case IO_URING_NAPI_STATIC_DEL_ID:
+		if (curr.op_param != IO_URING_NAPI_TRACKING_STATIC)
+			return -EINVAL;
+		return __io_napi_del_id(ctx, napi.op_param);
+	default:
+		return -EINVAL;
+	}
 }
 
 /*
@@ -266,7 +343,7 @@ int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg)
 
 	WRITE_ONCE(ctx->napi_busy_poll_dt, 0);
 	WRITE_ONCE(ctx->napi_prefer_busy_poll, false);
-	WRITE_ONCE(ctx->napi_enabled, false);
+	WRITE_ONCE(ctx->napi_track_mode, IO_URING_NAPI_TRACKING_INACTIVE);
 	return 0;
 }
 
diff --git a/io_uring/napi.h b/io_uring/napi.h
index 4ae622f37b30..fa742f42e09b 100644
--- a/io_uring/napi.h
+++ b/io_uring/napi.h
@@ -44,7 +44,7 @@ static inline void io_napi_add(struct io_kiocb *req)
 	struct io_ring_ctx *ctx = req->ctx;
 	struct socket *sock;
 
-	if (!READ_ONCE(ctx->napi_enabled))
+	if (READ_ONCE(ctx->napi_track_mode) != IO_URING_NAPI_TRACKING_DYNAMIC)
 		return;
 
 	sock = sock_from_file(req->file);
-- 
cgit v1.2.3


From 83e041522eb9c45479f4490b212687cf1e7e9999 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Fri, 15 Nov 2024 16:54:40 +0000
Subject: io_uring: temporarily disable registered waits

Disable wait argument registration as it'll be replaced with a more
generic feature. We'll still need IORING_ENTER_EXT_ARG_REG parsing
in a few commits so leave it be.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/70b1d1d218c41ba77a76d1789c8641dab0b0563e.1731689588.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h | 10 ------
 include/uapi/linux/io_uring.h  |  3 --
 io_uring/io_uring.c            | 10 ------
 io_uring/register.c            | 82 ------------------------------------------
 io_uring/register.h            |  1 -
 5 files changed, 106 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 072e65e93105..52a5da99a205 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -330,14 +330,6 @@ struct io_ring_ctx {
 		atomic_t		cq_wait_nr;
 		atomic_t		cq_timeouts;
 		struct wait_queue_head	cq_wait;
-
-		/*
-		 * If registered with IORING_REGISTER_CQWAIT_REG, a single
-		 * page holds N entries, mapped in cq_wait_arg. cq_wait_index
-		 * is the maximum allowable index.
-		 */
-		struct io_uring_reg_wait	*cq_wait_arg;
-		unsigned char			cq_wait_index;
 	} ____cacheline_aligned_in_smp;
 
 	/* timeouts */
@@ -431,8 +423,6 @@ struct io_ring_ctx {
 	unsigned short			n_sqe_pages;
 	struct page			**ring_pages;
 	struct page			**sqe_pages;
-
-	struct page			**cq_wait_page;
 };
 
 struct io_tw_state {
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 5d08435b95a8..132f5db3d4e8 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -627,9 +627,6 @@ enum io_uring_register_op {
 	/* resize CQ ring */
 	IORING_REGISTER_RESIZE_RINGS		= 33,
 
-	/* register fixed io_uring_reg_wait arguments */
-	IORING_REGISTER_CQWAIT_REG		= 34,
-
 	/* this goes last */
 	IORING_REGISTER_LAST,
 
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 464a70bde7e6..286b7bb73978 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2709,7 +2709,6 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 	io_alloc_cache_free(&ctx->msg_cache, io_msg_cache_free);
 	io_futex_cache_free(ctx);
 	io_destroy_buffers(ctx);
-	io_unregister_cqwait_reg(ctx);
 	mutex_unlock(&ctx->uring_lock);
 	if (ctx->sq_creds)
 		put_cred(ctx->sq_creds);
@@ -3195,15 +3194,6 @@ void __io_uring_cancel(bool cancel_all)
 static struct io_uring_reg_wait *io_get_ext_arg_reg(struct io_ring_ctx *ctx,
 			const struct io_uring_getevents_arg __user *uarg)
 {
-	struct io_uring_reg_wait *arg = READ_ONCE(ctx->cq_wait_arg);
-
-	if (arg) {
-		unsigned int index = (unsigned int) (uintptr_t) uarg;
-
-		if (index <= ctx->cq_wait_index)
-			return arg + index;
-	}
-
 	return ERR_PTR(-EFAULT);
 }
 
diff --git a/io_uring/register.c b/io_uring/register.c
index 45edfc57963a..3c5a3cfb186b 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -570,82 +570,6 @@ out:
 	return ret;
 }
 
-void io_unregister_cqwait_reg(struct io_ring_ctx *ctx)
-{
-	unsigned short npages = 1;
-
-	if (!ctx->cq_wait_page)
-		return;
-
-	io_pages_unmap(ctx->cq_wait_arg, &ctx->cq_wait_page, &npages, true);
-	ctx->cq_wait_arg = NULL;
-	if (ctx->user)
-		__io_unaccount_mem(ctx->user, 1);
-}
-
-/*
- * Register a page holding N entries of struct io_uring_reg_wait, which can
- * be used via io_uring_enter(2) if IORING_GETEVENTS_EXT_ARG_REG is set.
- * If that is set with IORING_GETEVENTS_EXT_ARG, then instead of passing
- * in a pointer for a struct io_uring_getevents_arg, an index into this
- * registered array is passed, avoiding two (arg + timeout) copies per
- * invocation.
- */
-static int io_register_cqwait_reg(struct io_ring_ctx *ctx, void __user *uarg)
-{
-	struct io_uring_cqwait_reg_arg arg;
-	struct io_uring_reg_wait *reg;
-	struct page **pages;
-	unsigned long len;
-	int nr_pages, poff;
-	int ret;
-
-	if (ctx->cq_wait_page || ctx->cq_wait_arg)
-		return -EBUSY;
-	if (copy_from_user(&arg, uarg, sizeof(arg)))
-		return -EFAULT;
-	if (!arg.nr_entries || arg.flags)
-		return -EINVAL;
-	if (arg.struct_size != sizeof(*reg))
-		return -EINVAL;
-	if (check_mul_overflow(arg.struct_size, arg.nr_entries, &len))
-		return -EOVERFLOW;
-	if (len > PAGE_SIZE)
-		return -EINVAL;
-	/* offset + len must fit within a page, and must be reg_wait aligned */
-	poff = arg.user_addr & ~PAGE_MASK;
-	if (len + poff > PAGE_SIZE)
-		return -EINVAL;
-	if (poff % arg.struct_size)
-		return -EINVAL;
-
-	pages = io_pin_pages(arg.user_addr, len, &nr_pages);
-	if (IS_ERR(pages))
-		return PTR_ERR(pages);
-	ret = -EINVAL;
-	if (nr_pages != 1)
-		goto out_free;
-	if (ctx->user) {
-		ret = __io_account_mem(ctx->user, 1);
-		if (ret)
-			goto out_free;
-	}
-
-	reg = vmap(pages, 1, VM_MAP, PAGE_KERNEL);
-	if (reg) {
-		ctx->cq_wait_index = arg.nr_entries - 1;
-		WRITE_ONCE(ctx->cq_wait_page, pages);
-		WRITE_ONCE(ctx->cq_wait_arg, (void *) reg + poff);
-		return 0;
-	}
-	ret = -ENOMEM;
-	if (ctx->user)
-		__io_unaccount_mem(ctx->user, 1);
-out_free:
-	io_pages_free(&pages, nr_pages);
-	return ret;
-}
-
 static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			       void __user *arg, unsigned nr_args)
 	__releases(ctx->uring_lock)
@@ -840,12 +764,6 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			break;
 		ret = io_register_resize_rings(ctx, arg);
 		break;
-	case IORING_REGISTER_CQWAIT_REG:
-		ret = -EINVAL;
-		if (!arg || nr_args != 1)
-			break;
-		ret = io_register_cqwait_reg(ctx, arg);
-		break;
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/io_uring/register.h b/io_uring/register.h
index 3e935e8fa4b2..a5f39d5ef9e0 100644
--- a/io_uring/register.h
+++ b/io_uring/register.h
@@ -5,6 +5,5 @@
 int io_eventfd_unregister(struct io_ring_ctx *ctx);
 int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id);
 struct file *io_uring_register_get_file(unsigned int fd, bool registered);
-void io_unregister_cqwait_reg(struct io_ring_ctx *ctx);
 
 #endif
-- 
cgit v1.2.3


From dfbbfbf191878e8dd422768ce009858d8b5b761e Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Fri, 15 Nov 2024 16:54:41 +0000
Subject: io_uring: introduce concept of memory regions

We've got a good number of mappings we share with the userspace, that
includes the main rings, provided buffer rings, upcoming rings for
zerocopy rx and more. All of them duplicate user argument parsing and
some internal details as well (page pinnning, huge page optimisations,
mmap'ing, etc.)

Introduce a notion of regions. For userspace for now it's just a new
structure called struct io_uring_region_desc which is supposed to
parameterise all such mapping / queue creations. A region either
represents a user provided chunk of memory, in which case the user_addr
field should point to it, or a request for the kernel to allocate the
memory, in which case the user would need to mmap it after using the
offset returned in the mmap_offset field. With a uniform userspace API
we can avoid additional boiler plate code and apply future optimisation
to all of them at once.

Internally, there is a new structure struct io_mapped_region holding all
relevant runtime information and some helpers to work with it. This
patch limits it to user provided regions.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/0e6fe25818dfbaebd1bd90b870a6cac503fe1a24.1731689588.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h |  6 ++++
 include/uapi/linux/io_uring.h  | 14 +++++++++
 io_uring/memmap.c              | 67 ++++++++++++++++++++++++++++++++++++++++++
 io_uring/memmap.h              | 14 +++++++++
 4 files changed, 101 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 52a5da99a205..1d3a37234ace 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -75,6 +75,12 @@ struct io_hash_table {
 	unsigned		hash_bits;
 };
 
+struct io_mapped_region {
+	struct page		**pages;
+	void			*vmap_ptr;
+	size_t			nr_pages;
+};
+
 /*
  * Arbitrary limit, can be raised if need be
  */
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 132f5db3d4e8..5cbfd330c688 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -647,6 +647,20 @@ struct io_uring_files_update {
 	__aligned_u64 /* __s32 * */ fds;
 };
 
+enum {
+	/* initialise with user provided memory pointed by user_addr */
+	IORING_MEM_REGION_TYPE_USER		= 1,
+};
+
+struct io_uring_region_desc {
+	__u64 user_addr;
+	__u64 size;
+	__u32 flags;
+	__u32 id;
+	__u64 mmap_offset;
+	__u64 __resv[4];
+};
+
 /*
  * Register a fully sparse file space, rather than pass in an array of all
  * -1 file descriptors.
diff --git a/io_uring/memmap.c b/io_uring/memmap.c
index 6ab59c60dfd0..bbd9569a0120 100644
--- a/io_uring/memmap.c
+++ b/io_uring/memmap.c
@@ -12,6 +12,7 @@
 
 #include "memmap.h"
 #include "kbuf.h"
+#include "rsrc.h"
 
 static void *io_mem_alloc_compound(struct page **pages, int nr_pages,
 				   size_t size, gfp_t gfp)
@@ -194,6 +195,72 @@ void *__io_uaddr_map(struct page ***pages, unsigned short *npages,
 	return ERR_PTR(-ENOMEM);
 }
 
+void io_free_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr)
+{
+	if (mr->pages) {
+		unpin_user_pages(mr->pages, mr->nr_pages);
+		kvfree(mr->pages);
+	}
+	if (mr->vmap_ptr)
+		vunmap(mr->vmap_ptr);
+	if (mr->nr_pages && ctx->user)
+		__io_unaccount_mem(ctx->user, mr->nr_pages);
+
+	memset(mr, 0, sizeof(*mr));
+}
+
+int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr,
+		     struct io_uring_region_desc *reg)
+{
+	int pages_accounted = 0;
+	struct page **pages;
+	int nr_pages, ret;
+	void *vptr;
+	u64 end;
+
+	if (WARN_ON_ONCE(mr->pages || mr->vmap_ptr || mr->nr_pages))
+		return -EFAULT;
+	if (memchr_inv(&reg->__resv, 0, sizeof(reg->__resv)))
+		return -EINVAL;
+	if (reg->flags != IORING_MEM_REGION_TYPE_USER)
+		return -EINVAL;
+	if (!reg->user_addr)
+		return -EFAULT;
+	if (!reg->size || reg->mmap_offset || reg->id)
+		return -EINVAL;
+	if ((reg->size >> PAGE_SHIFT) > INT_MAX)
+		return E2BIG;
+	if ((reg->user_addr | reg->size) & ~PAGE_MASK)
+		return -EINVAL;
+	if (check_add_overflow(reg->user_addr, reg->size, &end))
+		return -EOVERFLOW;
+
+	pages = io_pin_pages(reg->user_addr, reg->size, &nr_pages);
+	if (IS_ERR(pages))
+		return PTR_ERR(pages);
+
+	if (ctx->user) {
+		ret = __io_account_mem(ctx->user, nr_pages);
+		if (ret)
+			goto out_free;
+		pages_accounted = nr_pages;
+	}
+
+	vptr = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
+	if (!vptr)
+		goto out_free;
+
+	mr->pages = pages;
+	mr->vmap_ptr = vptr;
+	mr->nr_pages = nr_pages;
+	return 0;
+out_free:
+	if (pages_accounted)
+		__io_unaccount_mem(ctx->user, pages_accounted);
+	io_pages_free(&pages, nr_pages);
+	return ret;
+}
+
 static void *io_uring_validate_mmap_request(struct file *file, loff_t pgoff,
 					    size_t sz)
 {
diff --git a/io_uring/memmap.h b/io_uring/memmap.h
index 5cec5b7ac49a..f361a635b6c7 100644
--- a/io_uring/memmap.h
+++ b/io_uring/memmap.h
@@ -22,4 +22,18 @@ unsigned long io_uring_get_unmapped_area(struct file *file, unsigned long addr,
 					 unsigned long flags);
 int io_uring_mmap(struct file *file, struct vm_area_struct *vma);
 
+void io_free_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr);
+int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr,
+		     struct io_uring_region_desc *reg);
+
+static inline void *io_region_get_ptr(struct io_mapped_region *mr)
+{
+	return mr->vmap_ptr;
+}
+
+static inline bool io_region_is_set(struct io_mapped_region *mr)
+{
+	return !!mr->nr_pages;
+}
+
 #endif
-- 
cgit v1.2.3


From 93238e66185524aad925acefb2312203b9e26d63 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Fri, 15 Nov 2024 16:54:42 +0000
Subject: io_uring: add memory region registration

Regions will serve multiple purposes. First, with it we can decouple
ring/etc. object creation from registration / mapping of the memory they
will be placed in. We already have hacks that allow to put both SQ and
CQ into the same huge page, in the future we should be able to:

region = create_region(io_ring);
create_pbuf_ring(io_uring, region, offset=0);
create_pbuf_ring(io_uring, region, offset=N);

The second use case is efficiently passing parameters. The following
patch enables back on top of regions IORING_ENTER_EXT_ARG_REG, which
optimises wait arguments. It'll also be useful for request arguments
replacing iovecs, msghdr, etc. pointers. Eventually it would also be
handy for BPF as well if it comes to fruition.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/0798cf3a14fad19cfc96fc9feca5f3e11481691d.1731689588.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h |  3 +++
 include/uapi/linux/io_uring.h  |  8 ++++++++
 io_uring/io_uring.c            |  1 +
 io_uring/register.c            | 37 +++++++++++++++++++++++++++++++++++++
 4 files changed, 49 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 1d3a37234ace..e1d69123e164 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -429,6 +429,9 @@ struct io_ring_ctx {
 	unsigned short			n_sqe_pages;
 	struct page			**ring_pages;
 	struct page			**sqe_pages;
+
+	/* used for optimised request parameter and wait argument passing  */
+	struct io_mapped_region		param_region;
 };
 
 struct io_tw_state {
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 5cbfd330c688..1ee35890125b 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -627,6 +627,8 @@ enum io_uring_register_op {
 	/* resize CQ ring */
 	IORING_REGISTER_RESIZE_RINGS		= 33,
 
+	IORING_REGISTER_MEM_REGION		= 34,
+
 	/* this goes last */
 	IORING_REGISTER_LAST,
 
@@ -661,6 +663,12 @@ struct io_uring_region_desc {
 	__u64 __resv[4];
 };
 
+struct io_uring_mem_region_reg {
+	__u64 region_uptr; /* struct io_uring_region_desc * */
+	__u64 flags;
+	__u64 __resv[2];
+};
+
 /*
  * Register a fully sparse file space, rather than pass in an array of all
  * -1 file descriptors.
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 286b7bb73978..c640b8a4ceee 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2709,6 +2709,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
 	io_alloc_cache_free(&ctx->msg_cache, io_msg_cache_free);
 	io_futex_cache_free(ctx);
 	io_destroy_buffers(ctx);
+	io_free_region(ctx, &ctx->param_region);
 	mutex_unlock(&ctx->uring_lock);
 	if (ctx->sq_creds)
 		put_cred(ctx->sq_creds);
diff --git a/io_uring/register.c b/io_uring/register.c
index 3c5a3cfb186b..2cbac3d9b288 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -570,6 +570,37 @@ out:
 	return ret;
 }
 
+static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg)
+{
+	struct io_uring_mem_region_reg __user *reg_uptr = uarg;
+	struct io_uring_mem_region_reg reg;
+	struct io_uring_region_desc __user *rd_uptr;
+	struct io_uring_region_desc rd;
+	int ret;
+
+	if (io_region_is_set(&ctx->param_region))
+		return -EBUSY;
+	if (copy_from_user(&reg, reg_uptr, sizeof(reg)))
+		return -EFAULT;
+	rd_uptr = u64_to_user_ptr(reg.region_uptr);
+	if (copy_from_user(&rd, rd_uptr, sizeof(rd)))
+		return -EFAULT;
+
+	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
+		return -EINVAL;
+	if (reg.flags)
+		return -EINVAL;
+
+	ret = io_create_region(ctx, &ctx->param_region, &rd);
+	if (ret)
+		return ret;
+	if (copy_to_user(rd_uptr, &rd, sizeof(rd))) {
+		io_free_region(ctx, &ctx->param_region);
+		return -EFAULT;
+	}
+	return 0;
+}
+
 static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			       void __user *arg, unsigned nr_args)
 	__releases(ctx->uring_lock)
@@ -764,6 +795,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			break;
 		ret = io_register_resize_rings(ctx, arg);
 		break;
+	case IORING_REGISTER_MEM_REGION:
+		ret = -EINVAL;
+		if (!arg || nr_args != 1)
+			break;
+		ret = io_register_mem_region(ctx, arg);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
-- 
cgit v1.2.3


From d617b3147d54c42351eac63b5398d4ddf4f4011b Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Fri, 15 Nov 2024 16:54:43 +0000
Subject: io_uring: restore back registered wait arguments

Now we've got a more generic region registration API, place
IORING_ENTER_EXT_ARG_REG and re-enable it.

First, the user has to register a region with the
IORING_MEM_REGION_REG_WAIT_ARG flag set. It can only be done for a
ring in a disabled state, aka IORING_SETUP_R_DISABLED, to avoid races
with already running waiters. With that we should have stable constant
values for ctx->cq_wait_{size,arg} in io_get_ext_arg_reg() and hence no
READ_ONCE required.

The other API difference is that we're now passing byte offsets instead
of indexes. The user _must_ align all offsets / pointers to the native
word size, failing to do so might but not necessarily has to lead to a
failure usually returned as -EFAULT. liburing will be hiding this
details from users.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/81822c1b4ffbe8ad391b4f9ad1564def0d26d990.1731689588.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/io_uring_types.h |  3 +++
 include/uapi/linux/io_uring.h  |  5 +++++
 io_uring/io_uring.c            | 14 +++++++++++++-
 io_uring/register.c            | 16 +++++++++++++++-
 4 files changed, 36 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index e1d69123e164..aa5f5ea98076 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -324,6 +324,9 @@ struct io_ring_ctx {
 		unsigned		cq_entries;
 		struct io_ev_fd	__rcu	*io_ev_fd;
 		unsigned		cq_extra;
+
+		void			*cq_wait_arg;
+		size_t			cq_wait_size;
 	} ____cacheline_aligned_in_smp;
 
 	/*
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 1ee35890125b..4418d0192959 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -663,6 +663,11 @@ struct io_uring_region_desc {
 	__u64 __resv[4];
 };
 
+enum {
+	/* expose the region as registered wait arguments */
+	IORING_MEM_REGION_REG_WAIT_ARG		= 1,
+};
+
 struct io_uring_mem_region_reg {
 	__u64 region_uptr; /* struct io_uring_region_desc * */
 	__u64 flags;
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index c640b8a4ceee..da8fd460977b 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -3195,7 +3195,19 @@ void __io_uring_cancel(bool cancel_all)
 static struct io_uring_reg_wait *io_get_ext_arg_reg(struct io_ring_ctx *ctx,
 			const struct io_uring_getevents_arg __user *uarg)
 {
-	return ERR_PTR(-EFAULT);
+	unsigned long size = sizeof(struct io_uring_reg_wait);
+	unsigned long offset = (uintptr_t)uarg;
+	unsigned long end;
+
+	if (unlikely(offset % sizeof(long)))
+		return ERR_PTR(-EFAULT);
+
+	/* also protects from NULL ->cq_wait_arg as the size would be 0 */
+	if (unlikely(check_add_overflow(offset, size, &end) ||
+		     end > ctx->cq_wait_size))
+		return ERR_PTR(-EFAULT);
+
+	return ctx->cq_wait_arg + offset;
 }
 
 static int io_validate_ext_arg(struct io_ring_ctx *ctx, unsigned flags,
diff --git a/io_uring/register.c b/io_uring/register.c
index 2cbac3d9b288..1a60f4916649 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -588,7 +588,16 @@ static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg)
 
 	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
 		return -EINVAL;
-	if (reg.flags)
+	if (reg.flags & ~IORING_MEM_REGION_REG_WAIT_ARG)
+		return -EINVAL;
+
+	/*
+	 * This ensures there are no waiters. Waiters are unlocked and it's
+	 * hard to synchronise with them, especially if we need to initialise
+	 * the region.
+	 */
+	if ((reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) &&
+	    !(ctx->flags & IORING_SETUP_R_DISABLED))
 		return -EINVAL;
 
 	ret = io_create_region(ctx, &ctx->param_region, &rd);
@@ -598,6 +607,11 @@ static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg)
 		io_free_region(ctx, &ctx->param_region);
 		return -EFAULT;
 	}
+
+	if (reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) {
+		ctx->cq_wait_arg = io_region_get_ptr(&ctx->param_region);
+		ctx->cq_wait_size = rd.size;
+	}
 	return 0;
 }
 
-- 
cgit v1.2.3