From caec5ebe77f97d948dcf46f07d622bda7f1f6dfd Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 9 May 2023 09:19:10 -0600 Subject: io_uring: rely solely on FMODE_NOWAIT Now that we have both sockets and block devices setting FMODE_NOWAIT appropriately, we can get rid of all the odd special casing in __io_file_supports_nowait() and rely solely on FMODE_NOWAIT and O_NONBLOCK rather than special-casing sockets and (in particular) bdevs. Link: https://lore.kernel.org/r/20230509151910.183637-4-axboe@kernel.dk Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 21 --------------------- 1 file changed, 21 deletions(-) (limited to 'io_uring/io_uring.c') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 3bca7a79efda..7c426584e35a 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1758,11 +1758,6 @@ static void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags) } } -static bool io_bdev_nowait(struct block_device *bdev) -{ - return !bdev || bdev_nowait(bdev); -} - /* * If we tracked the file through the SCM inflight mechanism, we could support * any file. For now, just ensure that anything potentially problematic is done @@ -1770,22 +1765,6 @@ static bool io_bdev_nowait(struct block_device *bdev) */ static bool __io_file_supports_nowait(struct file *file, umode_t mode) { - if (S_ISBLK(mode)) { - if (IS_ENABLED(CONFIG_BLOCK) && - io_bdev_nowait(I_BDEV(file->f_mapping->host))) - return true; - return false; - } - if (S_ISSOCK(mode)) - return true; - if (S_ISREG(mode)) { - if (IS_ENABLED(CONFIG_BLOCK) && - io_bdev_nowait(file->f_inode->i_sb->s_bdev) && - !io_is_uring_fops(file)) - return true; - return false; - } - /* any ->read/write should understand O_NONBLOCK */ if (file->f_flags & O_NONBLOCK) return true; -- cgit v1.2.3 From 9b1b58cacc65ecee29bd85988c9ff957a84b43f4 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 5 Nov 2021 17:11:34 -0600 Subject: io_uring: remove sq/cq_off memset We only have two reserved members we're not clearing; do so manually instead. This is in preparation for using one of these members for a new feature.
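For context (not part of this patch), the sq_off/cq_off members filled in here are the offsets an application uses to locate the ring pointers after io_uring_setup(2). The sketch below follows the documented mmap-based setup; the struct and function names (struct sq_ring, map_sq_ring) are illustrative only, and error handling is omitted.

#include <linux/io_uring.h>
#include <sys/mman.h>

struct sq_ring {
	unsigned *head, *tail, *ring_mask, *ring_entries, *flags, *array;
};

static void map_sq_ring(int ring_fd, struct io_uring_params *p, struct sq_ring *sq)
{
	size_t sz = p->sq_off.array + p->sq_entries * sizeof(unsigned);
	char *ptr = mmap(NULL, sz, PROT_READ | PROT_WRITE,
			 MAP_SHARED | MAP_POPULATE, ring_fd, IORING_OFF_SQ_RING);

	/* every pointer is simply "ring base + offset from sq_off" */
	sq->head	 = (unsigned *)(ptr + p->sq_off.head);
	sq->tail	 = (unsigned *)(ptr + p->sq_off.tail);
	sq->ring_mask	 = (unsigned *)(ptr + p->sq_off.ring_mask);
	sq->ring_entries = (unsigned *)(ptr + p->sq_off.ring_entries);
	sq->flags	 = (unsigned *)(ptr + p->sq_off.flags);
	sq->array	 = (unsigned *)(ptr + p->sq_off.array);
}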
Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'io_uring/io_uring.c') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 7c426584e35a..13a7fcec1df8 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3866,7 +3866,6 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, if (ret) goto err; - memset(&p->sq_off, 0, sizeof(p->sq_off)); p->sq_off.head = offsetof(struct io_rings, sq.head); p->sq_off.tail = offsetof(struct io_rings, sq.tail); p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask); @@ -3874,8 +3873,9 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, p->sq_off.flags = offsetof(struct io_rings, sq_flags); p->sq_off.dropped = offsetof(struct io_rings, sq_dropped); p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings; + p->sq_off.resv1 = 0; + p->sq_off.resv2 = 0; - memset(&p->cq_off, 0, sizeof(p->cq_off)); p->cq_off.head = offsetof(struct io_rings, cq.head); p->cq_off.tail = offsetof(struct io_rings, cq.tail); p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask); @@ -3883,6 +3883,8 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, p->cq_off.overflow = offsetof(struct io_rings, cq_overflow); p->cq_off.cqes = offsetof(struct io_rings, cqes); p->cq_off.flags = offsetof(struct io_rings, cq_flags); + p->cq_off.resv1 = 0; + p->cq_off.resv2 = 0; p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS | -- cgit v1.2.3 From e27cef86a0edd4ef7f8b4670f508a03b509cbbb2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 5 Nov 2021 17:13:52 -0600 Subject: io_uring: return error pointer from io_mem_alloc() In preparation for having more than one type of ring allocator, make the existing one return a valid pointer or an error pointer rather than just NULL.
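The convention referred to here is the standard kernel ERR_PTR/IS_ERR/PTR_ERR pattern from <linux/err.h>. A minimal sketch of what a caller of the converted io_mem_alloc() looks like; example_alloc_rings is a made-up wrapper, not a function from the patch.

#include <linux/err.h>

static int example_alloc_rings(struct io_ring_ctx *ctx, size_t size)
{
	void *ptr = io_mem_alloc(size);

	if (IS_ERR(ptr))		/* encoded errno, e.g. ERR_PTR(-ENOMEM) */
		return PTR_ERR(ptr);	/* decode back to a plain -errno */
	ctx->rings = ptr;
	return 0;
}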
Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'io_uring/io_uring.c') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 13a7fcec1df8..aa4759e90a45 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2691,8 +2691,12 @@ static void io_mem_free(void *ptr) static void *io_mem_alloc(size_t size) { gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP; + void *ret; - return (void *) __get_free_pages(gfp, get_order(size)); + ret = (void *) __get_free_pages(gfp, get_order(size)); + if (ret) + return ret; + return ERR_PTR(-ENOMEM); } static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries, @@ -3652,6 +3656,7 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, { struct io_rings *rings; size_t size, sq_array_offset; + void *ptr; /* make sure these are sane, as we already accounted them */ ctx->sq_entries = p->sq_entries; @@ -3662,8 +3667,8 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, return -EOVERFLOW; rings = io_mem_alloc(size); - if (!rings) - return -ENOMEM; + if (IS_ERR(rings)) + return PTR_ERR(rings); ctx->rings = rings; ctx->sq_array = (u32 *)((char *)rings + sq_array_offset); @@ -3682,13 +3687,14 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, return -EOVERFLOW; } - ctx->sq_sqes = io_mem_alloc(size); - if (!ctx->sq_sqes) { + ptr = io_mem_alloc(size); + if (IS_ERR(ptr)) { io_mem_free(ctx->rings); ctx->rings = NULL; - return -ENOMEM; + return PTR_ERR(ptr); } + ctx->sq_sqes = ptr; return 0; } -- cgit v1.2.3 From 9c189eee73af1825ea9c895fafad469de5f82641 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 5 Nov 2021 17:15:46 -0600 Subject: io_uring: add ring freeing helper We do rings and sqes separately, move them into a helper that does both the freeing and clearing of the memory. 
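Folding "free it and clear the pointer" into one helper is what lets later error and teardown paths call it unconditionally without risking a double free. A generic sketch of the idiom, with made-up names, not code from the patch:

static void example_buffers_free(struct example_ctx *ctx)
{
	kfree(ctx->buf_a);
	kfree(ctx->buf_b);
	/* clearing the pointers makes the helper safe to call again,
	 * since kfree(NULL) is a no-op */
	ctx->buf_a = NULL;
	ctx->buf_b = NULL;
}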
Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'io_uring/io_uring.c') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index aa4759e90a45..74433939a318 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2688,6 +2688,14 @@ static void io_mem_free(void *ptr) free_compound_page(page); } +static void io_rings_free(struct io_ring_ctx *ctx) +{ + io_mem_free(ctx->rings); + io_mem_free(ctx->sq_sqes); + ctx->rings = NULL; + ctx->sq_sqes = NULL; +} + static void *io_mem_alloc(size_t size) { gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP; @@ -2852,8 +2860,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) mmdrop(ctx->mm_account); ctx->mm_account = NULL; } - io_mem_free(ctx->rings); - io_mem_free(ctx->sq_sqes); + io_rings_free(ctx); percpu_ref_exit(&ctx->refs); free_uid(ctx->user); @@ -3682,15 +3689,13 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, else size = array_size(sizeof(struct io_uring_sqe), p->sq_entries); if (size == SIZE_MAX) { - io_mem_free(ctx->rings); - ctx->rings = NULL; + io_rings_free(ctx); return -EOVERFLOW; } ptr = io_mem_alloc(size); if (IS_ERR(ptr)) { - io_mem_free(ctx->rings); - ctx->rings = NULL; + io_rings_free(ctx); return PTR_ERR(ptr); } -- cgit v1.2.3 From 03d89a2de25bbc5c77e61a0cf77663978c4b6ea7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 5 Nov 2021 17:20:54 -0600 Subject: io_uring: support for user allocated memory for rings/sqes Currently io_uring applications must call mmap(2) twice to map the rings themselves, and the sqes array. This works fine, but it does not support using huge pages to back the rings/sqes. Provide a way for the application to pass in pre-allocated memory for the rings/sqes, which can then be allocated from shmfs or via mmap to get huge page support. Particularly for larger rings, this reduces the number of TLB entries needed. If an application wishes to take advantage of that, it must pre-allocate the memory needed for the sq/cq ring, and the sqes. The former must be passed in via the io_uring_params->cq_off.user_addr field, while the latter is passed in via the io_uring_params->sq_off.user_addr field. Then it must set IORING_SETUP_NO_MMAP in the io_uring_params->flags field, and io_uring will then map the existing memory into the kernel for shared use. The application must not call mmap(2) to map rings as it otherwise would have; that will now fail with -EINVAL if this setup flag was used. The pages used for the rings and sqes must be contiguous. The intent here is clearly that huge pages should be used, otherwise the normal setup procedure works fine as-is. The application may use one huge page for both the rings and sqes. Outside of those initialization changes, everything works like it did before. Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 10 ++++ include/uapi/linux/io_uring.h | 9 +++- io_uring/io_uring.c | 106 +++++++++++++++++++++++++++++++++++++---- 3 files changed, 114 insertions(+), 11 deletions(-) (limited to 'io_uring/io_uring.c') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 1b2a20a42413..f04ce513fadb 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -211,6 +211,16 @@ struct io_ring_ctx { unsigned int compat: 1; enum task_work_notify_mode notify_method; + + /* + * If IORING_SETUP_NO_MMAP is used, then the below holds + * the gup'ed pages for the two rings, and the sqes.
+ */ + unsigned short n_ring_pages; + unsigned short n_sqe_pages; + struct page **ring_pages; + struct page **sqe_pages; + struct io_rings *rings; struct task_struct *submitter_task; struct percpu_ref refs; diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 0716cb17e436..2edba9a274de 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -173,6 +173,11 @@ enum { */ #define IORING_SETUP_DEFER_TASKRUN (1U << 13) +/* + * Application provides the memory for the rings + */ +#define IORING_SETUP_NO_MMAP (1U << 14) + enum io_uring_op { IORING_OP_NOP, IORING_OP_READV, @@ -406,7 +411,7 @@ struct io_sqring_offsets { __u32 dropped; __u32 array; __u32 resv1; - __u64 resv2; + __u64 user_addr; }; /* @@ -425,7 +430,7 @@ struct io_cqring_offsets { __u32 cqes; __u32 flags; __u32 resv1; - __u64 resv2; + __u64 user_addr; }; /* diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 74433939a318..61379cf8e7f5 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2688,12 +2688,85 @@ static void io_mem_free(void *ptr) free_compound_page(page); } +static void io_pages_free(struct page ***pages, int npages) +{ + struct page **page_array; + int i; + + if (!pages) + return; + page_array = *pages; + for (i = 0; i < npages; i++) + unpin_user_page(page_array[i]); + kvfree(page_array); + *pages = NULL; +} + +static void *__io_uaddr_map(struct page ***pages, unsigned short *npages, + unsigned long uaddr, size_t size) +{ + struct page **page_array; + unsigned int nr_pages; + int ret; + + *npages = 0; + + if (uaddr & (PAGE_SIZE - 1) || !size) + return ERR_PTR(-EINVAL); + + nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (nr_pages > USHRT_MAX) + return ERR_PTR(-EINVAL); + page_array = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); + if (!page_array) + return ERR_PTR(-ENOMEM); + + ret = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM, + page_array); + if (ret != nr_pages) { +err: + io_pages_free(&page_array, ret > 0 ? ret : 0); + return ret < 0 ? ERR_PTR(ret) : ERR_PTR(-EFAULT); + } + /* + * Should be a single page. If the ring is small enough that we can + * use a normal page, that is fine. If we need multiple pages, then + * userspace should use a huge page. That's the only way to guarantee + * that we get contigious memory, outside of just being lucky or + * (currently) having low memory fragmentation. 
+ */ + if (page_array[0] != page_array[ret - 1]) + goto err; + *pages = page_array; + *npages = nr_pages; + return page_to_virt(page_array[0]); +} + +static void *io_rings_map(struct io_ring_ctx *ctx, unsigned long uaddr, + size_t size) +{ + return __io_uaddr_map(&ctx->ring_pages, &ctx->n_ring_pages, uaddr, + size); +} + +static void *io_sqes_map(struct io_ring_ctx *ctx, unsigned long uaddr, + size_t size) +{ + return __io_uaddr_map(&ctx->sqe_pages, &ctx->n_sqe_pages, uaddr, + size); +} + static void io_rings_free(struct io_ring_ctx *ctx) { - io_mem_free(ctx->rings); - io_mem_free(ctx->sq_sqes); - ctx->rings = NULL; - ctx->sq_sqes = NULL; + if (!(ctx->flags & IORING_SETUP_NO_MMAP)) { + io_mem_free(ctx->rings); + io_mem_free(ctx->sq_sqes); + ctx->rings = NULL; + ctx->sq_sqes = NULL; + } else { + io_pages_free(&ctx->ring_pages, ctx->n_ring_pages); + io_pages_free(&ctx->sqe_pages, ctx->n_sqe_pages); + } } static void *io_mem_alloc(size_t size) @@ -3338,6 +3411,10 @@ static void *io_uring_validate_mmap_request(struct file *file, struct page *page; void *ptr; + /* Don't allow mmap if the ring was setup without it */ + if (ctx->flags & IORING_SETUP_NO_MMAP) + return ERR_PTR(-EINVAL); + switch (offset & IORING_OFF_MMAP_MASK) { case IORING_OFF_SQ_RING: case IORING_OFF_CQ_RING: @@ -3673,7 +3750,11 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, if (size == SIZE_MAX) return -EOVERFLOW; - rings = io_mem_alloc(size); + if (!(ctx->flags & IORING_SETUP_NO_MMAP)) + rings = io_mem_alloc(size); + else + rings = io_rings_map(ctx, p->cq_off.user_addr, size); + if (IS_ERR(rings)) return PTR_ERR(rings); @@ -3693,7 +3774,11 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, return -EOVERFLOW; } - ptr = io_mem_alloc(size); + if (!(ctx->flags & IORING_SETUP_NO_MMAP)) + ptr = io_mem_alloc(size); + else + ptr = io_sqes_map(ctx, p->sq_off.user_addr, size); + if (IS_ERR(ptr)) { io_rings_free(ctx); return PTR_ERR(ptr); @@ -3885,7 +3970,8 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, p->sq_off.dropped = offsetof(struct io_rings, sq_dropped); p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings; p->sq_off.resv1 = 0; - p->sq_off.resv2 = 0; + if (!(ctx->flags & IORING_SETUP_NO_MMAP)) + p->sq_off.user_addr = 0; p->cq_off.head = offsetof(struct io_rings, cq.head); p->cq_off.tail = offsetof(struct io_rings, cq.tail); @@ -3895,7 +3981,8 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, p->cq_off.cqes = offsetof(struct io_rings, cqes); p->cq_off.flags = offsetof(struct io_rings, cq_flags); p->cq_off.resv1 = 0; - p->cq_off.resv2 = 0; + if (!(ctx->flags & IORING_SETUP_NO_MMAP)) + p->cq_off.user_addr = 0; p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS | @@ -3961,7 +4048,8 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL | IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG | IORING_SETUP_SQE128 | IORING_SETUP_CQE32 | - IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN)) + IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN | + IORING_SETUP_NO_MMAP)) return -EINVAL; return io_uring_create(entries, &p, params); -- cgit v1.2.3 From 6e76ac595855db27bbdaef337173294a6fd6eb2c Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Sat, 29 Apr 2023 01:40:30 +0900 Subject: io_uring: Add io_uring_setup flag to pre-register ring fd and never install it With 
IORING_REGISTER_USE_REGISTERED_RING, an application can register the ring fd and use it via registered index rather than installed fd. This allows using a registered ring for everything *except* the initial mmap. With IORING_SETUP_NO_MMAP, io_uring_setup uses buffers allocated by the user, rather than requiring a subsequent mmap. The combination of the two allows a user to operate *entirely* via a registered ring fd, making it unnecessary to ever install the fd in the first place. So, add a flag IORING_SETUP_REGISTERED_FD_ONLY to make io_uring_setup register the fd and return a registered index, without installing the fd. This allows an application to avoid touching the fd table at all, and allows a library to never even momentarily install a file descriptor. This splits out an io_ring_add_registered_file helper from io_ring_add_registered_fd, for use by io_uring_setup. Signed-off-by: Josh Triplett Link: https://lore.kernel.org/r/bc8f431bada371c183b95a83399628b605e978a3.1682699803.git.josh@joshtriplett.org Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 7 +++++++ io_uring/io_uring.c | 37 ++++++++++++++++++++++--------------- io_uring/io_uring.h | 3 +++ io_uring/tctx.c | 31 ++++++++++++++++++++----------- 4 files changed, 52 insertions(+), 26 deletions(-) (limited to 'io_uring/io_uring.c') diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 2edba9a274de..f222d263bc55 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -178,6 +178,13 @@ enum { */ #define IORING_SETUP_NO_MMAP (1U << 14) +/* + * Register the ring fd in itself for use with + * IORING_REGISTER_USE_REGISTERED_RING; return a registered fd index rather + * than an fd. + */ +#define IORING_SETUP_REGISTERED_FD_ONLY (1U << 15) + enum io_uring_op { IORING_OP_NOP, IORING_OP_READV, diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 61379cf8e7f5..dab09f568294 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3788,19 +3788,13 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, return 0; } -static int io_uring_install_fd(struct io_ring_ctx *ctx, struct file *file) +static int io_uring_install_fd(struct file *file) { - int ret, fd; + int fd; fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC); if (fd < 0) return fd; - - ret = __io_uring_add_tctx_node(ctx); - if (ret) { - put_unused_fd(fd); - return ret; - } fd_install(fd, file); return fd; } @@ -3840,6 +3834,7 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, struct io_uring_params __user *params) { struct io_ring_ctx *ctx; + struct io_uring_task *tctx; struct file *file; int ret; @@ -3851,6 +3846,10 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, entries = IORING_MAX_ENTRIES; } + if ((p->flags & IORING_SETUP_REGISTERED_FD_ONLY) + && !(p->flags & IORING_SETUP_NO_MMAP)) + return -EINVAL; + /* * Use twice as many entries for the CQ ring. 
It's possible for the * application to drive a higher depth than the size of the SQ ring, @@ -4007,22 +4006,30 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, goto err; } + ret = __io_uring_add_tctx_node(ctx); + if (ret) + goto err_fput; + tctx = current->io_uring; + /* * Install ring fd as the very last thing, so we don't risk someone * having closed it before we finish setup */ - ret = io_uring_install_fd(ctx, file); - if (ret < 0) { - /* fput will clean it up */ - fput(file); - return ret; - } + if (p->flags & IORING_SETUP_REGISTERED_FD_ONLY) + ret = io_ring_add_registered_file(tctx, file, 0, IO_RINGFD_REG_MAX); + else + ret = io_uring_install_fd(file); + if (ret < 0) + goto err_fput; trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags); return ret; err: io_ring_ctx_wait_and_kill(ctx); return ret; +err_fput: + fput(file); + return ret; } /* @@ -4049,7 +4056,7 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG | IORING_SETUP_SQE128 | IORING_SETUP_CQE32 | IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN | - IORING_SETUP_NO_MMAP)) + IORING_SETUP_NO_MMAP | IORING_SETUP_REGISTERED_FD_ONLY)) return -EINVAL; return io_uring_create(entries, &p, params); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 259bf798a390..9b8dfb3bb2b4 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -75,6 +75,9 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); int io_uring_alloc_task_context(struct task_struct *task, struct io_ring_ctx *ctx); +int io_ring_add_registered_file(struct io_uring_task *tctx, struct file *file, + int start, int end); + int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts); int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr); int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin); diff --git a/io_uring/tctx.c b/io_uring/tctx.c index 3a8d1dd97e1b..c043fe93a3f2 100644 --- a/io_uring/tctx.c +++ b/io_uring/tctx.c @@ -208,31 +208,40 @@ void io_uring_unreg_ringfd(void) } } -static int io_ring_add_registered_fd(struct io_uring_task *tctx, int fd, +int io_ring_add_registered_file(struct io_uring_task *tctx, struct file *file, int start, int end) { - struct file *file; int offset; - for (offset = start; offset < end; offset++) { offset = array_index_nospec(offset, IO_RINGFD_REG_MAX); if (tctx->registered_rings[offset]) continue; - file = fget(fd); - if (!file) { - return -EBADF; - } else if (!io_is_uring_fops(file)) { - fput(file); - return -EOPNOTSUPP; - } tctx->registered_rings[offset] = file; return offset; } - return -EBUSY; } +static int io_ring_add_registered_fd(struct io_uring_task *tctx, int fd, + int start, int end) +{ + struct file *file; + int offset; + + file = fget(fd); + if (!file) { + return -EBADF; + } else if (!io_is_uring_fops(file)) { + fput(file); + return -EOPNOTSUPP; + } + offset = io_ring_add_registered_file(tctx, file, start, end); + if (offset < 0) + fput(file); + return offset; +} + /* * Register a ring fd to avoid fdget/fdput for each io_uring_enter() * invocation. 
User passes in an array of struct io_uring_rsrc_update -- cgit v1.2.3 From 3af0356c162c299a8216576b644eb72715e97cb2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 19 May 2023 09:51:31 -0600 Subject: io_uring: maintain ordering for DEFER_TASKRUN tw list We use lockless lists for the local and deferred task_work, which means that when we queue up events for processing, we ultimately process them in reverse order to how they were received. This usually doesn't matter, but for some cases, it does seem to make a big difference. Do the right thing and reverse the list before processing it, so that we know it's processed in the same order in which it was received. This makes a rather big difference for some medium load network tests, where consistency of performance was a bit all over the place. Here's a case that has 4 connections each doing two sends and receives: io_uring port=10002: rps:161.13k Bps: 1.45M idle=256ms io_uring port=10002: rps:107.27k Bps: 0.97M idle=413ms io_uring port=10002: rps:136.98k Bps: 1.23M idle=321ms io_uring port=10002: rps:155.58k Bps: 1.40M idle=268ms and after the change: io_uring port=10002: rps:205.48k Bps: 1.85M idle=140ms user=40ms io_uring port=10002: rps:203.57k Bps: 1.83M idle=139ms user=20ms io_uring port=10002: rps:218.79k Bps: 1.97M idle=106ms user=30ms io_uring port=10002: rps:217.88k Bps: 1.96M idle=110ms user=20ms io_uring port=10002: rps:222.31k Bps: 2.00M idle=101ms user=0ms io_uring port=10002: rps:218.74k Bps: 1.97M idle=102ms user=20ms io_uring port=10002: rps:208.43k Bps: 1.88M idle=125ms user=40ms using more of the time to actually process work rather than sitting idle. No effects have been observed at the peak end of the spectrum, where performance is still the same even with deep batch depths (and hence more items to sort). Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'io_uring/io_uring.c') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index dab09f568294..c99a7a0c3f21 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1405,7 +1405,11 @@ static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts) if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); again: - node = io_llist_xchg(&ctx->work_llist, NULL); + /* + * llists are in reverse order, flip it back the right way before + * running the pending items. + */ + node = llist_reverse_order(io_llist_xchg(&ctx->work_llist, NULL)); while (node) { struct llist_node *next = node->next; struct io_kiocb *req = container_of(node, struct io_kiocb, -- cgit v1.2.3 From c92fcfc2bab54451c4f1481755ea244f413455cb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 2 Jun 2023 08:41:46 -0600 Subject: io_uring: avoid indirect function calls for the hottest task_work We use task_work for a variety of reasons, but doing completions or triggering rety after poll are by far the hottest two. Use the indirect funtion call wrappers to avoid the indirect function call if CONFIG_RETPOLINE is set. 
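The wrappers in question come from <linux/indirect_call_wrapper.h>. A rough, simplified sketch of the idea (the EXAMPLE_ prefix and example_dispatch are made up; the real macro only kicks in when retpolines are built in, otherwise it degrades to a plain call):

#define EXAMPLE_INDIRECT_CALL_2(f, f2, f1, ...)				\
	(likely((f) == (f2)) ? (f2)(__VA_ARGS__) :			\
	 likely((f) == (f1)) ? (f1)(__VA_ARGS__) : (f)(__VA_ARGS__))

static void example_dispatch(struct io_kiocb *req, struct io_tw_state *ts)
{
	/* compare the stored pointer against the two hot handlers and
	 * call them directly on a match, avoiding the retpoline thunk */
	EXAMPLE_INDIRECT_CALL_2(req->io_task_work.func,
				io_poll_task_func, io_req_rw_complete,
				req, ts);
}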
Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 9 +++++++-- io_uring/poll.c | 2 +- io_uring/poll.h | 2 ++ io_uring/rw.c | 2 +- io_uring/rw.h | 1 + 5 files changed, 12 insertions(+), 4 deletions(-) (limited to 'io_uring/io_uring.c') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index c99a7a0c3f21..fc511cb6761d 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -95,6 +95,7 @@ #include "timeout.h" #include "poll.h" +#include "rw.h" #include "alloc_cache.h" #define IORING_MAX_ENTRIES 32768 @@ -1205,7 +1206,9 @@ static unsigned int handle_tw_list(struct llist_node *node, ts->locked = mutex_trylock(&(*ctx)->uring_lock); percpu_ref_get(&(*ctx)->refs); } - req->io_task_work.func(req, ts); + INDIRECT_CALL_2(req->io_task_work.func, + io_poll_task_func, io_req_rw_complete, + req, ts); node = next; count++; if (unlikely(need_resched())) { @@ -1415,7 +1418,9 @@ again: struct io_kiocb *req = container_of(node, struct io_kiocb, io_task_work.node); prefetch(container_of(next, struct io_kiocb, io_task_work.node)); - req->io_task_work.func(req, ts); + INDIRECT_CALL_2(req->io_task_work.func, + io_poll_task_func, io_req_rw_complete, + req, ts); ret++; node = next; } diff --git a/io_uring/poll.c b/io_uring/poll.c index c90e47dc1e29..9689806d3c16 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -326,7 +326,7 @@ static int io_poll_check_events(struct io_kiocb *req, struct io_tw_state *ts) return IOU_POLL_NO_ACTION; } -static void io_poll_task_func(struct io_kiocb *req, struct io_tw_state *ts) +void io_poll_task_func(struct io_kiocb *req, struct io_tw_state *ts) { int ret; diff --git a/io_uring/poll.h b/io_uring/poll.h index b2393b403a2c..ff4d5d753387 100644 --- a/io_uring/poll.h +++ b/io_uring/poll.h @@ -38,3 +38,5 @@ bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, bool cancel_all); void io_apoll_cache_free(struct io_cache_entry *entry); + +void io_poll_task_func(struct io_kiocb *req, struct io_tw_state *ts); diff --git a/io_uring/rw.c b/io_uring/rw.c index 3f118ed46e4f..c23d8baf0287 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -283,7 +283,7 @@ static inline int io_fixup_rw_res(struct io_kiocb *req, long res) return res; } -static void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts) +void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts) { io_req_io_end(req); diff --git a/io_uring/rw.h b/io_uring/rw.h index 3b733f4b610a..4b89f9659366 100644 --- a/io_uring/rw.h +++ b/io_uring/rw.h @@ -22,3 +22,4 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags); int io_writev_prep_async(struct io_kiocb *req); void io_readv_writev_cleanup(struct io_kiocb *req); void io_rw_fail(struct io_kiocb *req); +void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts); -- cgit v1.2.3 From d86eaed185e9c6052d1ee2ca538f1936ff255887 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 7 Jun 2023 14:41:20 -0600 Subject: io_uring: cleanup io_aux_cqe() API Everybody is passing in the request, so get rid of the io_ring_ctx and explicit user_data pass-in. Both the ctx and user_data can be deduced from the request at hand. 
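As background (not part of the patch): io_aux_cqe() is what posts the extra completions for multishot requests, and the hunks below tag those CQEs with IORING_CQE_F_MORE. From the application's side that flag is what signals whether the request is still armed. A small, hedged userspace sketch; the function name is illustrative.

static void handle_cqe(struct io_uring_cqe *cqe)
{
	if (cqe->res < 0) {
		/* terminal error, the multishot request is finished */
		return;
	}
	if (!(cqe->flags & IORING_CQE_F_MORE)) {
		/* last CQE for this request: re-submit it if still wanted */
		return;
	}
	/* otherwise more CQEs will follow for the same user_data */
}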
Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 4 +++- io_uring/io_uring.h | 2 +- io_uring/net.c | 9 ++++----- io_uring/poll.c | 4 ++-- io_uring/timeout.c | 4 ++-- 5 files changed, 12 insertions(+), 11 deletions(-) (limited to 'io_uring/io_uring.c') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index fc511cb6761d..08574a86da72 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -935,9 +935,11 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags return __io_post_aux_cqe(ctx, user_data, res, cflags, true); } -bool io_aux_cqe(struct io_ring_ctx *ctx, bool defer, u64 user_data, s32 res, u32 cflags, +bool io_aux_cqe(const struct io_kiocb *req, bool defer, s32 res, u32 cflags, bool allow_overflow) { + struct io_ring_ctx *ctx = req->ctx; + u64 user_data = req->cqe.user_data; struct io_uring_cqe *cqe; unsigned int length; diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 9b8dfb3bb2b4..a937b4b75aee 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -47,7 +47,7 @@ int io_run_task_work_sig(struct io_ring_ctx *ctx); void io_req_defer_failed(struct io_kiocb *req, s32 res); void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags); bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); -bool io_aux_cqe(struct io_ring_ctx *ctx, bool defer, u64 user_data, s32 res, u32 cflags, +bool io_aux_cqe(const struct io_kiocb *req, bool defer, s32 res, u32 cflags, bool allow_overflow); void __io_commit_cqring_flush(struct io_ring_ctx *ctx); diff --git a/io_uring/net.c b/io_uring/net.c index 0795f3783013..369167e45fa8 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -632,8 +632,8 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, } if (!mshot_finished) { - if (io_aux_cqe(req->ctx, issue_flags & IO_URING_F_COMPLETE_DEFER, - req->cqe.user_data, *ret, cflags | IORING_CQE_F_MORE, true)) { + if (io_aux_cqe(req, issue_flags & IO_URING_F_COMPLETE_DEFER, + *ret, cflags | IORING_CQE_F_MORE, true)) { io_recv_prep_retry(req); /* Known not-empty or unknown state, retry */ if (cflags & IORING_CQE_F_SOCK_NONEMPTY || @@ -1304,7 +1304,6 @@ int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) int io_accept(struct io_kiocb *req, unsigned int issue_flags) { - struct io_ring_ctx *ctx = req->ctx; struct io_accept *accept = io_kiocb_to_cmd(req, struct io_accept); bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; unsigned int file_flags = force_nonblock ? 
O_NONBLOCK : 0; @@ -1354,8 +1353,8 @@ retry: if (ret < 0) return ret; - if (io_aux_cqe(ctx, issue_flags & IO_URING_F_COMPLETE_DEFER, - req->cqe.user_data, ret, IORING_CQE_F_MORE, true)) + if (io_aux_cqe(req, issue_flags & IO_URING_F_COMPLETE_DEFER, ret, + IORING_CQE_F_MORE, true)) goto retry; return -ECANCELED; diff --git a/io_uring/poll.c b/io_uring/poll.c index 9689806d3c16..6b9179e8228e 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -300,8 +300,8 @@ static int io_poll_check_events(struct io_kiocb *req, struct io_tw_state *ts) __poll_t mask = mangle_poll(req->cqe.res & req->apoll_events); - if (!io_aux_cqe(req->ctx, ts->locked, req->cqe.user_data, - mask, IORING_CQE_F_MORE, false)) { + if (!io_aux_cqe(req, ts->locked, mask, + IORING_CQE_F_MORE, false)) { io_req_set_res(req, mask, 0); return IOU_POLL_REMOVE_POLL_USE_RES; } diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 350eb830b485..fb0547b35dcd 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -73,8 +73,8 @@ static void io_timeout_complete(struct io_kiocb *req, struct io_tw_state *ts) if (!io_timeout_finish(timeout, data)) { bool filled; - filled = io_aux_cqe(ctx, ts->locked, req->cqe.user_data, -ETIME, - IORING_CQE_F_MORE, false); + filled = io_aux_cqe(req, ts->locked, -ETIME, IORING_CQE_F_MORE, + false); if (filled) { /* re-arm timer */ spin_lock_irq(&ctx->timeout_lock); -- cgit v1.2.3 From 003f242b0dc16b287e6d15833d1d7f4adfa346ff Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 7 Jun 2023 15:00:07 -0600 Subject: io_uring: get rid of unnecessary 'length' variable Just use the ARRAY_SIZE directly, we don't use length for anything else in this function. Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'io_uring/io_uring.c') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 08574a86da72..a467064da1af 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -941,16 +941,13 @@ bool io_aux_cqe(const struct io_kiocb *req, bool defer, s32 res, u32 cflags, struct io_ring_ctx *ctx = req->ctx; u64 user_data = req->cqe.user_data; struct io_uring_cqe *cqe; - unsigned int length; if (!defer) return __io_post_aux_cqe(ctx, user_data, res, cflags, allow_overflow); - length = ARRAY_SIZE(ctx->submit_state.cqes); - lockdep_assert_held(&ctx->uring_lock); - if (ctx->submit_state.cqes_count == length) { + if (ctx->submit_state.cqes_count == ARRAY_SIZE(ctx->submit_state.cqes)) { __io_cq_lock(ctx); __io_flush_post_cqes(ctx); /* no need to flush - flush is deferred */ -- cgit v1.2.3 From 4826c59453b3b4677d6bf72814e7ababdea86949 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 11 Jun 2023 21:14:09 -0600 Subject: io_uring: wait interruptibly for request completions on exit WHen the ring exits, cleanup is done and the final cancelation and waiting on completions is done by io_ring_exit_work. That function is invoked by kworker, which doesn't take any signals. Because of that, it doesn't really matter if we wait for completions in TASK_INTERRUPTIBLE or TASK_UNINTERRUPTIBLE state. However, it does matter to the hung task detection checker! Normally we expect cancelations and completions to happen rather quickly. Some test cases, however, will exit the ring and park the owning task stopped (eg via SIGSTOP). If the owning task needs to run task_work to complete requests, then io_ring_exit_work won't make any progress until the task is runnable again. 
Hence io_ring_exit_work can trigger the hung task detection, which is particularly problematic if panic-on-hung-task is enabled. As the ring exit doesn't take signals to begin with, have it wait interruptibly rather than uninterruptibly. io_uring has a separate stuck-exit warning that triggers independently anyway, so we're not really missing anything by making this switch. Cc: stable@vger.kernel.org # 5.10+ Link: https://lore.kernel.org/r/b0e4aaef-7088-56ce-244c-976edeac0e66@kernel.dk Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'io_uring/io_uring.c') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index a467064da1af..f181876e415b 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -3121,7 +3121,18 @@ static __cold void io_ring_exit_work(struct work_struct *work) /* there is little hope left, don't run it too often */ interval = HZ * 60; } - } while (!wait_for_completion_timeout(&ctx->ref_comp, interval)); + /* + * This is really an uninterruptible wait, as it has to be + * complete. But it's also run from a kworker, which doesn't + * take signals, so it's fine to make it interruptible. This + * avoids scenarios where we knowingly can wait much longer + * on completions, for example if someone does a SIGSTOP on + * a task that needs to finish task_work to make this loop + * complete. That's a synthetic situation that should not + * cause a stuck task backtrace, and hence a potential panic + * on stuck tasks if that is enabled. + */ + } while (!wait_for_completion_interruptible_timeout(&ctx->ref_comp, interval)); init_completion(&exit.completion); init_task_work(&exit.task_work, io_tctx_exit_cb); @@ -3145,7 +3156,12 @@ static __cold void io_ring_exit_work(struct work_struct *work) continue; mutex_unlock(&ctx->uring_lock); - wait_for_completion(&exit.completion); + /* + * See comment above for + * wait_for_completion_interruptible_timeout() on why this + * wait is marked as interruptible. + */ + wait_for_completion_interruptible(&exit.completion); mutex_lock(&ctx->uring_lock); } mutex_unlock(&ctx->uring_lock); -- cgit v1.2.3 From b9a6c9459a5aec7bfd9b763554d15148367f1806 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 20 Jun 2023 13:32:28 +0200 Subject: io_uring: remove __io_file_supports_nowait MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that this only checks O_NONBLOCK and FMODE_NOWAIT, the helper is complete overkilļ, and the comments are confusing bordering to wrong. Just inline the check into the caller. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230620113235.920399-2-hch@lst.de Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) (limited to 'io_uring/io_uring.c') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index f181876e415b..7e735724940f 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1766,19 +1766,6 @@ static void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags) } } -/* - * If we tracked the file through the SCM inflight mechanism, we could support - * any file. For now, just ensure that anything potentially problematic is done - * inline. 
 */ -static bool __io_file_supports_nowait(struct file *file, umode_t mode) -{ - /* any ->read/write should understand O_NONBLOCK */ - if (file->f_flags & O_NONBLOCK) - return true; - return file->f_mode & FMODE_NOWAIT; -} - /* * If we tracked the file through the SCM inflight mechanism, we could support * any file. For now, just ensure that anything potentially problematic is done @@ -1791,7 +1778,7 @@ unsigned int io_file_get_flags(struct file *file) if (S_ISREG(mode)) res |= FFS_ISREG; - if (__io_file_supports_nowait(file, mode)) + if ((file->f_flags & O_NONBLOCK) || (file->f_mode & FMODE_NOWAIT)) res |= FFS_NOWAIT; return res; } -- cgit v1.2.3 From 53cfd5cea7f36bac7f3d45de4fea77e0c8d57aee Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 20 Jun 2023 13:32:29 +0200 Subject: io_uring: remove the mode variable in io_file_get_flags The variable is only used once now, so don't bother with it. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230620113235.920399-3-hch@lst.de Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'io_uring/io_uring.c') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 7e735724940f..2d13f636de93 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1773,10 +1773,9 @@ static void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags) */ unsigned int io_file_get_flags(struct file *file) { - umode_t mode = file_inode(file)->i_mode; unsigned int res = 0; - if (S_ISREG(mode)) + if (S_ISREG(file_inode(file)->i_mode)) res |= FFS_ISREG; if ((file->f_flags & O_NONBLOCK) || (file->f_mode & FMODE_NOWAIT)) res |= FFS_NOWAIT; return res; } -- cgit v1.2.3 From b57c7cd1c17616ae9db5614525ba703f384afd05 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 20 Jun 2023 13:32:30 +0200 Subject: io_uring: remove a confusing comment above io_file_get_flags The SCM inflight mechanism has nothing to do with whether a file is a regular file, or with whether it supports non-blocking operations. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230620113235.920399-4-hch@lst.de Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'io_uring/io_uring.c') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 2d13f636de93..79f3cabec5b9 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1766,11 +1766,6 @@ static void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags) } } -/* - * If we tracked the file through the SCM inflight mechanism, we could support - * any file. For now, just ensure that anything potentially problematic is done - * inline. - */ unsigned int io_file_get_flags(struct file *file) { unsigned int res = 0; if (S_ISREG(file_inode(file)->i_mode)) res |= FFS_ISREG; if ((file->f_flags & O_NONBLOCK) || (file->f_mode & FMODE_NOWAIT)) res |= FFS_NOWAIT; return res; } -- cgit v1.2.3 From 3beed235d1a1d0a4ab093ab67ea6b2841e9d4fa2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 20 Jun 2023 13:32:31 +0200 Subject: io_uring: remove io_req_ffs_set Just checking the flag directly makes it a lot more obvious what is going on here.
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230620113235.920399-5-hch@lst.de Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 2 +- io_uring/io_uring.h | 5 ----- io_uring/rw.c | 2 +- 3 files changed, 2 insertions(+), 7 deletions(-) (limited to 'io_uring/io_uring.c') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 79f3cabec5b9..0e0bdb6ac9a2 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -424,7 +424,7 @@ static void io_prep_async_work(struct io_kiocb *req) if (req->flags & REQ_F_FORCE_ASYNC) req->work.flags |= IO_WQ_WORK_CONCURRENT; - if (req->file && !io_req_ffs_set(req)) + if (req->file && !(req->flags & REQ_F_FIXED_FILE)) req->flags |= io_file_get_flags(req->file) << REQ_F_SUPPORT_NOWAIT_BIT; if (req->file && (req->flags & REQ_F_ISREG)) { diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index a937b4b75aee..9718897133db 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -57,11 +57,6 @@ struct file *io_file_get_normal(struct io_kiocb *req, int fd); struct file *io_file_get_fixed(struct io_kiocb *req, int fd, unsigned issue_flags); -static inline bool io_req_ffs_set(struct io_kiocb *req) -{ - return req->flags & REQ_F_FIXED_FILE; -} - void __io_req_task_work_add(struct io_kiocb *req, unsigned flags); bool io_is_uring_fops(struct file *file); bool io_alloc_async_data(struct io_kiocb *req); diff --git a/io_uring/rw.c b/io_uring/rw.c index c23d8baf0287..1cf5742f2ae9 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -666,7 +666,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode) if (unlikely(!file || !(file->f_mode & mode))) return -EBADF; - if (!io_req_ffs_set(req)) + if (!(req->flags & REQ_F_FIXED_FILE)) req->flags |= io_file_get_flags(file) << REQ_F_SUPPORT_NOWAIT_BIT; kiocb->ki_flags = file->f_iocb_flags; -- cgit v1.2.3 From 8487f083c6ff6e02b2ec14f22ef2b0079a1b6425 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 20 Jun 2023 13:32:32 +0200 Subject: io_uring: return REQ_F_ flags from io_file_get_flags Two of the three callers want them, so return the more usual format, and shift into the FFS_ form only for the fixed file table. 
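The shift works because FFS_NOWAIT/FFS_ISREG live in the two low bits of the stored file pointer (spare because struct file pointers are aligned), and the corresponding REQ_F_ bits are laid out so a single shift converts between the two forms. A sketch pulling the pack and unpack sides together; example_pack_unpack is illustrative, the masks mirror the FFS_ constants shown below.

#define EX_FFS_NOWAIT	0x1UL
#define EX_FFS_ISREG	0x2UL
#define EX_FFS_MASK	~(EX_FFS_NOWAIT | EX_FFS_ISREG)

static void example_pack_unpack(struct io_fixed_file *slot,
				struct file *file, struct io_kiocb *req)
{
	/* pack: shift the REQ_F_ flags down into the two spare low bits */
	slot->file_ptr = (unsigned long)file |
			 (io_file_get_flags(file) >> REQ_F_SUPPORT_NOWAIT_BIT);

	/* unpack: mask the tag bits off to recover the pointer ... */
	req->file = (struct file *)(slot->file_ptr & EX_FFS_MASK);
	/* ... and shift them back up into REQ_F_SUPPORT_NOWAIT/REQ_F_ISREG */
	req->flags |= (slot->file_ptr & ~EX_FFS_MASK) << REQ_F_SUPPORT_NOWAIT_BIT;
}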
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230620113235.920399-6-hch@lst.de Signed-off-by: Jens Axboe --- io_uring/filetable.h | 6 ++---- io_uring/io_uring.c | 6 +++--- io_uring/rw.c | 2 +- 3 files changed, 6 insertions(+), 8 deletions(-) (limited to 'io_uring/io_uring.c') diff --git a/io_uring/filetable.h b/io_uring/filetable.h index 351111ff8882..697cb68adc81 100644 --- a/io_uring/filetable.h +++ b/io_uring/filetable.h @@ -54,10 +54,8 @@ static inline struct file *io_file_from_index(struct io_file_table *table, static inline void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file) { - unsigned long file_ptr = (unsigned long) file; - - file_ptr |= io_file_get_flags(file); - file_slot->file_ptr = file_ptr; + file_slot->file_ptr = (unsigned long)file | + (io_file_get_flags(file) >> REQ_F_SUPPORT_NOWAIT_BIT); } static inline void io_reset_alloc_hint(struct io_ring_ctx *ctx) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 0e0bdb6ac9a2..1f348753694b 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -425,7 +425,7 @@ static void io_prep_async_work(struct io_kiocb *req) req->work.flags |= IO_WQ_WORK_CONCURRENT; if (req->file && !(req->flags & REQ_F_FIXED_FILE)) - req->flags |= io_file_get_flags(req->file) << REQ_F_SUPPORT_NOWAIT_BIT; + req->flags |= io_file_get_flags(req->file); if (req->file && (req->flags & REQ_F_ISREG)) { bool should_hash = def->hash_reg_file; @@ -1771,9 +1771,9 @@ unsigned int io_file_get_flags(struct file *file) unsigned int res = 0; if (S_ISREG(file_inode(file)->i_mode)) - res |= FFS_ISREG; + res |= REQ_F_ISREG; if ((file->f_flags & O_NONBLOCK) || (file->f_mode & FMODE_NOWAIT)) - res |= FFS_NOWAIT; + res |= REQ_F_SUPPORT_NOWAIT; return res; } diff --git a/io_uring/rw.c b/io_uring/rw.c index 1cf5742f2ae9..1bce2208b65c 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -667,7 +667,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode) return -EBADF; if (!(req->flags & REQ_F_FIXED_FILE)) - req->flags |= io_file_get_flags(file) << REQ_F_SUPPORT_NOWAIT_BIT; + req->flags |= io_file_get_flags(file); kiocb->ki_flags = file->f_iocb_flags; ret = kiocb_set_rw_flags(kiocb, rw->flags); -- cgit v1.2.3 From 4bfb0c9af832a182a54e549123a634e0070c8d4f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 20 Jun 2023 13:32:35 +0200 Subject: io_uring: add helpers to decode the fixed file file_ptr Remove all the open coded magic on slot->file_ptr by introducing two helpers that return the file pointer and the flags instead. 
Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230620113235.920399-9-hch@lst.de Signed-off-by: Jens Axboe --- io_uring/filetable.c | 11 ++++------- io_uring/filetable.h | 22 +++++++++++++++------- io_uring/io_uring.c | 10 ++++------ io_uring/rsrc.c | 8 ++++---- 4 files changed, 27 insertions(+), 24 deletions(-) (limited to 'io_uring/io_uring.c') diff --git a/io_uring/filetable.c b/io_uring/filetable.c index 0f6fa791a47d..e7d749991de4 100644 --- a/io_uring/filetable.c +++ b/io_uring/filetable.c @@ -78,10 +78,8 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file, file_slot = io_fixed_file_slot(&ctx->file_table, slot_index); if (file_slot->file_ptr) { - struct file *old_file; - - old_file = (struct file *)(file_slot->file_ptr & FFS_MASK); - ret = io_queue_rsrc_removal(ctx->file_data, slot_index, old_file); + ret = io_queue_rsrc_removal(ctx->file_data, slot_index, + io_slot_file(file_slot)); if (ret) return ret; @@ -140,7 +138,6 @@ int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags, int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset) { struct io_fixed_file *file_slot; - struct file *file; int ret; if (unlikely(!ctx->file_data)) @@ -153,8 +150,8 @@ int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset) if (!file_slot->file_ptr) return -EBADF; - file = (struct file *)(file_slot->file_ptr & FFS_MASK); - ret = io_queue_rsrc_removal(ctx->file_data, offset, file); + ret = io_queue_rsrc_removal(ctx->file_data, offset, + io_slot_file(file_slot)); if (ret) return ret; diff --git a/io_uring/filetable.h b/io_uring/filetable.h index 697cb68adc81..b47adf170c31 100644 --- a/io_uring/filetable.h +++ b/io_uring/filetable.h @@ -5,10 +5,6 @@ #include #include -#define FFS_NOWAIT 0x1UL -#define FFS_ISREG 0x2UL -#define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG) - bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files); void io_free_file_tables(struct io_file_table *table); @@ -43,12 +39,24 @@ io_fixed_file_slot(struct io_file_table *table, unsigned i) return &table->files[i]; } +#define FFS_NOWAIT 0x1UL +#define FFS_ISREG 0x2UL +#define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG) + +static inline unsigned int io_slot_flags(struct io_fixed_file *slot) +{ + return (slot->file_ptr & ~FFS_MASK) << REQ_F_SUPPORT_NOWAIT_BIT; +} + +static inline struct file *io_slot_file(struct io_fixed_file *slot) +{ + return (struct file *)(slot->file_ptr & FFS_MASK); +} + static inline struct file *io_file_from_index(struct io_file_table *table, int index) { - struct io_fixed_file *slot = io_fixed_file_slot(table, index); - - return (struct file *) (slot->file_ptr & FFS_MASK); + return io_slot_file(io_fixed_file_slot(table, index)); } static inline void io_fixed_file_set(struct io_fixed_file *file_slot, diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 1f348753694b..ae4cb3c4e730 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2028,19 +2028,17 @@ inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd, unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; + struct io_fixed_file *slot; struct file *file = NULL; - unsigned long file_ptr; io_ring_submit_lock(ctx, issue_flags); if (unlikely((unsigned int)fd >= ctx->nr_user_files)) goto out; fd = array_index_nospec(fd, ctx->nr_user_files); - file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr; - file = (struct file *) (file_ptr & FFS_MASK); - file_ptr &= ~FFS_MASK; - /* mask in overlapping REQ_F and FFS bits */ - req->flags |= (file_ptr << 
REQ_F_SUPPORT_NOWAIT_BIT); + slot = io_fixed_file_slot(&ctx->file_table, fd); + file = io_slot_file(slot); + req->flags |= io_slot_flags(slot); io_req_set_rsrc_node(req, ctx, 0); out: io_ring_submit_unlock(ctx, issue_flags); diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index d46f72a5ef73..a2dce7ef3a78 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -354,7 +354,6 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, __s32 __user *fds = u64_to_user_ptr(up->data); struct io_rsrc_data *data = ctx->file_data; struct io_fixed_file *file_slot; - struct file *file; int fd, i, err = 0; unsigned int done; @@ -382,15 +381,16 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, file_slot = io_fixed_file_slot(&ctx->file_table, i); if (file_slot->file_ptr) { - file = (struct file *)(file_slot->file_ptr & FFS_MASK); - err = io_queue_rsrc_removal(data, i, file); + err = io_queue_rsrc_removal(data, i, + io_slot_file(file_slot)); if (err) break; file_slot->file_ptr = 0; io_file_bitmap_clear(&ctx->file_table, i); } if (fd != -1) { - file = fget(fd); + struct file *file = fget(fd); + if (!file) { err = -EBADF; break; -- cgit v1.2.3 From 247f97a5f19b642eba5f5c1cf95fc3169326b3fb Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 23 Jun 2023 12:23:21 +0100 Subject: io_uring: open code io_put_req_find_next There is only one user of io_put_req_find_next() and it doesn't make much sense to have it. Open code the function. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/38b5c5e48e4adc8e6a0cd16fdd5c1531d7ff81a9.1687518903.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 25 +++++++------------------ 1 file changed, 7 insertions(+), 18 deletions(-) (limited to 'io_uring/io_uring.c') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index ae4cb3c4e730..b488a03ba009 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1586,22 +1586,6 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx) } } -/* - * Drop reference to request, return next in chain (if there is one) if this - * was the last reference to this request. - */ -static inline struct io_kiocb *io_put_req_find_next(struct io_kiocb *req) -{ - struct io_kiocb *nxt = NULL; - - if (req_ref_put_and_test(req)) { - if (unlikely(req->flags & IO_REQ_LINK_FLAGS)) - nxt = io_req_find_next(req); - io_free_req(req); - } - return nxt; -} - static unsigned io_cqring_events(struct io_ring_ctx *ctx) { /* See comment at the top of this file */ @@ -1954,9 +1938,14 @@ int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts) struct io_wq_work *io_wq_free_work(struct io_wq_work *work) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); + struct io_kiocb *nxt = NULL; - req = io_put_req_find_next(req); - return req ? &req->work : NULL; + if (req_ref_put_and_test(req)) { + if (req->flags & IO_REQ_LINK_FLAGS) + nxt = io_req_find_next(req); + io_free_req(req); + } + return nxt ? &nxt->work : NULL; } void io_wq_submit_work(struct io_wq_work *work) -- cgit v1.2.3 From 6ec9afc7f4cba58ab740c59d4c964d9422e2ea82 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 23 Jun 2023 12:23:22 +0100 Subject: io_uring: remove io_free_req_tw Request completion is a very hot path in general, but there are 3 places that can be doing it: io_free_batch_list(), io_req_complete_post() and io_free_req_tw(). io_free_req_tw() is used rather marginally and we don't care about it. Killing it can help to clean up and optimise the left two, do that by replacing it with io_req_task_complete(). 
There are two things to consider: 1) io_free_req() is called when all refs are put, so we need to reinit references. The easiest way to do that is to clear REQ_F_REFCOUNT. 2) We also don't need a cqe from it, so silence it with REQ_F_CQE_SKIP. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/434a2be8f33d474ad888ce1c17fe5ea7bbcb2a55.1687518903.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 23 +++++------------------ 1 file changed, 5 insertions(+), 18 deletions(-) (limited to 'io_uring/io_uring.c') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index b488a03ba009..43805d2621f5 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1121,26 +1121,13 @@ static inline void io_dismantle_req(struct io_kiocb *req) io_put_file(req->file); } -static __cold void io_free_req_tw(struct io_kiocb *req, struct io_tw_state *ts) -{ - struct io_ring_ctx *ctx = req->ctx; - - if (req->rsrc_node) { - io_tw_lock(ctx, ts); - io_put_rsrc_node(ctx, req->rsrc_node); - } - io_dismantle_req(req); - io_put_task_remote(req->task, 1); - - spin_lock(&ctx->completion_lock); - wq_list_add_head(&req->comp_list, &ctx->locked_free_list); - ctx->locked_free_nr++; - spin_unlock(&ctx->completion_lock); -} - __cold void io_free_req(struct io_kiocb *req) { - req->io_task_work.func = io_free_req_tw; + /* refs were already put, restore them for io_req_task_complete() */ + req->flags &= ~REQ_F_REFCOUNT; + /* we only want to free it, don't post CQEs */ + req->flags |= REQ_F_CQE_SKIP; + req->io_task_work.func = io_req_task_complete; io_req_task_work_add(req); } -- cgit v1.2.3 From 3b7a612fd0dbd321e15a308b8ac1f8bbf81432bd Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 23 Jun 2023 12:23:23 +0100 Subject: io_uring: inline io_dismantle_req() io_dismantle_req() is only used in __io_req_complete_post(), open code it there. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/ba8f20cb2c914eefa2e7d120a104a198552050db.1687518903.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) (limited to 'io_uring/io_uring.c') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 43805d2621f5..50fe345bdced 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -146,7 +146,6 @@ static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, struct task_struct *task, bool cancel_all); -static void io_dismantle_req(struct io_kiocb *req); static void io_clean_op(struct io_kiocb *req); static void io_queue_sqe(struct io_kiocb *req); static void io_move_task_work_from_local(struct io_ring_ctx *ctx); @@ -991,7 +990,11 @@ static void __io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) } } io_put_kbuf_comp(req); - io_dismantle_req(req); + if (unlikely(req->flags & IO_REQ_CLEAN_FLAGS)) + io_clean_op(req); + if (!(req->flags & REQ_F_FIXED_FILE)) + io_put_file(req->file); + rsrc_node = req->rsrc_node; /* * Selected buffer deallocation in io_clean_op() assumes that @@ -1111,16 +1114,6 @@ __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx) return true; } -static inline void io_dismantle_req(struct io_kiocb *req) -{ - unsigned int flags = req->flags; - - if (unlikely(flags & IO_REQ_CLEAN_FLAGS)) - io_clean_op(req); - if (!(flags & REQ_F_FIXED_FILE)) - io_put_file(req->file); -} - __cold void io_free_req(struct io_kiocb *req) { /* refs were already put, restore them for io_req_task_complete() */ -- cgit v1.2.3 From 5a754dea27fb91a418f7429e24479e4184dee2e3 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 23 Jun 2023 12:23:24 +0100 Subject: io_uring: move io_clean_op() Move io_clean_op() up in the source file and remove the forward declaration, as the function doesn't have tricky dependencies anymore. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/1b7163b2ba7c3a8322d972c79c1b0a9301b3057e.1687518903.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 67 ++++++++++++++++++++++++++--------------------------- 1 file changed, 33 insertions(+), 34 deletions(-) (limited to 'io_uring/io_uring.c') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 50fe345bdced..4d8613996644 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -146,7 +146,6 @@ static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, struct task_struct *task, bool cancel_all); -static void io_clean_op(struct io_kiocb *req); static void io_queue_sqe(struct io_kiocb *req); static void io_move_task_work_from_local(struct io_ring_ctx *ctx); static void __io_submit_flush_completions(struct io_ring_ctx *ctx); @@ -367,6 +366,39 @@ static bool req_need_defer(struct io_kiocb *req, u32 seq) return false; } +static void io_clean_op(struct io_kiocb *req) +{ + if (req->flags & REQ_F_BUFFER_SELECTED) { + spin_lock(&req->ctx->completion_lock); + io_put_kbuf_comp(req); + spin_unlock(&req->ctx->completion_lock); + } + + if (req->flags & REQ_F_NEED_CLEANUP) { + const struct io_cold_def *def = &io_cold_defs[req->opcode]; + + if (def->cleanup) + def->cleanup(req); + } + if ((req->flags & REQ_F_POLLED) && req->apoll) { + kfree(req->apoll->double_poll); + kfree(req->apoll); + req->apoll = NULL; + } + if (req->flags & REQ_F_INFLIGHT) { + struct io_uring_task *tctx = req->task->io_uring; + + atomic_dec(&tctx->inflight_tracked); + } + if (req->flags & REQ_F_CREDS) + put_cred(req->creds); + if (req->flags & REQ_F_ASYNC_DATA) { + kfree(req->async_data); + req->async_data = NULL; + } + req->flags &= ~IO_REQ_CLEAN_FLAGS; +} + static inline void io_req_track_inflight(struct io_kiocb *req) { if (!(req->flags & REQ_F_INFLIGHT)) { @@ -1823,39 +1855,6 @@ queue: spin_unlock(&ctx->completion_lock); } -static void io_clean_op(struct io_kiocb *req) -{ - if (req->flags & REQ_F_BUFFER_SELECTED) { - spin_lock(&req->ctx->completion_lock); - io_put_kbuf_comp(req); - spin_unlock(&req->ctx->completion_lock); - } - - if (req->flags & REQ_F_NEED_CLEANUP) { - const struct io_cold_def *def = &io_cold_defs[req->opcode]; - - if (def->cleanup) - def->cleanup(req); - } - if ((req->flags & REQ_F_POLLED) && req->apoll) { - kfree(req->apoll->double_poll); - kfree(req->apoll); - req->apoll = NULL; - } - if (req->flags & REQ_F_INFLIGHT) { - struct io_uring_task *tctx = req->task->io_uring; - - atomic_dec(&tctx->inflight_tracked); - } - if (req->flags & REQ_F_CREDS) - put_cred(req->creds); - if (req->flags & REQ_F_ASYNC_DATA) { - kfree(req->async_data); - req->async_data = NULL; - } - req->flags &= ~IO_REQ_CLEAN_FLAGS; -} - static bool io_assign_file(struct io_kiocb *req, const struct io_issue_def *def, unsigned int issue_flags) { -- cgit v1.2.3 From 2fdd6fb5ff958a0f6b403e3f3ffd645b60b2a2b2 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 23 Jun 2023 12:23:25 +0100 Subject: io_uring: don't batch task put on reqs free We're trying to batch io_put_task() in io_free_batch_list(), but considering that the hot path is a simple inc, it's most cerainly and probably faster to just do io_put_task() instead of task tracking. We don't care about io_put_task_remote() as it's only for IOPOLL where polling/waiting is done by not the submitter task. 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/4a7ef7dce845fe2bd35507bf389d6bd2d5c1edf0.1687518903.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 32 ++++++++++---------------------- 1 file changed, 10 insertions(+), 22 deletions(-) (limited to 'io_uring/io_uring.c') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 4d8613996644..3eec5c761d0a 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -754,29 +754,29 @@ static void io_cqring_overflow_flush(struct io_ring_ctx *ctx) } /* can be called by any task */ -static void io_put_task_remote(struct task_struct *task, int nr) +static void io_put_task_remote(struct task_struct *task) { struct io_uring_task *tctx = task->io_uring; - percpu_counter_sub(&tctx->inflight, nr); + percpu_counter_sub(&tctx->inflight, 1); if (unlikely(atomic_read(&tctx->in_cancel))) wake_up(&tctx->wait); - put_task_struct_many(task, nr); + put_task_struct(task); } /* used by a task to put its own references */ -static void io_put_task_local(struct task_struct *task, int nr) +static void io_put_task_local(struct task_struct *task) { - task->io_uring->cached_refs += nr; + task->io_uring->cached_refs++; } /* must to be called somewhat shortly after putting a request */ -static inline void io_put_task(struct task_struct *task, int nr) +static inline void io_put_task(struct task_struct *task) { if (likely(task == current)) - io_put_task_local(task, nr); + io_put_task_local(task); else - io_put_task_remote(task, nr); + io_put_task_remote(task); } void io_task_refs_refill(struct io_uring_task *tctx) @@ -1033,7 +1033,7 @@ static void __io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) * we don't hold ->completion_lock. Clean them here to avoid * deadlocks. */ - io_put_task_remote(req->task, 1); + io_put_task_remote(req->task); wq_list_add_head(&req->comp_list, &ctx->locked_free_list); ctx->locked_free_nr++; } @@ -1518,9 +1518,6 @@ void io_queue_next(struct io_kiocb *req) void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node) __must_hold(&ctx->uring_lock) { - struct task_struct *task = NULL; - int task_refs = 0; - do { struct io_kiocb *req = container_of(node, struct io_kiocb, comp_list); @@ -1550,19 +1547,10 @@ void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node) io_req_put_rsrc_locked(req, ctx); - if (req->task != task) { - if (task) - io_put_task(task, task_refs); - task = req->task; - task_refs = 0; - } - task_refs++; + io_put_task(req->task); node = req->comp_list.next; io_req_add_to_cache(req, ctx); } while (node); - - if (task) - io_put_task(task, task_refs); } static void __io_submit_flush_completions(struct io_ring_ctx *ctx) -- cgit v1.2.3 From 91c7884ac9a92ffbf78af7fc89603daf24f448a9 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 23 Jun 2023 12:23:26 +0100 Subject: io_uring: remove IOU_F_TWQ_FORCE_NORMAL Extract a function for non-local task_work_add, and use it directly from io_move_task_work_from_local(). Now we don't use IOU_F_TWQ_FORCE_NORMAL and it can be killed. As a small positive side effect we don't grab task->io_uring in io_req_normal_work_add anymore, which is not needed for io_req_local_work_add(). 
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/2e55571e8ff2927ae3cc12da606d204e2485525b.1687518903.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 25 ++++++++++++++----------- io_uring/io_uring.h | 5 +---- 2 files changed, 15 insertions(+), 15 deletions(-) (limited to 'io_uring/io_uring.c') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 3eec5c761d0a..776d1aa73d26 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1317,7 +1317,7 @@ static __cold void io_fallback_tw(struct io_uring_task *tctx) } } -static void io_req_local_work_add(struct io_kiocb *req, unsigned flags) +static inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags) { struct io_ring_ctx *ctx = req->ctx; unsigned nr_wait, nr_tw, nr_tw_prev; @@ -1368,19 +1368,11 @@ static void io_req_local_work_add(struct io_kiocb *req, unsigned flags) wake_up_state(ctx->submitter_task, TASK_INTERRUPTIBLE); } -void __io_req_task_work_add(struct io_kiocb *req, unsigned flags) +static void io_req_normal_work_add(struct io_kiocb *req) { struct io_uring_task *tctx = req->task->io_uring; struct io_ring_ctx *ctx = req->ctx; - if (!(flags & IOU_F_TWQ_FORCE_NORMAL) && - (ctx->flags & IORING_SETUP_DEFER_TASKRUN)) { - rcu_read_lock(); - io_req_local_work_add(req, flags); - rcu_read_unlock(); - return; - } - /* task_work already pending, we're done */ if (!llist_add(&req->io_task_work.node, &tctx->task_list)) return; @@ -1394,6 +1386,17 @@ void __io_req_task_work_add(struct io_kiocb *req, unsigned flags) io_fallback_tw(tctx); } +void __io_req_task_work_add(struct io_kiocb *req, unsigned flags) +{ + if (req->ctx->flags & IORING_SETUP_DEFER_TASKRUN) { + rcu_read_lock(); + io_req_local_work_add(req, flags); + rcu_read_unlock(); + } else { + io_req_normal_work_add(req); + } +} + static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx) { struct llist_node *node; @@ -1404,7 +1407,7 @@ static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx) io_task_work.node); node = node->next; - __io_req_task_work_add(req, IOU_F_TWQ_FORCE_NORMAL); + io_req_normal_work_add(req); } } diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 9718897133db..20ba6df49b1f 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -16,9 +16,6 @@ #endif enum { - /* don't use deferred task_work */ - IOU_F_TWQ_FORCE_NORMAL = 1, - /* * A hint to not wake right away but delay until there are enough of * tw's queued to match the number of CQEs the task is waiting for. * Must not be used wirh requests generating more than one CQE. * It's also ignored unless IORING_SETUP_DEFER_TASKRUN is set. */ - IOU_F_TWQ_LAZY_WAKE = 2, + IOU_F_TWQ_LAZY_WAKE = 1, }; enum { -- cgit v1.2.3 From f432b76bcc93f36edb3d371f7b8d7881261dd6e7 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 23 Jun 2023 12:23:27 +0100 Subject: io_uring: kill io_cq_unlock() We're abusing ->completion_lock helpers. io_cq_unlock() neither locks conditionally nor does CQE flushing, which means that callers must have some other reason for taking the lock and should do it directly. Open code io_cq_unlock() into io_cqring_overflow_kill() and clean it up.
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/7dabb36856db2b562e78780480396c52c29b2bf4.1687518903.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'io_uring/io_uring.c') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 776d1aa73d26..2f55abb676c0 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -644,12 +644,6 @@ static inline void io_cq_lock(struct io_ring_ctx *ctx) spin_lock(&ctx->completion_lock); } -static inline void io_cq_unlock(struct io_ring_ctx *ctx) - __releases(ctx->completion_lock) -{ - spin_unlock(&ctx->completion_lock); -} - /* keep it inlined for io_submit_flush_completions() */ static inline void __io_cq_unlock_post(struct io_ring_ctx *ctx) __releases(ctx->completion_lock) @@ -694,10 +688,10 @@ static void io_cqring_overflow_kill(struct io_ring_ctx *ctx) struct io_overflow_cqe *ocqe; LIST_HEAD(list); - io_cq_lock(ctx); + spin_lock(&ctx->completion_lock); list_splice_init(&ctx->cq_overflow_list, &list); clear_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq); - io_cq_unlock(ctx); + spin_unlock(&ctx->completion_lock); while (!list_empty(&list)) { ocqe = list_first_entry(&list, struct io_overflow_cqe, list); -- cgit v1.2.3 From 55b6a69fed5df6f88ef0b2ace562b422162beb61 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 23 Jun 2023 12:23:28 +0100 Subject: io_uring: fix acquire/release annotations We do conditional locking, so __io_cq_lock() and friends do not always actually grab/release the lock; kill the misleading annotations. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/2a098f9144c24cab622f8bf90b39f44da5d0401e.1687518903.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'io_uring/io_uring.c') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 2f55abb676c0..8cb0f60d2885 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -626,7 +626,6 @@ void __io_commit_cqring_flush(struct io_ring_ctx *ctx) } static inline void __io_cq_lock(struct io_ring_ctx *ctx) - __acquires(ctx->completion_lock) { if (!ctx->task_complete) spin_lock(&ctx->completion_lock); @@ -646,7 +645,6 @@ static inline void io_cq_lock(struct io_ring_ctx *ctx) /* keep it inlined for io_submit_flush_completions() */ static inline void __io_cq_unlock_post(struct io_ring_ctx *ctx) - __releases(ctx->completion_lock) { io_commit_cqring(ctx); __io_cq_unlock(ctx); @@ -655,7 +653,6 @@ static inline void __io_cq_unlock_post(struct io_ring_ctx *ctx) } static void __io_cq_unlock_post_flush(struct io_ring_ctx *ctx) - __releases(ctx->completion_lock) { io_commit_cqring(ctx); -- cgit v1.2.3 From ff12617728fa5c7fb5325e164503ca4e936b80bd Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 23 Jun 2023 12:23:29 +0100 Subject: io_uring: inline __io_cq_unlock __io_cq_unlock is not very helpful, and users should be calling the flush variants anyway. Open code the function.
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/d875c4cfb69f38ccecb58a57111446c77a614caa.1687518903.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'io_uring/io_uring.c') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 8cb0f60d2885..39d83b631107 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -631,12 +631,6 @@ static inline void __io_cq_lock(struct io_ring_ctx *ctx) spin_lock(&ctx->completion_lock); } -static inline void __io_cq_unlock(struct io_ring_ctx *ctx) -{ - if (!ctx->task_complete) - spin_unlock(&ctx->completion_lock); -} - static inline void io_cq_lock(struct io_ring_ctx *ctx) __acquires(ctx->completion_lock) { @@ -647,7 +641,9 @@ static inline void io_cq_lock(struct io_ring_ctx *ctx) static inline void __io_cq_unlock_post(struct io_ring_ctx *ctx) { io_commit_cqring(ctx); - __io_cq_unlock(ctx); + if (!ctx->task_complete) + spin_unlock(&ctx->completion_lock); + io_commit_cqring_flush(ctx); io_cqring_wake(ctx); } @@ -664,7 +660,7 @@ static void __io_cq_unlock_post_flush(struct io_ring_ctx *ctx) */ io_commit_cqring_flush(ctx); } else { - __io_cq_unlock(ctx); + spin_unlock(&ctx->completion_lock); io_commit_cqring_flush(ctx); io_cqring_wake(ctx); } -- cgit v1.2.3 From 0fdb9a196c6728b51e0e7a4f6fa292d9fd5793de Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 23 Jun 2023 12:23:30 +0100 Subject: io_uring: make io_cq_unlock_post static io_cq_unlock_post() is exclusively used in io_uring/io_uring.c, so mark it static and don't expose it to other files. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/3dc8127dda4514e1dd24bb32035faac887c5fa37.1687518903.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 2 +- io_uring/io_uring.h | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'io_uring/io_uring.c') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 39d83b631107..70fffed83e95 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -666,7 +666,7 @@ static void __io_cq_unlock_post_flush(struct io_ring_ctx *ctx) } } -void io_cq_unlock_post(struct io_ring_ctx *ctx) +static void io_cq_unlock_post(struct io_ring_ctx *ctx) __releases(ctx->completion_lock) { io_commit_cqring(ctx); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 20ba6df49b1f..d3606d30cf6f 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -110,8 +110,6 @@ static inline void io_req_task_work_add(struct io_kiocb *req) #define io_for_each_link(pos, head) \ for (pos = (head); pos; pos = pos->link) -void io_cq_unlock_post(struct io_ring_ctx *ctx); - static inline struct io_uring_cqe *io_get_cqe_overflow(struct io_ring_ctx *ctx, bool overflow) { -- cgit v1.2.3 From c98c81a4ac37b651be7eb9d16f562fc4acc5f867 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 23 Jun 2023 12:23:31 +0100 Subject: io_uring: merge conditional unlock flush helpers There is no reason not to use __io_cq_unlock_post_flush for intermediate aux CQE flushing; all the ->task_complete reasoning applies there too, i.e. if it's set, the flushing is done by the submitter task. Combine them, get rid of __io_cq_unlock_post() and rename the remaining function. This place was also taking a couple of percent of CPU in profiles of max-throughput network benchmarks, due to multishot recv flooding it with completions.
Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/bbed60734cbec2e833d9c7bdcf9741aada5d8aab.1687518903.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) (limited to 'io_uring/io_uring.c') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 70fffed83e95..1b53a2ab0a27 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -637,18 +637,7 @@ static inline void io_cq_lock(struct io_ring_ctx *ctx) spin_lock(&ctx->completion_lock); } -/* keep it inlined for io_submit_flush_completions() */ static inline void __io_cq_unlock_post(struct io_ring_ctx *ctx) -{ - io_commit_cqring(ctx); - if (!ctx->task_complete) - spin_unlock(&ctx->completion_lock); - - io_commit_cqring_flush(ctx); - io_cqring_wake(ctx); -} - -static void __io_cq_unlock_post_flush(struct io_ring_ctx *ctx) { io_commit_cqring(ctx); @@ -1568,7 +1557,7 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx) } } } - __io_cq_unlock_post_flush(ctx); + __io_cq_unlock_post(ctx); if (!wq_list_empty(&ctx->submit_state.compl_reqs)) { io_free_batch_list(ctx, state->compl_reqs.first); -- cgit v1.2.3 From 10e1c0d59006c6492d380602aa0a6c4eb9441426 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 27 Jun 2023 11:57:53 -0600 Subject: io_uring: remove io_fallback_tw() forward declaration It's used just one function higher up, get rid of the declaration and just move it up a bit. Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) (limited to 'io_uring/io_uring.c') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 1b53a2ab0a27..f84d258ea348 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -149,7 +149,6 @@ static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, static void io_queue_sqe(struct io_kiocb *req); static void io_move_task_work_from_local(struct io_ring_ctx *ctx); static void __io_submit_flush_completions(struct io_ring_ctx *ctx); -static __cold void io_fallback_tw(struct io_uring_task *tctx); struct kmem_cache *req_cachep; @@ -1238,6 +1237,20 @@ static inline struct llist_node *io_llist_cmpxchg(struct llist_head *head, return cmpxchg(&head->first, old, new); } +static __cold void io_fallback_tw(struct io_uring_task *tctx) +{ + struct llist_node *node = llist_del_all(&tctx->task_list); + struct io_kiocb *req; + + while (node) { + req = container_of(node, struct io_kiocb, io_task_work.node); + node = node->next; + if (llist_add(&req->io_task_work.node, + &req->ctx->fallback_llist)) + schedule_delayed_work(&req->ctx->fallback_work, 1); + } +} + void tctx_task_work(struct callback_head *cb) { struct io_tw_state ts = {}; @@ -1279,20 +1292,6 @@ void tctx_task_work(struct callback_head *cb) trace_io_uring_task_work_run(tctx, count, loops); } -static __cold void io_fallback_tw(struct io_uring_task *tctx) -{ - struct llist_node *node = llist_del_all(&tctx->task_list); - struct io_kiocb *req; - - while (node) { - req = container_of(node, struct io_kiocb, io_task_work.node); - node = node->next; - if (llist_add(&req->io_task_work.node, - &req->ctx->fallback_llist)) - schedule_delayed_work(&req->ctx->fallback_work, 1); - } -} - static inline void io_req_local_work_add(struct io_kiocb *req, unsigned flags) { struct io_ring_ctx *ctx = req->ctx; -- cgit v1.2.3 From dfbe5561ae9339516a3742a3fbd678609ad59fd0 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 28 Jun 2023 11:06:05 -0600 Subject: io_uring: flush 
offloaded and delayed task_work on exit io_uring offloads task_work for cancelation purposes when the task is exiting. This is conceptually fine, but we should be nicer and actually wait for that work to complete before returning. Add an argument to io_fallback_tw() telling it to flush the deferred work when it's all queued up, and have it flush a ctx behind whenever the ctx changes. Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) (limited to 'io_uring/io_uring.c') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index f84d258ea348..e8096d502a7c 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1237,18 +1237,32 @@ static inline struct llist_node *io_llist_cmpxchg(struct llist_head *head, return cmpxchg(&head->first, old, new); } -static __cold void io_fallback_tw(struct io_uring_task *tctx) +static __cold void io_fallback_tw(struct io_uring_task *tctx, bool sync) { struct llist_node *node = llist_del_all(&tctx->task_list); + struct io_ring_ctx *last_ctx = NULL; struct io_kiocb *req; while (node) { req = container_of(node, struct io_kiocb, io_task_work.node); node = node->next; + if (sync && last_ctx != req->ctx) { + if (last_ctx) { + flush_delayed_work(&last_ctx->fallback_work); + percpu_ref_put(&last_ctx->refs); + } + last_ctx = req->ctx; + percpu_ref_get(&last_ctx->refs); + } if (llist_add(&req->io_task_work.node, &req->ctx->fallback_llist)) schedule_delayed_work(&req->ctx->fallback_work, 1); } + + if (last_ctx) { + flush_delayed_work(&last_ctx->fallback_work); + percpu_ref_put(&last_ctx->refs); + } } void tctx_task_work(struct callback_head *cb) @@ -1263,7 +1277,7 @@ void tctx_task_work(struct callback_head *cb) unsigned int count = 0; if (unlikely(current->flags & PF_EXITING)) { - io_fallback_tw(tctx); + io_fallback_tw(tctx, true); return; } @@ -1358,7 +1372,7 @@ static void io_req_normal_work_add(struct io_kiocb *req) if (likely(!task_work_add(req->task, &tctx->task_work, ctx->notify_method))) return; - io_fallback_tw(tctx); + io_fallback_tw(tctx, false); } void __io_req_task_work_add(struct io_kiocb *req, unsigned flags) @@ -3108,6 +3122,8 @@ static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) if (ctx->rings) io_kill_timeouts(ctx, NULL, true); + flush_delayed_work(&ctx->fallback_work); + INIT_WORK(&ctx->exit_work, io_ring_exit_work); /* * Use system_unbound_wq to avoid spawning tons of event kworkers -- cgit v1.2.3 From 8a796565cec3601071cbbd27d6304e202019d014 Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Fri, 7 Jul 2023 09:20:07 -0700 Subject: io_uring: Use io_schedule* in cqring wait I observed poor performance of io_uring compared to synchronous IO. That turns out to be caused by deeper CPU idle states entered with io_uring, due to io_uring using plain schedule(), whereas synchronous IO uses io_schedule(). The losses due to this are substantial. On my cascade lake workstation, t/io_uring from the fio repository e.g. yields regressions between 20% and 40% with the following command: ./t/io_uring -r 5 -X0 -d 1 -s 1 -c 1 -p 0 -S$use_sync -R 0 /mnt/t2/fio/write.0.0 This is repeatable with different filesystems, using raw block devices and using different block devices. Use io_schedule_prepare() / io_schedule_finish() in io_cqring_wait_schedule() to address the difference. After that using io_uring is on par or surpassing synchronous IO (using registered files etc makes it reliably win, but arguably is a less fair comparison). 
There are other calls to schedule() in io_uring/, but none immediately jump out to be similarly situated, so I did not touch them. Similarly, it's possible that mutex_lock_io() should be used, but it's not clear if there are cases where that matters. Cc: stable@vger.kernel.org # 5.10+ Cc: Pavel Begunkov Cc: io-uring@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Andres Freund Link: https://lore.kernel.org/r/20230707162007.194068-1-andres@anarazel.de [axboe: minor style fixup] Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'io_uring/io_uring.c') diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index e8096d502a7c..7505de2428e0 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2489,6 +2489,8 @@ int io_run_task_work_sig(struct io_ring_ctx *ctx) static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, struct io_wait_queue *iowq) { + int token, ret; + if (unlikely(READ_ONCE(ctx->check_cq))) return 1; if (unlikely(!llist_empty(&ctx->work_llist))) @@ -2499,11 +2501,20 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, return -EINTR; if (unlikely(io_should_wake(iowq))) return 0; + + /* + * Use io_schedule_prepare/finish, so cpufreq can take into account + * that the task is waiting for IO - turns out to be important for low + * QD IO. + */ + token = io_schedule_prepare(); + ret = 0; if (iowq->timeout == KTIME_MAX) schedule(); else if (!schedule_hrtimeout(&iowq->timeout, HRTIMER_MODE_ABS)) - return -ETIME; - return 0; + ret = -ETIME; + io_schedule_finish(token); + return ret; } /* -- cgit v1.2.3
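For context on the fix above: io_schedule_prepare() and io_schedule_finish() are the split form of io_schedule(). Prepare marks the task as waiting on IO and flushes any plugged block requests, returning the previous iowait state; finish restores that state. The split form is needed here because the sleep may be a schedule_hrtimeout() rather than a plain schedule(). A rough, simplified sketch of the pattern (illustrative only, not the actual kernel implementation):

/* roughly what io_schedule() does internally (simplified sketch) */
static void io_schedule_sketch(void)
{
	int token;

	token = io_schedule_prepare();	/* sets current->in_iowait, flushes the block plug */
	schedule();			/* cpuidle/cpufreq governors now see an IO waiter */
	io_schedule_finish(token);	/* restores the previous in_iowait state */
}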