From 7b3188e7ed54102a5dcc73d07727f41fb528f7c8 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Mon, 30 Aug 2021 19:37:41 -0600
Subject: io_uring: IORING_OP_WRITE needs hash_reg_file set

During some testing, it became evident that using IORING_OP_WRITE doesn't
hash buffered writes like the other write commands do. That's simply an
oversight, and it can cause performance regressions when doing buffered
writes with this command.

Correct that and add the flag, so that buffered writes are correctly
hashed when using the non-iovec based write command.

Cc: stable@vger.kernel.org
Fixes: 3a6820f2bb8a ("io_uring: add non-vectored read/write commands")
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs/io_uring.c')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 7cc458e0b636..473a977c7979 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -995,6 +995,7 @@ static const struct io_op_def io_op_defs[] = {
 	},
 	[IORING_OP_WRITE] = {
 		.needs_file		= 1,
+		.hash_reg_file		= 1,
 		.unbound_nonreg_file	= 1,
 		.pollout		= 1,
 		.plug			= 1,
--
cgit v1.2.3


From 7db304375e11741e5940f9bc549155035bfb4dc1 Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Sat, 21 Aug 2021 23:07:51 +0800
Subject: io_uring: retry in case of short read on block device

In case of buffered reading from a block device, when a short read happens
we should retry to read more; otherwise the IO will only be completed
partially. For example, the following fio job expects to read 2MB, but it
can read 1MB or fewer bytes:

    fio --name=onessd --filename=/dev/nvme0n1 --filesize=2M \
        --rw=randread --bs=2M --direct=0 --overwrite=0 --numjobs=1 \
        --iodepth=1 --time_based=0 --runtime=2 --ioengine=io_uring \
        --registerfiles --fixedbufs --gtod_reduce=1 --group_reporting

Fix the issue by allowing short read retries for block devices, which do
set FMODE_BUF_RASYNC.

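For illustration, here is a minimal userspace sketch of the same scenario,
not part of the patch: a single 2MB buffered, non-vectored read against a
block device via liburing. The device path, buffer handling, and sizes are
assumptions for the example; before this fix the completion could report
fewer bytes than requested, after it the kernel retries until the full
range is read.

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <liburing.h>

int main(void)
{
        const size_t len = 2 * 1024 * 1024;     /* 2MB, as in the fio job */
        struct io_uring ring;
        struct io_uring_sqe *sqe;
        struct io_uring_cqe *cqe;
        void *buf;
        int fd;

        fd = open("/dev/nvme0n1", O_RDONLY);    /* buffered: no O_DIRECT */
        if (fd < 0 || posix_memalign(&buf, 4096, len))
                return 1;

        io_uring_queue_init(4, &ring, 0);
        sqe = io_uring_get_sqe(&ring);
        io_uring_prep_read(sqe, fd, buf, len, 0);       /* IORING_OP_READ */
        io_uring_submit(&ring);
        io_uring_wait_cqe(&ring, &cqe);

        /* without the retry, cqe->res may be < len for a block device */
        printf("asked for %zu bytes, got %d\n", len, cqe->res);

        io_uring_cqe_seen(&ring, cqe);
        io_uring_queue_exit(&ring);
        return 0;
}
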
Fixes: 9a173346bd9e ("io_uring: fix short read retries for non-reg files")
Cc: Pavel Begunkov
Signed-off-by: Ming Lei
Reviewed-by: Pavel Begunkov
Link: https://lore.kernel.org/r/20210821150751.1290434-1-ming.lei@redhat.com
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'fs/io_uring.c')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 473a977c7979..364e77d5fe6c 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -3382,6 +3382,12 @@ static inline int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
 		return -EINVAL;
 }
 
+static bool need_read_all(struct io_kiocb *req)
+{
+	return req->flags & REQ_F_ISREG ||
+		S_ISBLK(file_inode(req->file)->i_mode);
+}
+
 static int io_read(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
@@ -3436,7 +3442,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags)
 	} else if (ret == -EIOCBQUEUED) {
 		goto out_free;
 	} else if (ret <= 0 || ret == io_size || !force_nonblock ||
-		   (req->flags & REQ_F_NOWAIT) || !(req->flags & REQ_F_ISREG)) {
+		   (req->flags & REQ_F_NOWAIT) || !need_read_all(req)) {
 		/* read all, failed, already did sync or don't want to retry */
 		goto done;
 	}
--
cgit v1.2.3


From c6d3d9cbd659de8f2176b4e4721149c88ac096d4 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Tue, 31 Aug 2021 14:13:10 +0100
Subject: io_uring: fix queueing half-created requests

[ 27.259845] general protection fault, probably for non-canonical address
	0xdffffc0000000005: 0000 [#1] SMP KASAN PTI
[ 27.261043] KASAN: null-ptr-deref in range [0x0000000000000028-0x000000000000002f]
[ 27.263730] RIP: 0010:sock_from_file+0x20/0x90
[ 27.272444] Call Trace:
[ 27.272736]  io_sendmsg+0x98/0x600
[ 27.279216]  io_issue_sqe+0x498/0x68d0
[ 27.281142]  __io_queue_sqe+0xab/0xb50
[ 27.285830]  io_req_task_submit+0xbf/0x1b0
[ 27.286306]  tctx_task_work+0x178/0xad0
[ 27.288211]  task_work_run+0xe2/0x190
[ 27.288571]  exit_to_user_mode_prepare+0x1a1/0x1b0
[ 27.289041]  syscall_exit_to_user_mode+0x19/0x50
[ 27.289521]  do_syscall_64+0x48/0x90
[ 27.289871]  entry_SYSCALL_64_after_hwframe+0x44/0xae

io_req_complete_failed() -> io_req_complete_post() -> io_req_task_queue()
would still try to enqueue a hard linked request, which can be only half
prepared (e.g. it failed init), so we can't allow that to happen.

Fixes: a8295b982c46d ("io_uring: fix failed linkchain code logic")
Reported-by: syzbot+f9704d1878e290eddf73@syzkaller.appspotmail.com
Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/70b513848c1000f88bd75965504649c6bb1415c0.1630415423.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

(limited to 'fs/io_uring.c')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 364e77d5fe6c..78f3d3ac2280 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1823,6 +1823,17 @@ static void io_req_complete_failed(struct io_kiocb *req, long res)
 	io_req_complete_post(req, res, 0);
 }
 
+static void io_req_complete_fail_submit(struct io_kiocb *req)
+{
+	/*
+	 * We don't submit, fail them all, for that replace hardlinks with
+	 * normal links. Extra REQ_F_LINK is tolerated.
+	 */
+	req->flags &= ~REQ_F_HARDLINK;
+	req->flags |= REQ_F_LINK;
+	io_req_complete_failed(req, req->result);
+}
+
 /*
  * Don't initialise the fields below on every allocation, but do that in
  * advance and keep them valid across allocations.
@@ -6723,7 +6734,7 @@ static inline void io_queue_sqe(struct io_kiocb *req)
 	if (likely(!(req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL)))) {
 		__io_queue_sqe(req);
 	} else if (req->flags & REQ_F_FAIL) {
-		io_req_complete_failed(req, req->result);
+		io_req_complete_fail_submit(req);
 	} else {
 		int ret = io_req_prep_async(req);
 
--
cgit v1.2.3


From b8ce1b9d25ccf81e1bbabd45b963ed98b2222df8 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Tue, 31 Aug 2021 14:13:11 +0100
Subject: io_uring: don't submit half-prepared drain request

[ 3784.910888] BUG: kernel NULL pointer dereference, address: 0000000000000020
[ 3784.910904] RIP: 0010:__io_file_supports_nowait+0x5/0xc0
[ 3784.910926] Call Trace:
[ 3784.910928]  ? io_read+0x17c/0x480
[ 3784.910945]  io_issue_sqe+0xcb/0x1840
[ 3784.910953]  __io_queue_sqe+0x44/0x300
[ 3784.910959]  io_req_task_submit+0x27/0x70
[ 3784.910962]  tctx_task_work+0xeb/0x1d0
[ 3784.910966]  task_work_run+0x61/0xa0
[ 3784.910968]  io_run_task_work_sig+0x53/0xa0
[ 3784.910975]  __x64_sys_io_uring_enter+0x22/0x30
[ 3784.910977]  do_syscall_64+0x3d/0x90
[ 3784.910981]  entry_SYSCALL_64_after_hwframe+0x44/0xae

io_drain_req() runs before the checks for REQ_F_FAIL, which protect us
from submitting an under-prepared request (e.g. one that failed in
io_init_req()). Fail such drained requests as well.

Fixes: a8295b982c46d ("io_uring: fix failed linkchain code logic")
Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/e411eb9924d47a131b1e200b26b675df0c2b7627.1630415423.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'fs/io_uring.c')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 78f3d3ac2280..4a5eb9e856f0 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -6238,6 +6238,11 @@ static bool io_drain_req(struct io_kiocb *req)
 	int ret;
 	u32 seq;
 
+	if (req->flags & REQ_F_FAIL) {
+		io_req_complete_fail_submit(req);
+		return true;
+	}
+
 	/*
 	 * If we need to drain a request in the middle of a link, drain the
 	 * head request and the next request/link after the current link.
--
cgit v1.2.3


From fa84693b3c896460831fe0750554121121a23da8 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Wed, 1 Sep 2021 14:15:59 -0600
Subject: io_uring: ensure IORING_REGISTER_IOWQ_MAX_WORKERS works with SQPOLL

SQPOLL has a different thread doing submissions, so we need to check for
that and use the right task context when updating the worker values. Just
hold the sqd->lock across the operation; this ensures that the thread
cannot go away while we poke at ->io_uring.

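For context, a hedged userspace sketch of what the liburing issue below
boils down to; it is not part of the patch and assumes a liburing recent
enough to provide io_uring_register_iowq_max_workers() and that SQPOLL is
permitted for the calling process:

#include <stdio.h>
#include <string.h>
#include <liburing.h>

int main(void)
{
        struct io_uring_params p;
        struct io_uring ring;
        /* [0] = bounded io-wq workers, [1] = unbounded io-wq workers */
        unsigned int vals[2] = { 4, 4 };
        int ret;

        memset(&p, 0, sizeof(p));
        p.flags = IORING_SETUP_SQPOLL;
        p.sq_thread_idle = 2000;

        ret = io_uring_queue_init_params(8, &ring, &p);
        if (ret < 0) {
                fprintf(stderr, "queue_init: %s\n", strerror(-ret));
                return 1;
        }

        /*
         * Before the fix this could fail or adjust the submitting task's
         * io-wq rather than the SQPOLL thread's; with it, the kernel takes
         * sqd->lock and updates the SQPOLL thread's ->io_uring context.
         */
        ret = io_uring_register_iowq_max_workers(&ring, vals);
        printf("register_iowq_max_workers: %d (bounded=%u, unbounded=%u)\n",
               ret, vals[0], vals[1]);

        io_uring_queue_exit(&ring);
        return 0;
}
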
Link: https://github.com/axboe/liburing/issues/420
Fixes: 2e480058ddc2 ("io-wq: provide a way to limit max number of workers")
Reported-by: Johannes Lundberg
Tested-by: Johannes Lundberg
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 28 ++++++++++++++++++++++++----
 1 file changed, 24 insertions(+), 4 deletions(-)

(limited to 'fs/io_uring.c')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 4a5eb9e856f0..4ad0d17dc92d 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -10323,26 +10323,46 @@ static int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
 static int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
 					void __user *arg)
 {
-	struct io_uring_task *tctx = current->io_uring;
+	struct io_uring_task *tctx = NULL;
+	struct io_sq_data *sqd = NULL;
 	__u32 new_count[2];
 	int i, ret;
 
-	if (!tctx || !tctx->io_wq)
-		return -EINVAL;
 	if (copy_from_user(new_count, arg, sizeof(new_count)))
 		return -EFAULT;
 	for (i = 0; i < ARRAY_SIZE(new_count); i++)
 		if (new_count[i] > INT_MAX)
 			return -EINVAL;
 
+	if (ctx->flags & IORING_SETUP_SQPOLL) {
+		sqd = ctx->sq_data;
+		if (sqd) {
+			mutex_lock(&sqd->lock);
+			tctx = sqd->thread->io_uring;
+		}
+	} else {
+		tctx = current->io_uring;
+	}
+
+	ret = -EINVAL;
+	if (!tctx || !tctx->io_wq)
+		goto err;
+
 	ret = io_wq_max_workers(tctx->io_wq, new_count);
 	if (ret)
-		return ret;
+		goto err;
+
+	if (sqd)
+		mutex_unlock(&sqd->lock);
 
 	if (copy_to_user(arg, new_count, sizeof(new_count)))
 		return -EFAULT;
 
 	return 0;
+err:
+	if (sqd)
+		mutex_unlock(&sqd->lock);
+	return ret;
 }
 
 static bool io_register_op_must_quiesce(int op)
--
cgit v1.2.3


From 636378535afb837f165beb7de3907896480cf3b2 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Thu, 2 Sep 2021 00:38:22 +0100
Subject: io_uring: don't disable kiocb_done() CQE batching

Not passing issue_flags from kiocb_done() into __io_complete_rw() means
that completion batching for this case is disabled, e.g. for most
buffered reads.

Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/b2689462835c3ee28a5999ef4f9a581e24be04a2.1630539342.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/io_uring.c')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 4ad0d17dc92d..9f3f8a802abd 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2656,7 +2656,7 @@ static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
 {
 	if (__io_complete_rw_common(req, res))
 		return;
-	__io_req_complete(req, 0, req->result, io_put_rw_kbuf(req));
+	__io_req_complete(req, issue_flags, req->result, io_put_rw_kbuf(req));
 }
 
 static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
--
cgit v1.2.3


From 8d4ad41e3e8e4b907f088f25aee4a92f3f864027 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov
Date: Thu, 2 Sep 2021 00:38:23 +0100
Subject: io_uring: prolong tctx_task_work() with flushing

io_submit_flush_completions() may enqueue linked requests for task_work
execution, so don't leave tctx_task_work() right after the tw list is
exhausted, but try to flush completions and then retry.

Signed-off-by: Pavel Begunkov
Link: https://lore.kernel.org/r/0755d4c2c36301447c63bdd4146c10477cea4249.1630539342.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs/io_uring.c')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 9f3f8a802abd..3a7145a38653 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2102,6 +2102,9 @@ static void tctx_task_work(struct callback_head *cb)
 	while (1) {
 		struct io_wq_work_node *node;
 
+		if (!tctx->task_list.first && locked && ctx->submit_state.compl_nr)
+			io_submit_flush_completions(ctx);
+
 		spin_lock_irq(&tctx->task_lock);
 		node = tctx->task_list.first;
 		INIT_WQ_LIST(&tctx->task_list);
--
cgit v1.2.3


From 31efe48eb5dc4de3e31e84b54f287e9665410ab3 Mon Sep 17 00:00:00 2001
From: Xiaoguang Wang
Date: Fri, 3 Sep 2021 22:24:36 +0800
Subject: io_uring: fix possible poll event lost in multi shot mode

IIUC, IORING_POLL_ADD_MULTI is similar to epoll's edge-triggered mode:
once a pure poll request returns one event (cqe), we'll need to read or
write continually until EAGAIN is returned. That means there is a possible
poll-event-lost race in multi-shot mode:

    t1   poll request add       |                        |
    t2                          |                        |
    t3   event happens          |                        |
    t4   task work add          |                        |
    t5                          | task work run          |
    t6                          | commit one cqe         |
    t7                          |                        | user app handles cqe
    t8                          | new event happen       |
    t9                          | add back to waitqueue  |
    t10                         |

If a new event happens after t6 but before t9, there will be no wakeup
operation, and if the user app has already picked up this cqe in t7 and
read or written until EAGAIN was returned, the new event arriving in t8
will be lost, though this race window may be small.

To fix this possible race, add the poll request back to the waitqueue
before committing the cqe.

Fixes: 88e41cf928a6 ("io_uring: add multishot mode for IORING_OP_POLL_ADD")
Signed-off-by: Xiaoguang Wang
Link: https://lore.kernel.org/r/20210903142436.5767-1-xiaoguang.wang@linux.alibaba.com
Signed-off-by: Jens Axboe
---
 fs/io_uring.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

(limited to 'fs/io_uring.c')

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 3a7145a38653..30d959416eba 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -5098,7 +5098,7 @@ static void io_poll_remove_double(struct io_kiocb *req)
 	}
 }
 
-static bool io_poll_complete(struct io_kiocb *req, __poll_t mask)
+static bool __io_poll_complete(struct io_kiocb *req, __poll_t mask)
 	__must_hold(&req->ctx->completion_lock)
 {
 	struct io_ring_ctx *ctx = req->ctx;
@@ -5120,10 +5120,19 @@ static bool io_poll_complete(struct io_kiocb *req, __poll_t mask)
 	if (flags & IORING_CQE_F_MORE)
 		ctx->cq_extra++;
 
-	io_commit_cqring(ctx);
 	return !(flags & IORING_CQE_F_MORE);
 }
 
+static inline bool io_poll_complete(struct io_kiocb *req, __poll_t mask)
+	__must_hold(&req->ctx->completion_lock)
+{
+	bool done;
+
+	done = __io_poll_complete(req, mask);
+	io_commit_cqring(req->ctx);
+	return done;
+}
+
 static void io_poll_task_func(struct io_kiocb *req, bool *locked)
 {
 	struct io_ring_ctx *ctx = req->ctx;
@@ -5134,7 +5143,7 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked)
 	} else {
 		bool done;
 
-		done = io_poll_complete(req, req->result);
+		done = __io_poll_complete(req, req->result);
 		if (done) {
 			io_poll_remove_double(req);
 			hash_del(&req->hash_node);
@@ -5142,6 +5151,7 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked)
 			req->result = 0;
 			add_wait_queue(req->poll.head, &req->poll.wait);
 		}
+		io_commit_cqring(ctx);
 		spin_unlock(&ctx->completion_lock);
 		io_cqring_ev_posted(ctx);
 
--
cgit v1.2.3
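
Finally, a hedged sketch, not from any of the patches above, of the
multi-shot poll usage pattern the race description assumes: after every
poll CQE the application drains the fd until EAGAIN and relies on the
kernel re-arming the poll entry. It assumes liburing provides
io_uring_prep_poll_multishot() and omits most error handling:

#include <poll.h>
#include <unistd.h>
#include <liburing.h>

/* edge-triggered style: keep reading until the fd would block */
static void drain(int fd)
{
        char buf[4096];

        while (read(fd, buf, sizeof(buf)) > 0)
                ;
}

int multishot_poll_loop(struct io_uring *ring, int fd)
{
        struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
        struct io_uring_cqe *cqe;
        int ret;

        io_uring_prep_poll_multishot(sqe, fd, POLLIN);
        io_uring_submit(ring);

        for (;;) {
                ret = io_uring_wait_cqe(ring, &cqe);
                if (ret < 0)
                        return ret;
                if (cqe->res < 0) {
                        ret = cqe->res;
                        io_uring_cqe_seen(ring, cqe);
                        return ret;
                }

                drain(fd);

                /*
                 * With the fix above, an event arriving while this CQE is
                 * being handled is not lost: the request is re-added to the
                 * waitqueue before the CQE is committed.  If F_MORE is
                 * clear, the multishot poll is finished and must be
                 * re-armed by the application.
                 */
                if (!(cqe->flags & IORING_CQE_F_MORE)) {
                        io_uring_cqe_seen(ring, cqe);
                        return 0;
                }
                io_uring_cqe_seen(ring, cqe);
        }
}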