Diffstat (limited to 'io_uring')
-rw-r--r--  io_uring/io-wq.c    |  4
-rw-r--r--  io_uring/io_uring.c |  2
-rw-r--r--  io_uring/kbuf.c     |  8
-rw-r--r--  io_uring/kbuf.h     |  1
-rw-r--r--  io_uring/net.c      | 58
-rw-r--r--  io_uring/opdef.c    |  1
-rw-r--r--  io_uring/poll.c     |  2
-rw-r--r--  io_uring/rsrc.c     | 23
-rw-r--r--  io_uring/rsrc.h     |  1
-rw-r--r--  io_uring/rw.c       |  2
-rw-r--r--  io_uring/sqpoll.c   |  5
11 files changed, 78 insertions(+), 29 deletions(-)
diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c
index a2d577b09930..8f555c1d7185 100644
--- a/io_uring/io-wq.c
+++ b/io_uring/io-wq.c
@@ -1204,8 +1204,10 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
 	atomic_set(&wq->worker_refs, 1);
 	init_completion(&wq->worker_done);
 	ret = cpuhp_state_add_instance_nocalls(io_wq_online, &wq->cpuhp_node);
-	if (ret)
+	if (ret) {
+		put_task_struct(wq->task);
 		goto err;
+	}
 
 	return wq;
 err:
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 64870f51b678..52ada466bf98 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -1681,7 +1681,7 @@ queue:
 	spin_unlock(&ctx->completion_lock);
 
 	io_prep_async_link(req);
-	de = kmalloc(sizeof(*de), GFP_KERNEL);
+	de = kmalloc(sizeof(*de), GFP_KERNEL_ACCOUNT);
 	if (!de) {
 		ret = -ENOMEM;
 		io_req_defer_failed(req, ret);
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 7a8c3a004800..9bd27deeee6f 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -262,8 +262,12 @@ static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg,
 		/* truncate end piece, if needed, for non partial buffers */
 		if (len > arg->max_len) {
 			len = arg->max_len;
-			if (!(bl->flags & IOBL_INC))
+			if (!(bl->flags & IOBL_INC)) {
+				arg->partial_map = 1;
+				if (iov != arg->iovs)
+					break;
 				buf->len = len;
+			}
 		}
 
 		iov->iov_base = u64_to_user_ptr(buf->addr);
@@ -728,7 +732,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
 		io_destroy_bl(ctx, bl);
 	}
 
-	free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL);
+	free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL_ACCOUNT);
 	if (!bl)
 		return -ENOMEM;
 
diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h
index 36aadfe5ac00..2586a292dfb9 100644
--- a/io_uring/kbuf.h
+++ b/io_uring/kbuf.h
@@ -61,6 +61,7 @@ struct buf_sel_arg {
 	size_t max_len;
 	unsigned short nr_iovs;
 	unsigned short mode;
+	unsigned short partial_map;
 };
 
 void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
diff --git a/io_uring/net.c b/io_uring/net.c
index 384915d931b7..356f95c33aa2 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -76,12 +76,18 @@ struct io_sr_msg {
 	/* initialised and used only by !msg send variants */
 	u16				addr_len;
 	u16				buf_group;
+	unsigned short			retry_flags;
 	void __user			*addr;
 	void __user			*msg_control;
 	/* used only for send zerocopy */
 	struct io_kiocb			*notif;
 };
 
+enum sr_retry_flags {
+	IO_SR_MSG_RETRY		= 1,
+	IO_SR_MSG_PARTIAL_MAP	= 2,
+};
+
 /*
  * Number of times we'll try and do receives if there's more data. If we
  * exceed this limit, then add us to the back of the queue and retry from
@@ -203,6 +209,7 @@ static inline void io_mshot_prep_retry(struct io_kiocb *req,
 	req->flags &= ~REQ_F_BL_EMPTY;
 	sr->done_io = 0;
+	sr->retry_flags = 0;
 	sr->len = 0; /* get from the provided buffer */
 	req->buf_index = sr->buf_group;
 }
 
@@ -409,6 +416,7 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
 
 	sr->done_io = 0;
+	sr->retry_flags = 0;
 
 	if (req->opcode == IORING_OP_SEND) {
 		if (READ_ONCE(sqe->__pad3[0]))
@@ -780,6 +788,7 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
 
 	sr->done_io = 0;
+	sr->retry_flags = 0;
 
 	if (unlikely(sqe->file_index || sqe->addr2))
 		return -EINVAL;
@@ -828,6 +837,9 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	return io_recvmsg_prep_setup(req);
 }
 
+/* bits to clear in old and inherit in new cflags on bundle retry */
+#define CQE_F_MASK	(IORING_CQE_F_SOCK_NONEMPTY|IORING_CQE_F_MORE)
+
 /*
  * Finishes io_recv and io_recvmsg.
  *
@@ -845,11 +857,27 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
 		cflags |= IORING_CQE_F_SOCK_NONEMPTY;
 
 	if (sr->flags & IORING_RECVSEND_BUNDLE) {
-		cflags |= io_put_kbufs(req, *ret, io_bundle_nbufs(kmsg, *ret),
+		size_t this_ret = *ret - sr->done_io;
+
+		cflags |= io_put_kbufs(req, this_ret, io_bundle_nbufs(kmsg, this_ret),
 				      issue_flags);
+		if (sr->retry_flags & IO_SR_MSG_RETRY)
+			cflags = req->cqe.flags | (cflags & CQE_F_MASK);
 		/* bundle with no more immediate buffers, we're done */
 		if (req->flags & REQ_F_BL_EMPTY)
 			goto finish;
+		/*
+		 * If more is available AND it was a full transfer, retry and
+		 * append to this one
+		 */
+		if (!sr->retry_flags && kmsg->msg.msg_inq > 1 && this_ret > 0 &&
+		    !iov_iter_count(&kmsg->msg.msg_iter)) {
+			req->cqe.flags = cflags & ~CQE_F_MASK;
+			sr->len = kmsg->msg.msg_inq;
+			sr->done_io += this_ret;
+			sr->retry_flags |= IO_SR_MSG_RETRY;
+			return false;
+		}
 	} else {
 		cflags |= io_put_kbuf(req, *ret, issue_flags);
 	}
@@ -1088,13 +1116,21 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg,
 			arg.mode |= KBUF_MODE_FREE;
 		}
 
-		if (kmsg->msg.msg_inq > 0)
+		if (kmsg->msg.msg_inq > 1)
 			arg.max_len = min_not_zero(sr->len, kmsg->msg.msg_inq);
 
 		ret = io_buffers_peek(req, &arg);
 		if (unlikely(ret < 0))
 			return ret;
 
+		if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->free_iov) {
+			kmsg->free_iov_nr = ret;
+			kmsg->free_iov = arg.iovs;
+			req->flags |= REQ_F_NEED_CLEANUP;
+		}
+		if (arg.partial_map)
+			sr->retry_flags |= IO_SR_MSG_PARTIAL_MAP;
+
 		/* special case 1 vec, can be a fast path */
 		if (ret == 1) {
 			sr->buf = arg.iovs[0].iov_base;
@@ -1103,11 +1139,6 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg,
 		}
 		iov_iter_init(&kmsg->msg.msg_iter, ITER_DEST, arg.iovs, ret,
 				arg.out_len);
-		if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->free_iov) {
-			kmsg->free_iov_nr = ret;
-			kmsg->free_iov = arg.iovs;
-			req->flags |= REQ_F_NEED_CLEANUP;
-		}
 	} else {
 		void __user *buf;
 
@@ -1228,6 +1259,7 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	struct io_kiocb *notif;
 
 	zc->done_io = 0;
+	zc->retry_flags = 0;
 	req->flags |= REQ_F_POLL_NO_LAZY;
 
 	if (unlikely(READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3)))
@@ -1703,9 +1735,11 @@ int io_connect(struct io_kiocb *req, unsigned int issue_flags)
 	int ret;
 	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
 
-	if (unlikely(req->flags & REQ_F_FAIL)) {
-		ret = -ECONNRESET;
-		goto out;
+	if (connect->in_progress) {
+		struct poll_table_struct pt = { ._key = EPOLLERR };
+
+		if (vfs_poll(req->file, &pt) & EPOLLERR)
+			goto get_sock_err;
 	}
 
 	file_flags = force_nonblock ? O_NONBLOCK : 0;
@@ -1730,8 +1764,10 @@ int io_connect(struct io_kiocb *req, unsigned int issue_flags)
 		 * which means the previous result is good. For both of these,
 		 * grab the sock_error() and use that for the completion.
 		 */
-		if (ret == -EBADFD || ret == -EISCONN)
+		if (ret == -EBADFD || ret == -EISCONN) {
+get_sock_err:
 			ret = sock_error(sock_from_file(req->file)->sk);
+		}
 	}
 	if (ret == -ERESTARTSYS)
 		ret = -EINTR;
diff --git a/io_uring/opdef.c b/io_uring/opdef.c
index a2be3bbca5ff..5dc1cba158a0 100644
--- a/io_uring/opdef.c
+++ b/io_uring/opdef.c
@@ -214,6 +214,7 @@ const struct io_issue_def io_issue_defs[] = {
 	},
 	[IORING_OP_FALLOCATE] = {
 		.needs_file		= 1,
+		.hash_reg_file		= 1,
 		.prep			= io_fallocate_prep,
 		.issue			= io_fallocate,
 	},
diff --git a/io_uring/poll.c b/io_uring/poll.c
index b93e9ebdd87c..17dea8aa09c9 100644
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -315,8 +315,6 @@ static int io_poll_check_events(struct io_kiocb *req, struct io_tw_state *ts)
 				return IOU_POLL_REISSUE;
 			}
 		}
-		if (unlikely(req->cqe.res & EPOLLERR))
-			req_set_fail(req);
 		if (req->apoll_events & EPOLLONESHOT)
 			return IOU_POLL_DONE;
 
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index a67bae350416..1687e35e21c9 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -119,8 +119,11 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot)
 	if (imu != &dummy_ubuf) {
 		if (!refcount_dec_and_test(&imu->refs))
 			return;
-		for (i = 0; i < imu->nr_bvecs; i++)
-			unpin_user_page(imu->bvec[i].bv_page);
+		for (i = 0; i < imu->nr_bvecs; i++) {
+			struct folio *folio = page_folio(imu->bvec[i].bv_page);
+
+			unpin_user_folio(folio, 1);
+		}
 		if (imu->acct_pages)
 			io_unaccount_mem(ctx, imu->acct_pages);
 		kvfree(imu);
@@ -915,6 +918,7 @@ static bool io_try_coalesce_buffer(struct page ***pages, int *nr_pages,
 		return false;
 
 	data->folio_shift = folio_shift(folio);
+	data->first_folio_page_idx = folio_page_idx(folio, page_array[0]);
 	/*
 	 * Check if pages are contiguous inside a folio, and all folios have
 	 * the same page count except for the head and tail.
@@ -983,10 +987,8 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 		goto done;
 
 	ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage);
-	if (ret) {
-		unpin_user_pages(pages, nr_pages);
+	if (ret)
 		goto done;
-	}
 
 	size = iov->iov_len;
 	/* store original address for later verification */
@@ -997,7 +999,9 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 	if (coalesced)
 		imu->folio_shift = data.folio_shift;
 	refcount_set(&imu->refs, 1);
-	off = (unsigned long) iov->iov_base & ((1UL << imu->folio_shift) - 1);
+	off = (unsigned long)iov->iov_base & ~PAGE_MASK;
+	if (coalesced)
+		off += data.first_folio_page_idx << PAGE_SHIFT;
 	*pimu = imu;
 	ret = 0;
 
@@ -1010,8 +1014,13 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 		size -= vec_len;
 	}
 done:
-	if (ret)
+	if (ret) {
 		kvfree(imu);
+		if (pages) {
+			for (i = 0; i < nr_pages; i++)
+				unpin_user_folio(page_folio(pages[i]), 1);
+		}
+	}
 	kvfree(pages);
 	return ret;
 }
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index 8ed588036210..459cf4c6e856 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -56,6 +56,7 @@ struct io_imu_folio_data {
 	/* For non-head/tail folios, has to be fully included */
 	unsigned int	nr_pages_mid;
 	unsigned int	folio_shift;
+	unsigned long	first_folio_page_idx;
 };
 
 void io_rsrc_node_ref_zero(struct io_rsrc_node *node);
diff --git a/io_uring/rw.c b/io_uring/rw.c
index a1ed64760eba..3ad104cf1e7d 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -282,7 +282,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 
 	rw->addr = READ_ONCE(sqe->addr);
 	rw->len = READ_ONCE(sqe->len);
-	rw->flags = READ_ONCE(sqe->rw_flags);
+	rw->flags = (__force rwf_t) READ_ONCE(sqe->rw_flags);
 	return io_prep_rw_setup(req, ddir, do_import);
 }
 
diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c
index 9a6306894895..2faa3058b2d0 100644
--- a/io_uring/sqpoll.c
+++ b/io_uring/sqpoll.c
@@ -426,7 +426,6 @@ void io_sqpoll_wait_sq(struct io_ring_ctx *ctx)
 __cold int io_sq_offload_create(struct io_ring_ctx *ctx,
 				struct io_uring_params *p)
 {
-	struct task_struct *task_to_put = NULL;
 	int ret;
 
 	/* Retain compatibility with failing for an invalid attach attempt */
@@ -510,7 +509,7 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx,
 		rcu_assign_pointer(sqd->thread, tsk);
 		mutex_unlock(&sqd->lock);
 
-		task_to_put = get_task_struct(tsk);
+		get_task_struct(tsk);
 		ret = io_uring_alloc_task_context(tsk, ctx);
 		wake_up_new_task(tsk);
 		if (ret)
@@ -525,8 +524,6 @@ err_sqpoll:
 	complete(&ctx->sq_data->exited);
 err:
 	io_sq_thread_finish(ctx);
-	if (task_to_put)
-		put_task_struct(task_to_put);
 	return ret;
 }
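The io_uring.c and kbuf.c hunks switch user-triggerable allocations from GFP_KERNEL to GFP_KERNEL_ACCOUNT so they are charged to the caller's memory cgroup. A minimal kernel-style sketch of the pattern; the foo_entry structure is hypothetical:

	/*
	 * Any allocation userspace can force in unbounded numbers
	 * (deferred requests, provided buffer lists) should be charged
	 * to the memcg. Illustrative fragment, not part of the commit.
	 */
	#include <linux/slab.h>
	#include <linux/list.h>

	struct foo_entry {
		struct list_head list;
	};

	static struct foo_entry *foo_alloc(void)
	{
		/* GFP_KERNEL_ACCOUNT = GFP_KERNEL | __GFP_ACCOUNT */
		return kmalloc(sizeof(struct foo_entry), GFP_KERNEL_ACCOUNT);
	}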
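The bulk of the net.c changes makes bundled receives retry only after a full transfer and folds the retried pieces into a single CQE (the CQE_F_MASK handling in io_recv_finish()). For orientation, a minimal userspace sketch of the feature being fixed: a bundled recv against a provided-buffer ring. This assumes liburing 2.6+ and a kernel exposing IORING_RECVSEND_BUNDLE; sockfd is a connected socket and error handling is elided:

	#include <liburing.h>

	#define NBUFS	8
	#define BUFLEN	4096
	#define BGID	0

	static int bundle_recv(int sockfd)
	{
		struct io_uring ring;
		struct io_uring_buf_ring *br;
		struct io_uring_sqe *sqe;
		struct io_uring_cqe *cqe;
		static char bufs[NBUFS][BUFLEN];
		int i, ret;

		io_uring_queue_init(8, &ring, 0);
		br = io_uring_setup_buf_ring(&ring, NBUFS, BGID, 0, &ret);

		/* publish the provided buffers to the kernel */
		for (i = 0; i < NBUFS; i++)
			io_uring_buf_ring_add(br, bufs[i], BUFLEN, i,
					      io_uring_buf_ring_mask(NBUFS), i);
		io_uring_buf_ring_advance(br, NBUFS);

		/* one recv that may consume several buffers at once */
		sqe = io_uring_get_sqe(&ring);
		io_uring_prep_recv(sqe, sockfd, NULL, 0, 0);
		sqe->flags |= IOSQE_BUFFER_SELECT;
		sqe->buf_group = BGID;
		sqe->ioprio |= IORING_RECVSEND_BUNDLE;

		io_uring_submit(&ring);
		io_uring_wait_cqe(&ring, &cqe);
		/*
		 * cqe->res is the total byte count across all buffers in
		 * the bundle; the first buffer id is carried in cqe->flags.
		 * With the fix above, a retried bundle posts one CQE for
		 * the combined transfer instead of miscounting buffers.
		 */
		ret = cqe->res;
		io_uring_cqe_seen(&ring, cqe);
		io_uring_queue_exit(&ring);
		return ret;
	}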
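The io_connect() hunks replace the REQ_F_FAIL/-ECONNRESET shortcut with a vfs_poll() probe for EPOLLERR followed by sock_error(), the in-kernel counterpart of the long-standing userspace idiom for finishing a non-blocking connect(2). A sketch of that userspace idiom, for comparison:

	#include <errno.h>
	#include <poll.h>
	#include <sys/socket.h>

	/* call after connect() on a non-blocking socket returned EINPROGRESS */
	static int connect_result(int sockfd)
	{
		struct pollfd pfd = { .fd = sockfd, .events = POLLOUT };
		int err = 0;
		socklen_t len = sizeof(err);

		/* POLLERR is reported even though we only asked for POLLOUT */
		if (poll(&pfd, 1, -1) < 0)
			return -errno;
		/* fetch the real result, like the kernel's sock_error() */
		if (getsockopt(sockfd, SOL_SOCKET, SO_ERROR, &err, &len) < 0)
			return -errno;
		return -err;	/* 0 on success, -ECONNREFUSED etc. on failure */
	}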
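The rsrc.c offset fix matters when a registered buffer is coalesced into a single large-folio bvec: as the hunk suggests, only the low PAGE_SHIFT bits of the user address are meaningful, since the mapping need not be folio-aligned; the page's index inside the folio has to come from folio_page_idx(). A standalone worked example with hypothetical values, showing where the two formulas diverge:

	#include <stdio.h>

	#define PAGE_SHIFT	12
	#define PAGE_MASK	(~((1UL << PAGE_SHIFT) - 1))

	int main(void)
	{
		unsigned long iov_base = 0x7f1234005123UL; /* hypothetical user VA */
		unsigned int folio_shift = 21;             /* 2M folio */
		unsigned long first_folio_page_idx = 3;    /* pinning starts at page 3 */

		/* old: trusts the VA's low folio_shift bits; wrong when the
		 * mapping is not folio-aligned (here it yields 0x5123) */
		unsigned long old_off = iov_base & ((1UL << folio_shift) - 1);

		/* new: in-page offset from the VA, page index from the folio
		 * (here 0x123 + (3 << 12) = 0x3123) */
		unsigned long new_off = (iov_base & ~PAGE_MASK) +
					(first_folio_page_idx << PAGE_SHIFT);

		printf("old off = %#lx, new off = %#lx\n", old_off, new_off);
		return 0;
	}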