diff options
Diffstat (limited to 'io_uring')
-rw-r--r-- | io_uring/io_uring.c | 3 | ||||
-rw-r--r-- | io_uring/kbuf.c | 1 | ||||
-rw-r--r-- | io_uring/kbuf.h | 1 | ||||
-rw-r--r-- | io_uring/msg_ring.c | 4 | ||||
-rw-r--r-- | io_uring/net.c | 46 | ||||
-rw-r--r-- | io_uring/opdef.c | 1 | ||||
-rw-r--r-- | io_uring/poll.c | 2 | ||||
-rw-r--r-- | io_uring/rsrc.c | 57 | ||||
-rw-r--r-- | io_uring/rsrc.h | 3 | ||||
-rw-r--r-- | io_uring/zcrx.c | 104 | ||||
-rw-r--r-- | io_uring/zcrx.h | 11 |
11 files changed, 146 insertions, 87 deletions
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 74218c7b7604..211f7f5f507e 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1647,11 +1647,12 @@ static void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags) io_req_flags_t io_file_get_flags(struct file *file) { + struct inode *inode = file_inode(file); io_req_flags_t res = 0; BUILD_BUG_ON(REQ_F_ISREG_BIT != REQ_F_SUPPORT_NOWAIT_BIT + 1); - if (S_ISREG(file_inode(file)->i_mode)) + if (S_ISREG(inode->i_mode) && !(inode->i_flags & S_ANON_INODE)) res |= REQ_F_ISREG; if ((file->f_flags & O_NONBLOCK) || (file->f_mode & FMODE_NOWAIT)) res |= REQ_F_SUPPORT_NOWAIT; diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index a8467f6aba54..6c676aedc9a2 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -271,6 +271,7 @@ static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg, if (len > arg->max_len) { len = arg->max_len; if (!(bl->flags & IOBL_INC)) { + arg->partial_map = 1; if (iov != arg->iovs) break; buf->len = len; diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index 2ec0b983ce24..6eb16ba6ed39 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -55,6 +55,7 @@ struct buf_sel_arg { size_t max_len; unsigned short nr_iovs; unsigned short mode; + unsigned short partial_map; }; void __user *io_buffer_select(struct io_kiocb *req, size_t *len, diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index 50a958e9c921..c73eca5943c0 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -82,7 +82,7 @@ static void io_msg_tw_complete(struct io_kiocb *req, io_tw_token_t tw) spin_unlock(&ctx->msg_lock); } if (req) - kmem_cache_free(req_cachep, req); + kfree_rcu(req, rcu_head); percpu_ref_put(&ctx->refs); } @@ -90,7 +90,7 @@ static int io_msg_remote_post(struct io_ring_ctx *ctx, struct io_kiocb *req, int res, u32 cflags, u64 user_data) { if (!READ_ONCE(ctx->submitter_task)) { - kmem_cache_free(req_cachep, req); + kfree_rcu(req, rcu_head); return -EOWNERDEAD; } req->opcode = IORING_OP_NOP; diff --git a/io_uring/net.c b/io_uring/net.c index 3feceb2b5b97..4dac01ff7ac7 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -76,12 +76,17 @@ struct io_sr_msg { u16 flags; /* initialised and used only by !msg send variants */ u16 buf_group; - bool retry; + unsigned short retry_flags; void __user *msg_control; /* used only for send zerocopy */ struct io_kiocb *notif; }; +enum sr_retry_flags { + IO_SR_MSG_RETRY = 1, + IO_SR_MSG_PARTIAL_MAP = 2, +}; + /* * Number of times we'll try and do receives if there's more data. If we * exceed this limit, then add us to the back of the queue and retry from @@ -188,7 +193,7 @@ static inline void io_mshot_prep_retry(struct io_kiocb *req, req->flags &= ~REQ_F_BL_EMPTY; sr->done_io = 0; - sr->retry = false; + sr->retry_flags = 0; sr->len = 0; /* get from the provided buffer */ req->buf_index = sr->buf_group; } @@ -401,7 +406,7 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); sr->done_io = 0; - sr->retry = false; + sr->retry_flags = 0; sr->len = READ_ONCE(sqe->len); sr->flags = READ_ONCE(sqe->ioprio); if (sr->flags & ~SENDMSG_FLAGS) @@ -759,7 +764,7 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); sr->done_io = 0; - sr->retry = false; + sr->retry_flags = 0; if (unlikely(sqe->file_index || sqe->addr2)) return -EINVAL; @@ -831,7 +836,7 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, cflags |= io_put_kbufs(req, this_ret, io_bundle_nbufs(kmsg, this_ret), issue_flags); - if (sr->retry) + if (sr->retry_flags & IO_SR_MSG_RETRY) cflags = req->cqe.flags | (cflags & CQE_F_MASK); /* bundle with no more immediate buffers, we're done */ if (req->flags & REQ_F_BL_EMPTY) @@ -840,12 +845,12 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, * If more is available AND it was a full transfer, retry and * append to this one */ - if (!sr->retry && kmsg->msg.msg_inq > 1 && this_ret > 0 && + if (!sr->retry_flags && kmsg->msg.msg_inq > 1 && this_ret > 0 && !iov_iter_count(&kmsg->msg.msg_iter)) { req->cqe.flags = cflags & ~CQE_F_MASK; sr->len = kmsg->msg.msg_inq; sr->done_io += this_ret; - sr->retry = true; + sr->retry_flags |= IO_SR_MSG_RETRY; return false; } } else { @@ -1084,6 +1089,14 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg if (unlikely(ret < 0)) return ret; + if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->vec.iovec) { + kmsg->vec.nr = ret; + kmsg->vec.iovec = arg.iovs; + req->flags |= REQ_F_NEED_CLEANUP; + } + if (arg.partial_map) + sr->retry_flags |= IO_SR_MSG_PARTIAL_MAP; + /* special case 1 vec, can be a fast path */ if (ret == 1) { sr->buf = arg.iovs[0].iov_base; @@ -1092,11 +1105,6 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg } iov_iter_init(&kmsg->msg.msg_iter, ITER_DEST, arg.iovs, ret, arg.out_len); - if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->vec.iovec) { - kmsg->vec.nr = ret; - kmsg->vec.iovec = arg.iovs; - req->flags |= REQ_F_NEED_CLEANUP; - } } else { void __user *buf; @@ -1284,7 +1292,7 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) int ret; zc->done_io = 0; - zc->retry = false; + zc->retry_flags = 0; if (unlikely(READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3))) return -EINVAL; @@ -1741,9 +1749,11 @@ int io_connect(struct io_kiocb *req, unsigned int issue_flags) int ret; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; - if (unlikely(req->flags & REQ_F_FAIL)) { - ret = -ECONNRESET; - goto out; + if (connect->in_progress) { + struct poll_table_struct pt = { ._key = EPOLLERR }; + + if (vfs_poll(req->file, &pt) & EPOLLERR) + goto get_sock_err; } file_flags = force_nonblock ? O_NONBLOCK : 0; @@ -1768,8 +1778,10 @@ int io_connect(struct io_kiocb *req, unsigned int issue_flags) * which means the previous result is good. For both of these, * grab the sock_error() and use that for the completion. */ - if (ret == -EBADFD || ret == -EISCONN) + if (ret == -EBADFD || ret == -EISCONN) { +get_sock_err: ret = sock_error(sock_from_file(req->file)->sk); + } } if (ret == -ERESTARTSYS) ret = -EINTR; diff --git a/io_uring/opdef.c b/io_uring/opdef.c index 489384c0438b..78ef5976bf00 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -216,6 +216,7 @@ const struct io_issue_def io_issue_defs[] = { }, [IORING_OP_FALLOCATE] = { .needs_file = 1, + .hash_reg_file = 1, .prep = io_fallocate_prep, .issue = io_fallocate, }, diff --git a/io_uring/poll.c b/io_uring/poll.c index 8eb744eb9f4c..ddafa88b9fbe 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -273,8 +273,6 @@ static int io_poll_check_events(struct io_kiocb *req, io_tw_token_t tw) return IOU_POLL_REISSUE; } } - if (unlikely(req->cqe.res & EPOLLERR)) - req_set_fail(req); if (req->apoll_events & EPOLLONESHOT) return IOU_POLL_DONE; diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 794d4ae6f0bc..7409af671c9e 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -80,10 +80,21 @@ static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) return 0; } -int io_buffer_validate(struct iovec *iov) +int io_validate_user_buf_range(u64 uaddr, u64 ulen) { - unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1); + unsigned long tmp, base = (unsigned long)uaddr; + unsigned long acct_len = (unsigned long)PAGE_ALIGN(ulen); + /* arbitrary limit, but we need something */ + if (ulen > SZ_1G || !ulen) + return -EFAULT; + if (check_add_overflow(base, acct_len, &tmp)) + return -EOVERFLOW; + return 0; +} + +static int io_buffer_validate(struct iovec *iov) +{ /* * Don't impose further limits on the size and buffer * constraints here, we'll -EINVAL later when IO is @@ -91,17 +102,9 @@ int io_buffer_validate(struct iovec *iov) */ if (!iov->iov_base) return iov->iov_len ? -EFAULT : 0; - if (!iov->iov_len) - return -EFAULT; - - /* arbitrary limit, but we need something */ - if (iov->iov_len > SZ_1G) - return -EFAULT; - if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp)) - return -EOVERFLOW; - - return 0; + return io_validate_user_buf_range((unsigned long)iov->iov_base, + iov->iov_len); } static void io_release_ubuf(void *priv) @@ -109,8 +112,11 @@ static void io_release_ubuf(void *priv) struct io_mapped_ubuf *imu = priv; unsigned int i; - for (i = 0; i < imu->nr_bvecs; i++) - unpin_user_page(imu->bvec[i].bv_page); + for (i = 0; i < imu->nr_bvecs; i++) { + struct folio *folio = page_folio(imu->bvec[i].bv_page); + + unpin_user_folio(folio, 1); + } } static struct io_mapped_ubuf *io_alloc_imu(struct io_ring_ctx *ctx, @@ -732,6 +738,7 @@ bool io_check_coalesce_buffer(struct page **page_array, int nr_pages, data->nr_pages_mid = folio_nr_pages(folio); data->folio_shift = folio_shift(folio); + data->first_folio_page_idx = folio_page_idx(folio, page_array[0]); /* * Check if pages are contiguous inside a folio, and all folios have @@ -825,7 +832,11 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, if (coalesced) imu->folio_shift = data.folio_shift; refcount_set(&imu->refs, 1); - off = (unsigned long) iov->iov_base & ((1UL << imu->folio_shift) - 1); + + off = (unsigned long)iov->iov_base & ~PAGE_MASK; + if (coalesced) + off += data.first_folio_page_idx << PAGE_SHIFT; + node->buf = imu; ret = 0; @@ -841,8 +852,10 @@ done: if (ret) { if (imu) io_free_imu(ctx, imu); - if (pages) - unpin_user_pages(pages, nr_pages); + if (pages) { + for (i = 0; i < nr_pages; i++) + unpin_user_folio(page_folio(pages[i]), 1); + } io_cache_free(&ctx->node_cache, node); node = ERR_PTR(ret); } @@ -1326,7 +1339,6 @@ static int io_vec_fill_bvec(int ddir, struct iov_iter *iter, { unsigned long folio_size = 1 << imu->folio_shift; unsigned long folio_mask = folio_size - 1; - u64 folio_addr = imu->ubuf & ~folio_mask; struct bio_vec *res_bvec = vec->bvec; size_t total_len = 0; unsigned bvec_idx = 0; @@ -1348,8 +1360,13 @@ static int io_vec_fill_bvec(int ddir, struct iov_iter *iter, if (unlikely(check_add_overflow(total_len, iov_len, &total_len))) return -EOVERFLOW; - /* by using folio address it also accounts for bvec offset */ - offset = buf_addr - folio_addr; + offset = buf_addr - imu->ubuf; + /* + * Only the first bvec can have non zero bv_offset, account it + * here and work with full folios below. + */ + offset += imu->bvec[0].bv_offset; + src_bvec = imu->bvec + (offset >> imu->folio_shift); offset &= folio_mask; diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index b52242852ff3..ba00ee6f0650 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -49,6 +49,7 @@ struct io_imu_folio_data { unsigned int nr_pages_mid; unsigned int folio_shift; unsigned int nr_folios; + unsigned long first_folio_page_idx; }; bool io_rsrc_cache_init(struct io_ring_ctx *ctx); @@ -83,7 +84,7 @@ int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, unsigned size, unsigned type); int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, unsigned int size, unsigned int type); -int io_buffer_validate(struct iovec *iov); +int io_validate_user_buf_range(u64 uaddr, u64 ulen); bool io_check_coalesce_buffer(struct page **page_array, int nr_pages, struct io_imu_folio_data *data); diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index fe86606b9f30..adb1e426987e 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -26,12 +26,61 @@ #include "zcrx.h" #include "rsrc.h" +#define IO_DMA_ATTR (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING) + static inline struct io_zcrx_ifq *io_pp_to_ifq(struct page_pool *pp) { return pp->mp_priv; } -#define IO_DMA_ATTR (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING) +static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *niov) +{ + struct net_iov_area *owner = net_iov_owner(niov); + + return container_of(owner, struct io_zcrx_area, nia); +} + +static inline struct page *io_zcrx_iov_page(const struct net_iov *niov) +{ + struct io_zcrx_area *area = io_zcrx_iov_to_area(niov); + + return area->mem.pages[net_iov_idx(niov)]; +} + +static void io_release_area_mem(struct io_zcrx_mem *mem) +{ + if (mem->pages) { + unpin_user_pages(mem->pages, mem->nr_folios); + kvfree(mem->pages); + } +} + +static int io_import_area(struct io_zcrx_ifq *ifq, + struct io_zcrx_mem *mem, + struct io_uring_zcrx_area_reg *area_reg) +{ + struct page **pages; + int nr_pages; + int ret; + + ret = io_validate_user_buf_range(area_reg->addr, area_reg->len); + if (ret) + return ret; + if (!area_reg->addr) + return -EFAULT; + if (area_reg->addr & ~PAGE_MASK || area_reg->len & ~PAGE_MASK) + return -EINVAL; + + pages = io_pin_pages((unsigned long)area_reg->addr, area_reg->len, + &nr_pages); + if (IS_ERR(pages)) + return PTR_ERR(pages); + + mem->pages = pages; + mem->nr_folios = nr_pages; + mem->size = area_reg->len; + return 0; +} static void __io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area, int nr_mapped) @@ -70,8 +119,8 @@ static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area) struct net_iov *niov = &area->nia.niovs[i]; dma_addr_t dma; - dma = dma_map_page_attrs(ifq->dev, area->pages[i], 0, PAGE_SIZE, - DMA_FROM_DEVICE, IO_DMA_ATTR); + dma = dma_map_page_attrs(ifq->dev, area->mem.pages[i], 0, + PAGE_SIZE, DMA_FROM_DEVICE, IO_DMA_ATTR); if (dma_mapping_error(ifq->dev, dma)) break; if (net_mp_niov_set_dma_addr(niov, dma)) { @@ -118,13 +167,6 @@ struct io_zcrx_args { static const struct memory_provider_ops io_uring_pp_zc_ops; -static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *niov) -{ - struct net_iov_area *owner = net_iov_owner(niov); - - return container_of(owner, struct io_zcrx_area, nia); -} - static inline atomic_t *io_get_user_counter(struct net_iov *niov) { struct io_zcrx_area *area = io_zcrx_iov_to_area(niov); @@ -147,13 +189,6 @@ static void io_zcrx_get_niov_uref(struct net_iov *niov) atomic_inc(io_get_user_counter(niov)); } -static inline struct page *io_zcrx_iov_page(const struct net_iov *niov) -{ - struct io_zcrx_area *area = io_zcrx_iov_to_area(niov); - - return area->pages[net_iov_idx(niov)]; -} - static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq, struct io_uring_zcrx_ifq_reg *reg, struct io_uring_region_desc *rd) @@ -187,15 +222,13 @@ static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq) static void io_zcrx_free_area(struct io_zcrx_area *area) { - io_zcrx_unmap_area(area->ifq, area); + if (area->ifq) + io_zcrx_unmap_area(area->ifq, area); + io_release_area_mem(&area->mem); kvfree(area->freelist); kvfree(area->nia.niovs); kvfree(area->user_refs); - if (area->pages) { - unpin_user_pages(area->pages, area->nr_folios); - kvfree(area->pages); - } kfree(area); } @@ -204,37 +237,27 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq, struct io_uring_zcrx_area_reg *area_reg) { struct io_zcrx_area *area; - int i, ret, nr_pages, nr_iovs; - struct iovec iov; + unsigned nr_iovs; + int i, ret; if (area_reg->flags || area_reg->rq_area_token) return -EINVAL; if (area_reg->__resv1 || area_reg->__resv2[0] || area_reg->__resv2[1]) return -EINVAL; - if (area_reg->addr & ~PAGE_MASK || area_reg->len & ~PAGE_MASK) - return -EINVAL; - - iov.iov_base = u64_to_user_ptr(area_reg->addr); - iov.iov_len = area_reg->len; - ret = io_buffer_validate(&iov); - if (ret) - return ret; ret = -ENOMEM; area = kzalloc(sizeof(*area), GFP_KERNEL); if (!area) goto err; - area->pages = io_pin_pages((unsigned long)area_reg->addr, area_reg->len, - &nr_pages); - if (IS_ERR(area->pages)) { - ret = PTR_ERR(area->pages); - area->pages = NULL; + ret = io_import_area(ifq, &area->mem, area_reg); + if (ret) goto err; - } - area->nr_folios = nr_iovs = nr_pages; + + nr_iovs = area->mem.size >> PAGE_SHIFT; area->nia.num_niovs = nr_iovs; + ret = -ENOMEM; area->nia.niovs = kvmalloc_array(nr_iovs, sizeof(area->nia.niovs[0]), GFP_KERNEL | __GFP_ZERO); if (!area->nia.niovs) @@ -653,10 +676,7 @@ static int io_pp_zc_init(struct page_pool *pp) static void io_pp_zc_destroy(struct page_pool *pp) { struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp); - struct io_zcrx_area *area = ifq->area; - if (WARN_ON_ONCE(area->free_count != area->nia.num_niovs)) - return; percpu_ref_put(&ifq->ctx->refs); } diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h index f2bc811f022c..64796c90851e 100644 --- a/io_uring/zcrx.h +++ b/io_uring/zcrx.h @@ -7,6 +7,13 @@ #include <net/page_pool/types.h> #include <net/net_trackers.h> +struct io_zcrx_mem { + unsigned long size; + + struct page **pages; + unsigned long nr_folios; +}; + struct io_zcrx_area { struct net_iov_area nia; struct io_zcrx_ifq *ifq; @@ -14,13 +21,13 @@ struct io_zcrx_area { bool is_mapped; u16 area_id; - struct page **pages; - unsigned long nr_folios; /* freelist */ spinlock_t freelist_lock ____cacheline_aligned_in_smp; u32 free_count; u32 *freelist; + + struct io_zcrx_mem mem; }; struct io_zcrx_ifq { |