Diffstat (limited to 'io_uring/rsrc.c')
-rw-r--r-- | io_uring/rsrc.c | 200
1 file changed, 110 insertions, 90 deletions
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 5e64a8bb30a4..c592ceace97d 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -80,10 +80,21 @@ static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
 	return 0;
 }
 
-int io_buffer_validate(struct iovec *iov)
+int io_validate_user_buf_range(u64 uaddr, u64 ulen)
 {
-	unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);
+	unsigned long tmp, base = (unsigned long)uaddr;
+	unsigned long acct_len = (unsigned long)PAGE_ALIGN(ulen);
 
+	/* arbitrary limit, but we need something */
+	if (ulen > SZ_1G || !ulen)
+		return -EFAULT;
+	if (check_add_overflow(base, acct_len, &tmp))
+		return -EOVERFLOW;
+	return 0;
+}
+
+static int io_buffer_validate(struct iovec *iov)
+{
 	/*
 	 * Don't impose further limits on the size and buffer
 	 * constraints here, we'll -EINVAL later when IO is
@@ -91,17 +102,9 @@ int io_buffer_validate(struct iovec *iov)
 	 */
 	if (!iov->iov_base)
 		return iov->iov_len ? -EFAULT : 0;
-	if (!iov->iov_len)
-		return -EFAULT;
-
-	/* arbitrary limit, but we need something */
-	if (iov->iov_len > SZ_1G)
-		return -EFAULT;
-
-	if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp))
-		return -EOVERFLOW;
-
-	return 0;
+
+	return io_validate_user_buf_range((unsigned long)iov->iov_base,
+					  iov->iov_len);
 }
 
 static void io_release_ubuf(void *priv)
@@ -175,6 +178,18 @@ void io_rsrc_cache_free(struct io_ring_ctx *ctx)
 	io_alloc_cache_free(&ctx->imu_cache, kfree);
 }
 
+static void io_clear_table_tags(struct io_rsrc_data *data)
+{
+	int i;
+
+	for (i = 0; i < data->nr; i++) {
+		struct io_rsrc_node *node = data->nodes[i];
+
+		if (node)
+			node->tag = 0;
+	}
+}
+
 __cold void io_rsrc_data_free(struct io_ring_ctx *ctx,
 			      struct io_rsrc_data *data)
 {
@@ -485,7 +500,7 @@ int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
 	if (ret < 0)
 		req_set_fail(req);
 	io_req_set_res(req, ret, 0);
-	return IOU_OK;
+	return IOU_COMPLETE;
 }
 
 void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
@@ -583,6 +598,7 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
 	io_file_table_set_alloc_range(ctx, 0, ctx->file_table.data.nr);
 	return 0;
 fail:
+	io_clear_table_tags(&ctx->file_table.data);
 	io_sqe_files_unregister(ctx);
 	return ret;
 }
@@ -672,38 +688,34 @@ static bool io_coalesce_buffer(struct page ***pages, int *nr_pages,
 				struct io_imu_folio_data *data)
 {
 	struct page **page_array = *pages, **new_array = NULL;
-	int nr_pages_left = *nr_pages, i, j;
-	int nr_folios = data->nr_folios;
+	unsigned nr_pages_left = *nr_pages;
+	unsigned nr_folios = data->nr_folios;
+	unsigned i, j;
 
 	/* Store head pages only*/
-	new_array = kvmalloc_array(nr_folios, sizeof(struct page *),
-				   GFP_KERNEL);
+	new_array = kvmalloc_array(nr_folios, sizeof(struct page *), GFP_KERNEL);
 	if (!new_array)
 		return false;
 
-	new_array[0] = compound_head(page_array[0]);
-	/*
-	 * The pages are bound to the folio, it doesn't
-	 * actually unpin them but drops all but one reference,
-	 * which is usually put down by io_buffer_unmap().
-	 * Note, needs a better helper.
-	 */
-	if (data->nr_pages_head > 1)
-		unpin_user_pages(&page_array[1], data->nr_pages_head - 1);
-
-	j = data->nr_pages_head;
-	nr_pages_left -= data->nr_pages_head;
-	for (i = 1; i < nr_folios; i++) {
-		unsigned int nr_unpin;
-
-		new_array[i] = page_array[j];
-		nr_unpin = min_t(unsigned int, nr_pages_left - 1,
-				data->nr_pages_mid - 1);
-		if (nr_unpin)
-			unpin_user_pages(&page_array[j+1], nr_unpin);
-		j += data->nr_pages_mid;
-		nr_pages_left -= data->nr_pages_mid;
+	for (i = 0, j = 0; i < nr_folios; i++) {
+		struct page *p = compound_head(page_array[j]);
+		struct folio *folio = page_folio(p);
+		unsigned int nr;
+
+		WARN_ON_ONCE(i > 0 && p != page_array[j]);
+
+		nr = i ? data->nr_pages_mid : data->nr_pages_head;
+		nr = min(nr, nr_pages_left);
+		/* Drop all but one ref, the entire folio will remain pinned. */
+		if (nr > 1)
+			unpin_user_folio(folio, nr - 1);
+		j += nr;
+		nr_pages_left -= nr;
+		new_array[i] = p;
 	}
+
+	WARN_ON_ONCE(j != *nr_pages);
+
 	kvfree(page_array);
 	*pages = new_array;
 	*nr_pages = nr_folios;
@@ -902,8 +914,10 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
 	}
 
 	ctx->buf_table = data;
-	if (ret)
+	if (ret) {
+		io_clear_table_tags(&ctx->buf_table);
 		io_sqe_buffers_unregister(ctx);
+	}
 	return ret;
 }
 
@@ -1017,71 +1031,71 @@ static int validate_fixed_range(u64 buf_addr, size_t len,
 	return 0;
 }
 
+static int io_import_kbuf(int ddir, struct iov_iter *iter,
+			  struct io_mapped_ubuf *imu, size_t len, size_t offset)
+{
+	size_t count = len + offset;
+
+	iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, count);
+	iov_iter_advance(iter, offset);
+
+	if (count < imu->len) {
+		const struct bio_vec *bvec = iter->bvec;
+
+		while (len > bvec->bv_len) {
+			len -= bvec->bv_len;
+			bvec++;
+		}
+		iter->nr_segs = 1 + bvec - iter->bvec;
+	}
+	return 0;
+}
+
 static int io_import_fixed(int ddir, struct iov_iter *iter,
 			   struct io_mapped_ubuf *imu,
 			   u64 buf_addr, size_t len)
 {
+	const struct bio_vec *bvec;
+	size_t folio_mask;
+	unsigned nr_segs;
 	size_t offset;
 	int ret;
 
-	if (WARN_ON_ONCE(!imu))
-		return -EFAULT;
 	ret = validate_fixed_range(buf_addr, len, imu);
 	if (unlikely(ret))
 		return ret;
 	if (!(imu->dir & (1 << ddir)))
 		return -EFAULT;
 
-	/*
-	 * Might not be a start of buffer, set size appropriately
-	 * and advance us to the beginning.
-	 */
 	offset = buf_addr - imu->ubuf;
-	iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len);
-
-	if (offset) {
-		/*
-		 * Don't use iov_iter_advance() here, as it's really slow for
-		 * using the latter parts of a big fixed buffer - it iterates
-		 * over each segment manually. We can cheat a bit here for user
-		 * registered nodes, because we know that:
-		 *
-		 * 1) it's a BVEC iter, we set it up
-		 * 2) all bvecs are the same in size, except potentially the
-		 *    first and last bvec
-		 *
-		 * So just find our index, and adjust the iterator afterwards.
-		 * If the offset is within the first bvec (or the whole first
-		 * bvec, just use iov_iter_advance(). This makes it easier
-		 * since we can just skip the first segment, which may not
-		 * be folio_size aligned.
-		 */
-		const struct bio_vec *bvec = imu->bvec;
-
-		/*
-		 * Kernel buffer bvecs, on the other hand, don't necessarily
-		 * have the size property of user registered ones, so we have
-		 * to use the slow iter advance.
-		 */
-		if (offset < bvec->bv_len) {
-			iter->count -= offset;
-			iter->iov_offset = offset;
-		} else if (imu->is_kbuf) {
-			iov_iter_advance(iter, offset);
-		} else {
-			unsigned long seg_skip;
+	if (imu->is_kbuf)
+		return io_import_kbuf(ddir, iter, imu, len, offset);
 
-			/* skip first vec */
-			offset -= bvec->bv_len;
-			seg_skip = 1 + (offset >> imu->folio_shift);
+	/*
+	 * Don't use iov_iter_advance() here, as it's really slow for
+	 * using the latter parts of a big fixed buffer - it iterates
+	 * over each segment manually. We can cheat a bit here for user
+	 * registered nodes, because we know that:
+	 *
+	 * 1) it's a BVEC iter, we set it up
+	 * 2) all bvecs are the same in size, except potentially the
+	 *    first and last bvec
+	 */
+	folio_mask = (1UL << imu->folio_shift) - 1;
+	bvec = imu->bvec;
+	if (offset >= bvec->bv_len) {
+		unsigned long seg_skip;
 
-			iter->bvec += seg_skip;
-			iter->nr_segs -= seg_skip;
-			iter->count -= bvec->bv_len + offset;
-			iter->iov_offset = offset & ((1UL << imu->folio_shift) - 1);
-		}
+		/* skip first vec */
+		offset -= bvec->bv_len;
+		seg_skip = 1 + (offset >> imu->folio_shift);
+		bvec += seg_skip;
+		offset &= folio_mask;
 	}
-
+	nr_segs = (offset + len + bvec->bv_offset + folio_mask) >> imu->folio_shift;
+	iov_iter_bvec(iter, ddir, bvec, nr_segs, len);
+	iter->iov_offset = offset;
 	return 0;
 }
 
@@ -1093,13 +1107,19 @@ inline struct io_rsrc_node *io_find_buf_node(struct io_kiocb *req,
 	if (req->flags & REQ_F_BUF_NODE)
 		return req->buf_node;
 
+	req->flags |= REQ_F_BUF_NODE;
 	io_ring_submit_lock(ctx, issue_flags);
 	node = io_rsrc_node_lookup(&ctx->buf_table, req->buf_index);
-	if (node)
-		io_req_assign_buf_node(req, node);
+	if (node) {
+		node->refs++;
+		req->buf_node = node;
+		io_ring_submit_unlock(ctx, issue_flags);
+		return node;
+	}
+	req->flags &= ~REQ_F_BUF_NODE;
 
 	io_ring_submit_unlock(ctx, issue_flags);
-	return node;
+	return NULL;
 }
 
 int io_import_reg_buf(struct io_kiocb *req, struct iov_iter *iter,
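The reworked io_import_fixed() above replaces the old "adjust count/iov_offset" dance with a direct computation of the starting segment and segment count, relying on the property named in the comment: for user-registered buffers every middle bvec is exactly one folio long. Below is a standalone userspace sketch of that arithmetic only, under that assumption; fixed_buf and map_fixed_range are illustrative names, not kernel APIs, and the code is not part of the patch.

/*
 * Userspace sketch of the segment math used by the new io_import_fixed():
 * given a byte offset and length into a registered buffer whose middle
 * bvecs are all folio-sized, find the starting segment, the residual
 * in-folio offset, and how many segments the iterator needs.
 */
#include <stdio.h>

struct fixed_buf {
	unsigned long first_bv_len;	/* length of the first bvec */
	unsigned long first_bv_offset;	/* bv_offset of the first bvec */
	unsigned int folio_shift;	/* log2 of the folio size */
};

static void map_fixed_range(const struct fixed_buf *imu,
			    unsigned long offset, unsigned long len)
{
	unsigned long folio_mask = (1UL << imu->folio_shift) - 1;
	unsigned long seg_skip = 0;
	unsigned long bv_offset = imu->first_bv_offset;
	unsigned long nr_segs;

	if (offset >= imu->first_bv_len) {
		/* skip the first bvec, then whole folios */
		offset -= imu->first_bv_len;
		seg_skip = 1 + (offset >> imu->folio_shift);
		offset &= folio_mask;
		/* middle/last bvecs start on folio boundaries */
		bv_offset = 0;
	}
	/* same rounding as the patch: offset + len + bv_offset, folio aligned */
	nr_segs = (offset + len + bv_offset + folio_mask) >> imu->folio_shift;

	printf("skip %lu segs, iov_offset %lu, nr_segs %lu\n",
	       seg_skip, offset, nr_segs);
}

int main(void)
{
	/* 4K folios; the first bvec covers a full 4096 bytes */
	struct fixed_buf imu = { 4096, 0, 12 };

	map_fixed_range(&imu, 0, 8192);		/* start of the buffer */
	map_fixed_range(&imu, 10000, 512);	/* deep inside the buffer */
	return 0;
}

Because the segment index falls out of a shift, the patch can size the iterator directly with iov_iter_bvec(iter, ddir, bvec, nr_segs, len) instead of walking segments; only kernel-provided buffers (imu->is_kbuf), whose bvecs need not be uniformly sized, take the slower io_import_kbuf() path built on iov_iter_advance().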