Diffstat (limited to 'io_uring/rsrc.c')
-rw-r--r-- | io_uring/rsrc.c | 1354
1 file changed, 838 insertions, 516 deletions
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 6f3b6de230bd..f2b31fb68992 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -9,11 +9,11 @@ #include <linux/hugetlb.h> #include <linux/compat.h> #include <linux/io_uring.h> +#include <linux/io_uring/cmd.h> #include <uapi/linux/io_uring.h> #include "io_uring.h" -#include "alloc_cache.h" #include "openclose.h" #include "rsrc.h" #include "memmap.h" @@ -26,20 +26,14 @@ struct io_rsrc_update { u32 offset; }; -static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc); -static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, - struct io_mapped_ubuf **pimu, - struct page **last_hpage); +static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, + struct iovec *iov, struct page **last_hpage); /* only define max */ #define IORING_MAX_FIXED_FILES (1U << 20) #define IORING_MAX_REG_BUFFERS (1U << 14) -static const struct io_mapped_ubuf dummy_ubuf = { - /* set invalid range, so io_import_fixed() fails meeting it */ - .ubuf = -1UL, - .len = UINT_MAX, -}; +#define IO_CACHED_BVECS_SEGS 32 int __io_account_mem(struct user_struct *user, unsigned long nr_pages) { @@ -86,10 +80,21 @@ static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) return 0; } -static int io_buffer_validate(struct iovec *iov) +int io_validate_user_buf_range(u64 uaddr, u64 ulen) { - unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1); + unsigned long tmp, base = (unsigned long)uaddr; + unsigned long acct_len = (unsigned long)PAGE_ALIGN(ulen); + /* arbitrary limit, but we need something */ + if (ulen > SZ_1G || !ulen) + return -EFAULT; + if (check_add_overflow(base, acct_len, &tmp)) + return -EOVERFLOW; + return 0; +} + +static int io_buffer_validate(struct iovec *iov) +{ /* * Don't impose further limits on the size and buffer * constraints here, we'll -EINVAL later when IO is @@ -97,235 +102,120 @@ static int io_buffer_validate(struct iovec *iov) */ if (!iov->iov_base) return iov->iov_len ? 
-EFAULT : 0; - if (!iov->iov_len) - return -EFAULT; - - /* arbitrary limit, but we need something */ - if (iov->iov_len > SZ_1G) - return -EFAULT; - - if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp)) - return -EOVERFLOW; - return 0; + return io_validate_user_buf_range((unsigned long)iov->iov_base, + iov->iov_len); } -static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot) +static void io_release_ubuf(void *priv) { - struct io_mapped_ubuf *imu = *slot; + struct io_mapped_ubuf *imu = priv; unsigned int i; - *slot = NULL; - if (imu != &dummy_ubuf) { - if (!refcount_dec_and_test(&imu->refs)) - return; - for (i = 0; i < imu->nr_bvecs; i++) - unpin_user_page(imu->bvec[i].bv_page); - if (imu->acct_pages) - io_unaccount_mem(ctx, imu->acct_pages); - kvfree(imu); - } -} + for (i = 0; i < imu->nr_bvecs; i++) { + struct folio *folio = page_folio(imu->bvec[i].bv_page); -static void io_rsrc_put_work(struct io_rsrc_node *node) -{ - struct io_rsrc_put *prsrc = &node->item; - - if (prsrc->tag) - io_post_aux_cqe(node->ctx, prsrc->tag, 0, 0); - - switch (node->type) { - case IORING_RSRC_FILE: - fput(prsrc->file); - break; - case IORING_RSRC_BUFFER: - io_rsrc_buf_put(node->ctx, prsrc); - break; - default: - WARN_ON_ONCE(1); - break; + unpin_user_folio(folio, 1); } } -void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *node) +static struct io_mapped_ubuf *io_alloc_imu(struct io_ring_ctx *ctx, + int nr_bvecs) { - if (!io_alloc_cache_put(&ctx->rsrc_node_cache, node)) - kfree(node); + if (nr_bvecs <= IO_CACHED_BVECS_SEGS) + return io_cache_alloc(&ctx->imu_cache, GFP_KERNEL); + return kvmalloc(struct_size_t(struct io_mapped_ubuf, bvec, nr_bvecs), + GFP_KERNEL); } -void io_rsrc_node_ref_zero(struct io_rsrc_node *node) - __must_hold(&node->ctx->uring_lock) +static void io_free_imu(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu) { - struct io_ring_ctx *ctx = node->ctx; - - while (!list_empty(&ctx->rsrc_ref_list)) { - node = list_first_entry(&ctx->rsrc_ref_list, - struct io_rsrc_node, node); - /* recycle ref nodes in order */ - if (node->refs) - break; - list_del(&node->node); - - if (likely(!node->empty)) - io_rsrc_put_work(node); - io_rsrc_node_destroy(ctx, node); - } - if (list_empty(&ctx->rsrc_ref_list) && unlikely(ctx->rsrc_quiesce)) - wake_up_all(&ctx->rsrc_quiesce_wq); + if (imu->nr_bvecs <= IO_CACHED_BVECS_SEGS) + io_cache_free(&ctx->imu_cache, imu); + else + kvfree(imu); } -struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx) +static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu) { - struct io_rsrc_node *ref_node; - - ref_node = io_alloc_cache_get(&ctx->rsrc_node_cache); - if (!ref_node) { - ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL); - if (!ref_node) - return NULL; - } + if (!refcount_dec_and_test(&imu->refs)) + return; - ref_node->ctx = ctx; - ref_node->empty = 0; - ref_node->refs = 1; - return ref_node; + if (imu->acct_pages) + io_unaccount_mem(ctx, imu->acct_pages); + imu->release(imu->priv); + io_free_imu(ctx, imu); } -__cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, - struct io_ring_ctx *ctx) +struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type) { - struct io_rsrc_node *backup; - DEFINE_WAIT(we); - int ret; - - /* As We may drop ->uring_lock, other task may have started quiesce */ - if (data->quiesce) - return -ENXIO; - - backup = io_rsrc_node_alloc(ctx); - if (!backup) - return -ENOMEM; - ctx->rsrc_node->empty = true; - ctx->rsrc_node->type = -1; - 
list_add_tail(&ctx->rsrc_node->node, &ctx->rsrc_ref_list); - io_put_rsrc_node(ctx, ctx->rsrc_node); - ctx->rsrc_node = backup; - - if (list_empty(&ctx->rsrc_ref_list)) - return 0; - - if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { - atomic_set(&ctx->cq_wait_nr, 1); - smp_mb(); + struct io_rsrc_node *node; + + node = io_cache_alloc(&ctx->node_cache, GFP_KERNEL); + if (node) { + node->type = type; + node->refs = 1; + node->tag = 0; + node->file_ptr = 0; } - - ctx->rsrc_quiesce++; - data->quiesce = true; - do { - prepare_to_wait(&ctx->rsrc_quiesce_wq, &we, TASK_INTERRUPTIBLE); - mutex_unlock(&ctx->uring_lock); - - ret = io_run_task_work_sig(ctx); - if (ret < 0) { - finish_wait(&ctx->rsrc_quiesce_wq, &we); - mutex_lock(&ctx->uring_lock); - if (list_empty(&ctx->rsrc_ref_list)) - ret = 0; - break; - } - - schedule(); - mutex_lock(&ctx->uring_lock); - ret = 0; - } while (!list_empty(&ctx->rsrc_ref_list)); - - finish_wait(&ctx->rsrc_quiesce_wq, &we); - data->quiesce = false; - ctx->rsrc_quiesce--; - - if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { - atomic_set(&ctx->cq_wait_nr, 0); - smp_mb(); - } - return ret; + return node; } -static void io_free_page_table(void **table, size_t size) +bool io_rsrc_cache_init(struct io_ring_ctx *ctx) { - unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE); - - for (i = 0; i < nr_tables; i++) - kfree(table[i]); - kfree(table); + const int imu_cache_size = struct_size_t(struct io_mapped_ubuf, bvec, + IO_CACHED_BVECS_SEGS); + const int node_size = sizeof(struct io_rsrc_node); + bool ret; + + ret = io_alloc_cache_init(&ctx->node_cache, IO_ALLOC_CACHE_MAX, + node_size, 0); + ret |= io_alloc_cache_init(&ctx->imu_cache, IO_ALLOC_CACHE_MAX, + imu_cache_size, 0); + return ret; } -static void io_rsrc_data_free(struct io_rsrc_data *data) +void io_rsrc_cache_free(struct io_ring_ctx *ctx) { - size_t size = data->nr * sizeof(data->tags[0][0]); - - if (data->tags) - io_free_page_table((void **)data->tags, size); - kfree(data); + io_alloc_cache_free(&ctx->node_cache, kfree); + io_alloc_cache_free(&ctx->imu_cache, kfree); } -static __cold void **io_alloc_page_table(size_t size) +static void io_clear_table_tags(struct io_rsrc_data *data) { - unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE); - size_t init_size = size; - void **table; - - table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT); - if (!table) - return NULL; + int i; - for (i = 0; i < nr_tables; i++) { - unsigned int this_size = min_t(size_t, size, PAGE_SIZE); + for (i = 0; i < data->nr; i++) { + struct io_rsrc_node *node = data->nodes[i]; - table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT); - if (!table[i]) { - io_free_page_table(table, init_size); - return NULL; - } - size -= this_size; + if (node) + node->tag = 0; } - return table; } -__cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, int type, - u64 __user *utags, - unsigned nr, struct io_rsrc_data **pdata) +__cold void io_rsrc_data_free(struct io_ring_ctx *ctx, + struct io_rsrc_data *data) { - struct io_rsrc_data *data; - int ret = 0; - unsigned i; - - data = kzalloc(sizeof(*data), GFP_KERNEL); - if (!data) - return -ENOMEM; - data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0])); - if (!data->tags) { - kfree(data); - return -ENOMEM; + if (!data->nr) + return; + while (data->nr--) { + if (data->nodes[data->nr]) + io_put_rsrc_node(ctx, data->nodes[data->nr]); } + kvfree(data->nodes); + data->nodes = NULL; + data->nr = 0; +} - data->nr = nr; - data->ctx = ctx; - data->rsrc_type = type; - if (utags) { - ret = -EFAULT; - for 
(i = 0; i < nr; i++) { - u64 *tag_slot = io_get_tag_slot(data, i); - - if (copy_from_user(tag_slot, &utags[i], - sizeof(*tag_slot))) - goto fail; - } +__cold int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr) +{ + data->nodes = kvmalloc_array(nr, sizeof(struct io_rsrc_node *), + GFP_KERNEL_ACCOUNT | __GFP_ZERO); + if (data->nodes) { + data->nr = nr; + return 0; } - *pdata = data; - return 0; -fail: - io_rsrc_data_free(data); - return ret; + return -ENOMEM; } static int __io_sqe_files_update(struct io_ring_ctx *ctx, @@ -334,14 +224,12 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, { u64 __user *tags = u64_to_user_ptr(up->tags); __s32 __user *fds = u64_to_user_ptr(up->data); - struct io_rsrc_data *data = ctx->file_data; - struct io_fixed_file *file_slot; int fd, i, err = 0; unsigned int done; - if (!ctx->file_data) + if (!ctx->file_table.data.nr) return -ENXIO; - if (up->offset + nr_args > ctx->nr_user_files) + if (up->offset + nr_args > ctx->file_table.data.nr) return -EINVAL; for (done = 0; done < nr_args; done++) { @@ -359,19 +247,13 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, if (fd == IORING_REGISTER_FILES_SKIP) continue; - i = array_index_nospec(up->offset + done, ctx->nr_user_files); - file_slot = io_fixed_file_slot(&ctx->file_table, i); - - if (file_slot->file_ptr) { - err = io_queue_rsrc_removal(data, i, - io_slot_file(file_slot)); - if (err) - break; - file_slot->file_ptr = 0; + i = up->offset + done; + if (io_reset_rsrc_node(ctx, &ctx->file_table.data, i)) io_file_bitmap_clear(&ctx->file_table, i); - } + if (fd != -1) { struct file *file = fget(fd); + struct io_rsrc_node *node; if (!file) { err = -EBADF; @@ -385,8 +267,16 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, err = -EBADF; break; } - *io_get_tag_slot(data, i) = tag; - io_fixed_file_set(file_slot, file); + node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE); + if (!node) { + err = -ENOMEM; + fput(file); + break; + } + ctx->file_table.data.nodes[i] = node; + if (tag) + node->tag = tag; + io_fixed_file_set(node, file); io_file_bitmap_set(&ctx->file_table, i); } } @@ -405,13 +295,13 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, __u32 done; int i, err; - if (!ctx->buf_data) + if (!ctx->buf_table.nr) return -ENXIO; - if (up->offset + nr_args > ctx->nr_user_bufs) + if (up->offset + nr_args > ctx->buf_table.nr) return -EINVAL; for (done = 0; done < nr_args; done++) { - struct io_mapped_ubuf *imu; + struct io_rsrc_node *node; u64 tag = 0; uvec = u64_to_user_ptr(user_data); @@ -427,27 +317,21 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, err = io_buffer_validate(iov); if (err) break; - if (!iov->iov_base && tag) { - err = -EINVAL; + node = io_sqe_buffer_register(ctx, iov, &last_hpage); + if (IS_ERR(node)) { + err = PTR_ERR(node); break; } - err = io_sqe_buffer_register(ctx, iov, &imu, &last_hpage); - if (err) - break; - - i = array_index_nospec(up->offset + done, ctx->nr_user_bufs); - if (ctx->user_bufs[i] != &dummy_ubuf) { - err = io_queue_rsrc_removal(ctx->buf_data, i, - ctx->user_bufs[i]); - if (unlikely(err)) { - io_buffer_unmap(ctx, &imu); + if (tag) { + if (!node) { + err = -EINVAL; break; } - ctx->user_bufs[i] = (struct io_mapped_ubuf *)&dummy_ubuf; + node->tag = tag; } - - ctx->user_bufs[i] = imu; - *io_get_tag_slot(ctx->buf_data, i) = tag; + i = array_index_nospec(up->offset + done, ctx->buf_table.nr); + io_reset_rsrc_node(ctx, &ctx->buf_table, i); + ctx->buf_table.nodes[i] = node; if (ctx->compat) user_data += sizeof(struct 
compat_iovec); else @@ -563,7 +447,7 @@ static int io_files_update_with_index_alloc(struct io_kiocb *req, struct file *file; int ret, fd; - if (!req->ctx->file_data) + if (!req->ctx->file_table.data.nr) return -ENXIO; for (done = 0; done < up->nr_args; done++) { @@ -619,68 +503,37 @@ int io_files_update(struct io_kiocb *req, unsigned int issue_flags) if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } -int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, void *rsrc) +void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node) { - struct io_ring_ctx *ctx = data->ctx; - struct io_rsrc_node *node = ctx->rsrc_node; - u64 *tag_slot = io_get_tag_slot(data, idx); - - ctx->rsrc_node = io_rsrc_node_alloc(ctx); - if (unlikely(!ctx->rsrc_node)) { - ctx->rsrc_node = node; - return -ENOMEM; - } - - node->item.rsrc = rsrc; - node->type = data->rsrc_type; - node->item.tag = *tag_slot; - *tag_slot = 0; - list_add_tail(&node->node, &ctx->rsrc_ref_list); - io_put_rsrc_node(ctx, node); - return 0; -} + if (node->tag) + io_post_aux_cqe(ctx, node->tag, 0, 0); -void __io_sqe_files_unregister(struct io_ring_ctx *ctx) -{ - int i; - - for (i = 0; i < ctx->nr_user_files; i++) { - struct file *file = io_file_from_index(&ctx->file_table, i); - - if (!file) - continue; - io_file_bitmap_clear(&ctx->file_table, i); - fput(file); + switch (node->type) { + case IORING_RSRC_FILE: + fput(io_slot_file(node)); + break; + case IORING_RSRC_BUFFER: + io_buffer_unmap(ctx, node->buf); + break; + default: + WARN_ON_ONCE(1); + break; } - io_free_file_tables(&ctx->file_table); - io_file_table_set_alloc_range(ctx, 0, 0); - io_rsrc_data_free(ctx->file_data); - ctx->file_data = NULL; - ctx->nr_user_files = 0; + io_cache_free(&ctx->node_cache, node); } int io_sqe_files_unregister(struct io_ring_ctx *ctx) { - unsigned nr = ctx->nr_user_files; - int ret; - - if (!ctx->file_data) + if (!ctx->file_table.data.nr) return -ENXIO; - /* - * Quiesce may unlock ->uring_lock, and while it's not held - * prevent new requests using the table. 
- */ - ctx->nr_user_files = 0; - ret = io_rsrc_ref_quiesce(ctx->file_data, ctx); - ctx->nr_user_files = nr; - if (!ret) - __io_sqe_files_unregister(ctx); - return ret; + io_free_file_tables(ctx, &ctx->file_table); + io_file_table_set_alloc_range(ctx, 0, 0); + return 0; } int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, @@ -691,7 +544,7 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, int fd, ret; unsigned i; - if (ctx->file_data) + if (ctx->file_table.data.nr) return -EBUSY; if (!nr_args) return -EINVAL; @@ -699,28 +552,22 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, return -EMFILE; if (nr_args > rlimit(RLIMIT_NOFILE)) return -EMFILE; - ret = io_rsrc_data_alloc(ctx, IORING_RSRC_FILE, tags, nr_args, - &ctx->file_data); - if (ret) - return ret; - - if (!io_alloc_file_tables(&ctx->file_table, nr_args)) { - io_rsrc_data_free(ctx->file_data); - ctx->file_data = NULL; + if (!io_alloc_file_tables(ctx, &ctx->file_table, nr_args)) return -ENOMEM; - } - for (i = 0; i < nr_args; i++, ctx->nr_user_files++) { - struct io_fixed_file *file_slot; + for (i = 0; i < nr_args; i++) { + struct io_rsrc_node *node; + u64 tag = 0; - if (fds && copy_from_user(&fd, &fds[i], sizeof(fd))) { - ret = -EFAULT; + ret = -EFAULT; + if (tags && copy_from_user(&tag, &tags[i], sizeof(tag))) + goto fail; + if (fds && copy_from_user(&fd, &fds[i], sizeof(fd))) goto fail; - } /* allow sparse sets */ if (!fds || fd == -1) { ret = -EINVAL; - if (unlikely(*io_get_tag_slot(ctx->file_data, i))) + if (tag) goto fail; continue; } @@ -737,56 +584,34 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, fput(file); goto fail; } - file_slot = io_fixed_file_slot(&ctx->file_table, i); - io_fixed_file_set(file_slot, file); + ret = -ENOMEM; + node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE); + if (!node) { + fput(file); + goto fail; + } + if (tag) + node->tag = tag; + ctx->file_table.data.nodes[i] = node; + io_fixed_file_set(node, file); io_file_bitmap_set(&ctx->file_table, i); } /* default it to the whole table */ - io_file_table_set_alloc_range(ctx, 0, ctx->nr_user_files); + io_file_table_set_alloc_range(ctx, 0, ctx->file_table.data.nr); return 0; fail: - __io_sqe_files_unregister(ctx); + io_clear_table_tags(&ctx->file_table.data); + io_sqe_files_unregister(ctx); return ret; } -static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) -{ - io_buffer_unmap(ctx, &prsrc->buf); - prsrc->buf = NULL; -} - -void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx) -{ - unsigned int i; - - for (i = 0; i < ctx->nr_user_bufs; i++) - io_buffer_unmap(ctx, &ctx->user_bufs[i]); - kfree(ctx->user_bufs); - io_rsrc_data_free(ctx->buf_data); - ctx->user_bufs = NULL; - ctx->buf_data = NULL; - ctx->nr_user_bufs = 0; -} - int io_sqe_buffers_unregister(struct io_ring_ctx *ctx) { - unsigned nr = ctx->nr_user_bufs; - int ret; - - if (!ctx->buf_data) + if (!ctx->buf_table.nr) return -ENXIO; - - /* - * Quiesce may unlock ->uring_lock, and while it's not held - * prevent new requests using the table. 
- */ - ctx->nr_user_bufs = 0; - ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx); - ctx->nr_user_bufs = nr; - if (!ret) - __io_sqe_buffers_unregister(ctx); - return ret; + io_rsrc_data_free(ctx, &ctx->buf_table); + return 0; } /* @@ -812,9 +637,13 @@ static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages, } /* check previously registered pages */ - for (i = 0; i < ctx->nr_user_bufs; i++) { - struct io_mapped_ubuf *imu = ctx->user_bufs[i]; + for (i = 0; i < ctx->buf_table.nr; i++) { + struct io_rsrc_node *node = ctx->buf_table.nodes[i]; + struct io_mapped_ubuf *imu; + if (!node) + continue; + imu = node->buf; for (j = 0; j < imu->nr_bvecs; j++) { if (!PageCompound(imu->bvec[j].bv_page)) continue; @@ -858,68 +687,60 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages, return ret; } -static bool io_do_coalesce_buffer(struct page ***pages, int *nr_pages, - struct io_imu_folio_data *data, int nr_folios) +static bool io_coalesce_buffer(struct page ***pages, int *nr_pages, + struct io_imu_folio_data *data) { struct page **page_array = *pages, **new_array = NULL; - int nr_pages_left = *nr_pages, i, j; + unsigned nr_pages_left = *nr_pages; + unsigned nr_folios = data->nr_folios; + unsigned i, j; /* Store head pages only*/ - new_array = kvmalloc_array(nr_folios, sizeof(struct page *), - GFP_KERNEL); + new_array = kvmalloc_array(nr_folios, sizeof(struct page *), GFP_KERNEL); if (!new_array) return false; - new_array[0] = compound_head(page_array[0]); - /* - * The pages are bound to the folio, it doesn't - * actually unpin them but drops all but one reference, - * which is usually put down by io_buffer_unmap(). - * Note, needs a better helper. - */ - if (data->nr_pages_head > 1) - unpin_user_pages(&page_array[1], data->nr_pages_head - 1); - - j = data->nr_pages_head; - nr_pages_left -= data->nr_pages_head; - for (i = 1; i < nr_folios; i++) { - unsigned int nr_unpin; - - new_array[i] = page_array[j]; - nr_unpin = min_t(unsigned int, nr_pages_left - 1, - data->nr_pages_mid - 1); - if (nr_unpin) - unpin_user_pages(&page_array[j+1], nr_unpin); - j += data->nr_pages_mid; - nr_pages_left -= data->nr_pages_mid; + for (i = 0, j = 0; i < nr_folios; i++) { + struct page *p = compound_head(page_array[j]); + struct folio *folio = page_folio(p); + unsigned int nr; + + WARN_ON_ONCE(i > 0 && p != page_array[j]); + + nr = i ? data->nr_pages_mid : data->nr_pages_head; + nr = min(nr, nr_pages_left); + /* Drop all but one ref, the entire folio will remain pinned. */ + if (nr > 1) + unpin_user_folio(folio, nr - 1); + j += nr; + nr_pages_left -= nr; + new_array[i] = p; } + + WARN_ON_ONCE(j != *nr_pages); + kvfree(page_array); *pages = new_array; *nr_pages = nr_folios; return true; } -static bool io_try_coalesce_buffer(struct page ***pages, int *nr_pages, - struct io_imu_folio_data *data) +bool io_check_coalesce_buffer(struct page **page_array, int nr_pages, + struct io_imu_folio_data *data) { - struct page **page_array = *pages; struct folio *folio = page_folio(page_array[0]); unsigned int count = 1, nr_folios = 1; int i; - if (*nr_pages <= 1) - return false; - data->nr_pages_mid = folio_nr_pages(folio); - if (data->nr_pages_mid == 1) - return false; - data->folio_shift = folio_shift(folio); + data->first_folio_page_idx = folio_page_idx(folio, page_array[0]); + /* * Check if pages are contiguous inside a folio, and all folios have * the same page count except for the head and tail. 
*/ - for (i = 1; i < *nr_pages; i++) { + for (i = 1; i < nr_pages; i++) { if (page_folio(page_array[i]) == folio && page_array[i] == page_array[i-1] + 1) { count++; @@ -947,24 +768,29 @@ static bool io_try_coalesce_buffer(struct page ***pages, int *nr_pages, if (nr_folios == 1) data->nr_pages_head = count; - return io_do_coalesce_buffer(pages, nr_pages, data, nr_folios); + data->nr_folios = nr_folios; + return true; } -static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, - struct io_mapped_ubuf **pimu, - struct page **last_hpage) +static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, + struct iovec *iov, + struct page **last_hpage) { struct io_mapped_ubuf *imu = NULL; struct page **pages = NULL; + struct io_rsrc_node *node; unsigned long off; size_t size; int ret, nr_pages, i; struct io_imu_folio_data data; - bool coalesced; + bool coalesced = false; - *pimu = (struct io_mapped_ubuf *)&dummy_ubuf; if (!iov->iov_base) - return 0; + return NULL; + + node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER); + if (!node) + return ERR_PTR(-ENOMEM); ret = -ENOMEM; pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len, @@ -976,29 +802,38 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, } /* If it's huge page(s), try to coalesce them into fewer bvec entries */ - coalesced = io_try_coalesce_buffer(&pages, &nr_pages, &data); + if (nr_pages > 1 && io_check_coalesce_buffer(pages, nr_pages, &data)) { + if (data.nr_pages_mid != 1) + coalesced = io_coalesce_buffer(&pages, &nr_pages, &data); + } - imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL); + imu = io_alloc_imu(ctx, nr_pages); if (!imu) goto done; + imu->nr_bvecs = nr_pages; ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage); - if (ret) { - unpin_user_pages(pages, nr_pages); + if (ret) goto done; - } size = iov->iov_len; /* store original address for later verification */ imu->ubuf = (unsigned long) iov->iov_base; imu->len = iov->iov_len; - imu->nr_bvecs = nr_pages; imu->folio_shift = PAGE_SHIFT; + imu->release = io_release_ubuf; + imu->priv = imu; + imu->is_kbuf = false; + imu->dir = IO_IMU_DEST | IO_IMU_SOURCE; if (coalesced) imu->folio_shift = data.folio_shift; refcount_set(&imu->refs, 1); - off = (unsigned long) iov->iov_base & ((1UL << imu->folio_shift) - 1); - *pimu = imu; + + off = (unsigned long)iov->iov_base & ~PAGE_MASK; + if (coalesced) + off += data.first_folio_page_idx << PAGE_SHIFT; + + node->buf = imu; ret = 0; for (i = 0; i < nr_pages; i++) { @@ -1010,46 +845,46 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, size -= vec_len; } done: - if (ret) - kvfree(imu); + if (ret) { + if (imu) + io_free_imu(ctx, imu); + if (pages) { + for (i = 0; i < nr_pages; i++) + unpin_user_folio(page_folio(pages[i]), 1); + } + io_cache_free(&ctx->node_cache, node); + node = ERR_PTR(ret); + } kvfree(pages); - return ret; -} - -static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args) -{ - ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL); - return ctx->user_bufs ? 
0 : -ENOMEM; + return node; } int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, unsigned int nr_args, u64 __user *tags) { struct page *last_hpage = NULL; - struct io_rsrc_data *data; + struct io_rsrc_data data; struct iovec fast_iov, *iov = &fast_iov; const struct iovec __user *uvec; int i, ret; BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16)); - if (ctx->user_bufs) + if (ctx->buf_table.nr) return -EBUSY; if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS) return -EINVAL; - ret = io_rsrc_data_alloc(ctx, IORING_RSRC_BUFFER, tags, nr_args, &data); + ret = io_rsrc_data_alloc(&data, nr_args); if (ret) return ret; - ret = io_buffers_map_alloc(ctx, nr_args); - if (ret) { - io_rsrc_data_free(data); - return ret; - } if (!arg) memset(iov, 0, sizeof(*iov)); - for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) { + for (i = 0; i < nr_args; i++) { + struct io_rsrc_node *node; + u64 tag = 0; + if (arg) { uvec = (struct iovec __user *) arg; iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat); @@ -1066,142 +901,366 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, arg += sizeof(struct iovec); } - if (!iov->iov_base && *io_get_tag_slot(data, i)) { - ret = -EINVAL; - break; + if (tags) { + if (copy_from_user(&tag, &tags[i], sizeof(tag))) { + ret = -EFAULT; + break; + } } - ret = io_sqe_buffer_register(ctx, iov, &ctx->user_bufs[i], - &last_hpage); - if (ret) + node = io_sqe_buffer_register(ctx, iov, &last_hpage); + if (IS_ERR(node)) { + ret = PTR_ERR(node); break; + } + if (tag) { + if (!node) { + ret = -EINVAL; + break; + } + node->tag = tag; + } + data.nodes[i] = node; } - WARN_ON_ONCE(ctx->buf_data); + ctx->buf_table = data; + if (ret) { + io_clear_table_tags(&ctx->buf_table); + io_sqe_buffers_unregister(ctx); + } + return ret; +} - ctx->buf_data = data; - if (ret) - __io_sqe_buffers_unregister(ctx); +int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq, + void (*release)(void *), unsigned int index, + unsigned int issue_flags) +{ + struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx; + struct io_rsrc_data *data = &ctx->buf_table; + struct req_iterator rq_iter; + struct io_mapped_ubuf *imu; + struct io_rsrc_node *node; + struct bio_vec bv, *bvec; + u16 nr_bvecs; + int ret = 0; + + io_ring_submit_lock(ctx, issue_flags); + if (index >= data->nr) { + ret = -EINVAL; + goto unlock; + } + index = array_index_nospec(index, data->nr); + + if (data->nodes[index]) { + ret = -EBUSY; + goto unlock; + } + + node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER); + if (!node) { + ret = -ENOMEM; + goto unlock; + } + + nr_bvecs = blk_rq_nr_phys_segments(rq); + imu = io_alloc_imu(ctx, nr_bvecs); + if (!imu) { + kfree(node); + ret = -ENOMEM; + goto unlock; + } + + imu->ubuf = 0; + imu->len = blk_rq_bytes(rq); + imu->acct_pages = 0; + imu->folio_shift = PAGE_SHIFT; + imu->nr_bvecs = nr_bvecs; + refcount_set(&imu->refs, 1); + imu->release = release; + imu->priv = rq; + imu->is_kbuf = true; + imu->dir = 1 << rq_data_dir(rq); + + bvec = imu->bvec; + rq_for_each_bvec(bv, rq, rq_iter) + *bvec++ = bv; + + node->buf = imu; + data->nodes[index] = node; +unlock: + io_ring_submit_unlock(ctx, issue_flags); return ret; } +EXPORT_SYMBOL_GPL(io_buffer_register_bvec); -int io_import_fixed(int ddir, struct iov_iter *iter, - struct io_mapped_ubuf *imu, - u64 buf_addr, size_t len) +int io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index, + unsigned int issue_flags) +{ + struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx; + struct io_rsrc_data *data = 
&ctx->buf_table; + struct io_rsrc_node *node; + int ret = 0; + + io_ring_submit_lock(ctx, issue_flags); + if (index >= data->nr) { + ret = -EINVAL; + goto unlock; + } + index = array_index_nospec(index, data->nr); + + node = data->nodes[index]; + if (!node) { + ret = -EINVAL; + goto unlock; + } + if (!node->buf->is_kbuf) { + ret = -EBUSY; + goto unlock; + } + + io_put_rsrc_node(ctx, node); + data->nodes[index] = NULL; +unlock: + io_ring_submit_unlock(ctx, issue_flags); + return ret; +} +EXPORT_SYMBOL_GPL(io_buffer_unregister_bvec); + +static int validate_fixed_range(u64 buf_addr, size_t len, + const struct io_mapped_ubuf *imu) { u64 buf_end; - size_t offset; - if (WARN_ON_ONCE(!imu)) - return -EFAULT; if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end))) return -EFAULT; /* not inside the mapped region */ if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len))) return -EFAULT; + if (unlikely(len > MAX_RW_COUNT)) + return -EFAULT; + return 0; +} - /* - * Might not be a start of buffer, set size appropriately - * and advance us to the beginning. - */ - offset = buf_addr - imu->ubuf; - iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len); - - if (offset) { - /* - * Don't use iov_iter_advance() here, as it's really slow for - * using the latter parts of a big fixed buffer - it iterates - * over each segment manually. We can cheat a bit here, because - * we know that: - * - * 1) it's a BVEC iter, we set it up - * 2) all bvecs are the same in size, except potentially the - * first and last bvec - * - * So just find our index, and adjust the iterator afterwards. - * If the offset is within the first bvec (or the whole first - * bvec, just use iov_iter_advance(). This makes it easier - * since we can just skip the first segment, which may not - * be folio_size aligned. - */ - const struct bio_vec *bvec = imu->bvec; +static int io_import_kbuf(int ddir, struct iov_iter *iter, + struct io_mapped_ubuf *imu, size_t len, size_t offset) +{ + size_t count = len + offset; - if (offset < bvec->bv_len) { - iter->bvec = bvec; - iter->count -= offset; - iter->iov_offset = offset; - } else { - unsigned long seg_skip; + iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, count); + iov_iter_advance(iter, offset); - /* skip first vec */ - offset -= bvec->bv_len; - seg_skip = 1 + (offset >> imu->folio_shift); + if (count < imu->len) { + const struct bio_vec *bvec = iter->bvec; - iter->bvec = bvec + seg_skip; - iter->nr_segs -= seg_skip; - iter->count -= bvec->bv_len + offset; - iter->iov_offset = offset & ((1UL << imu->folio_shift) - 1); + while (len > bvec->bv_len) { + len -= bvec->bv_len; + bvec++; } + iter->nr_segs = 1 + bvec - iter->bvec; } + return 0; +} + +static int io_import_fixed(int ddir, struct iov_iter *iter, + struct io_mapped_ubuf *imu, + u64 buf_addr, size_t len) +{ + const struct bio_vec *bvec; + size_t folio_mask; + unsigned nr_segs; + size_t offset; + int ret; + ret = validate_fixed_range(buf_addr, len, imu); + if (unlikely(ret)) + return ret; + if (!(imu->dir & (1 << ddir))) + return -EFAULT; + + offset = buf_addr - imu->ubuf; + + if (imu->is_kbuf) + return io_import_kbuf(ddir, iter, imu, len, offset); + + /* + * Don't use iov_iter_advance() here, as it's really slow for + * using the latter parts of a big fixed buffer - it iterates + * over each segment manually. 
We can cheat a bit here for user + * registered nodes, because we know that: + * + * 1) it's a BVEC iter, we set it up + * 2) all bvecs are the same in size, except potentially the + * first and last bvec + */ + folio_mask = (1UL << imu->folio_shift) - 1; + bvec = imu->bvec; + if (offset >= bvec->bv_len) { + unsigned long seg_skip; + + /* skip first vec */ + offset -= bvec->bv_len; + seg_skip = 1 + (offset >> imu->folio_shift); + bvec += seg_skip; + offset &= folio_mask; + } + nr_segs = (offset + len + bvec->bv_offset + folio_mask) >> imu->folio_shift; + iov_iter_bvec(iter, ddir, bvec, nr_segs, len); + iter->iov_offset = offset; return 0; } -static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx) +inline struct io_rsrc_node *io_find_buf_node(struct io_kiocb *req, + unsigned issue_flags) { - struct io_mapped_ubuf **user_bufs; - struct io_rsrc_data *data; - int i, ret, nbufs; + struct io_ring_ctx *ctx = req->ctx; + struct io_rsrc_node *node; + + if (req->flags & REQ_F_BUF_NODE) + return req->buf_node; + req->flags |= REQ_F_BUF_NODE; + + io_ring_submit_lock(ctx, issue_flags); + node = io_rsrc_node_lookup(&ctx->buf_table, req->buf_index); + if (node) { + node->refs++; + req->buf_node = node; + io_ring_submit_unlock(ctx, issue_flags); + return node; + } + req->flags &= ~REQ_F_BUF_NODE; + io_ring_submit_unlock(ctx, issue_flags); + return NULL; +} + +int io_import_reg_buf(struct io_kiocb *req, struct iov_iter *iter, + u64 buf_addr, size_t len, int ddir, + unsigned issue_flags) +{ + struct io_rsrc_node *node; + + node = io_find_buf_node(req, issue_flags); + if (!node) + return -EFAULT; + return io_import_fixed(ddir, iter, node->buf, buf_addr, len); +} + +/* Lock two rings at once. The rings must be different! */ +static void lock_two_rings(struct io_ring_ctx *ctx1, struct io_ring_ctx *ctx2) +{ + if (ctx1 > ctx2) + swap(ctx1, ctx2); + mutex_lock(&ctx1->uring_lock); + mutex_lock_nested(&ctx2->uring_lock, SINGLE_DEPTH_NESTING); +} + +/* Both rings are locked by the caller. */ +static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx, + struct io_uring_clone_buffers *arg) +{ + struct io_rsrc_data data; + int i, ret, off, nr; + unsigned int nbufs; + + lockdep_assert_held(&ctx->uring_lock); + lockdep_assert_held(&src_ctx->uring_lock); /* - * Drop our own lock here. We'll setup the data we need and reference - * the source buffers, then re-grab, check, and assign at the end. + * Accounting state is shared between the two rings; that only works if + * both rings are accounted towards the same counters. 
*/ - mutex_unlock(&ctx->uring_lock); + if (ctx->user != src_ctx->user || ctx->mm_account != src_ctx->mm_account) + return -EINVAL; - mutex_lock(&src_ctx->uring_lock); - ret = -ENXIO; - nbufs = src_ctx->nr_user_bufs; - if (!nbufs) - goto out_unlock; - ret = io_rsrc_data_alloc(ctx, IORING_RSRC_BUFFER, NULL, nbufs, &data); - if (ret) - goto out_unlock; + /* if offsets are given, must have nr specified too */ + if (!arg->nr && (arg->dst_off || arg->src_off)) + return -EINVAL; + /* not allowed unless REPLACE is set */ + if (ctx->buf_table.nr && !(arg->flags & IORING_REGISTER_DST_REPLACE)) + return -EBUSY; - ret = -ENOMEM; - user_bufs = kcalloc(nbufs, sizeof(*ctx->user_bufs), GFP_KERNEL); - if (!user_bufs) - goto out_free_data; + nbufs = src_ctx->buf_table.nr; + if (!arg->nr) + arg->nr = nbufs; + else if (arg->nr > nbufs) + return -EINVAL; + else if (arg->nr > IORING_MAX_REG_BUFFERS) + return -EINVAL; + if (check_add_overflow(arg->nr, arg->dst_off, &nbufs)) + return -EOVERFLOW; + if (nbufs > IORING_MAX_REG_BUFFERS) + return -EINVAL; - for (i = 0; i < nbufs; i++) { - struct io_mapped_ubuf *src = src_ctx->user_bufs[i]; + ret = io_rsrc_data_alloc(&data, max(nbufs, ctx->buf_table.nr)); + if (ret) + return ret; - if (src != &dummy_ubuf) - refcount_inc(&src->refs); - user_bufs[i] = src; + /* Fill entries in data from dst that won't overlap with src */ + for (i = 0; i < min(arg->dst_off, ctx->buf_table.nr); i++) { + struct io_rsrc_node *src_node = ctx->buf_table.nodes[i]; + + if (src_node) { + data.nodes[i] = src_node; + src_node->refs++; + } } - /* Have a ref on the bufs now, drop src lock and re-grab our own lock */ - mutex_unlock(&src_ctx->uring_lock); - mutex_lock(&ctx->uring_lock); - if (!ctx->user_bufs) { - ctx->user_bufs = user_bufs; - ctx->buf_data = data; - ctx->nr_user_bufs = nbufs; - return 0; + ret = -ENXIO; + nbufs = src_ctx->buf_table.nr; + if (!nbufs) + goto out_free; + ret = -EINVAL; + if (!arg->nr) + arg->nr = nbufs; + else if (arg->nr > nbufs) + goto out_free; + ret = -EOVERFLOW; + if (check_add_overflow(arg->nr, arg->src_off, &off)) + goto out_free; + if (off > nbufs) + goto out_free; + + off = arg->dst_off; + i = arg->src_off; + nr = arg->nr; + while (nr--) { + struct io_rsrc_node *dst_node, *src_node; + + src_node = io_rsrc_node_lookup(&src_ctx->buf_table, i); + if (!src_node) { + dst_node = NULL; + } else { + dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER); + if (!dst_node) { + ret = -ENOMEM; + goto out_free; + } + + refcount_inc(&src_node->buf->refs); + dst_node->buf = src_node->buf; + } + data.nodes[off++] = dst_node; + i++; } - /* someone raced setting up buffers, dump ours */ - for (i = 0; i < nbufs; i++) - io_buffer_unmap(ctx, &user_bufs[i]); - io_rsrc_data_free(data); - kfree(user_bufs); - return -EBUSY; -out_free_data: - io_rsrc_data_free(data); -out_unlock: - mutex_unlock(&src_ctx->uring_lock); - mutex_lock(&ctx->uring_lock); + /* + * If asked for replace, put the old table. data->nodes[] holds both + * old and new nodes at this point. + */ + if (arg->flags & IORING_REGISTER_DST_REPLACE) + io_rsrc_data_free(ctx, &ctx->buf_table); + + /* + * ctx->buf_table must be empty now - either the contents are being + * replaced and we just freed the table, or the contents are being + * copied to a ring that does not have buffers yet (checked at function + * entry). 
+ */ + WARN_ON_ONCE(ctx->buf_table.nr); + ctx->buf_table = data; + return 0; + +out_free: + io_rsrc_data_free(ctx, &data); return ret; } @@ -1215,16 +1274,17 @@ out_unlock: int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg) { struct io_uring_clone_buffers buf; + struct io_ring_ctx *src_ctx; bool registered_src; struct file *file; int ret; - if (ctx->user_bufs || ctx->nr_user_bufs) - return -EBUSY; if (copy_from_user(&buf, arg, sizeof(buf))) return -EFAULT; - if (buf.flags & ~IORING_REGISTER_SRC_REGISTERED) + if (buf.flags & ~(IORING_REGISTER_SRC_REGISTERED|IORING_REGISTER_DST_REPLACE)) return -EINVAL; + if (!(buf.flags & IORING_REGISTER_DST_REPLACE) && ctx->buf_table.nr) + return -EBUSY; if (memchr_inv(buf.pad, 0, sizeof(buf.pad))) return -EINVAL; @@ -1232,8 +1292,270 @@ int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg) file = io_uring_register_get_file(buf.src_fd, registered_src); if (IS_ERR(file)) return PTR_ERR(file); - ret = io_clone_buffers(ctx, file->private_data); - if (!registered_src) - fput(file); + + src_ctx = file->private_data; + if (src_ctx != ctx) { + mutex_unlock(&ctx->uring_lock); + lock_two_rings(ctx, src_ctx); + } + + ret = io_clone_buffers(ctx, src_ctx, &buf); + + if (src_ctx != ctx) + mutex_unlock(&src_ctx->uring_lock); + + fput(file); return ret; } + +void io_vec_free(struct iou_vec *iv) +{ + if (!iv->iovec) + return; + kfree(iv->iovec); + iv->iovec = NULL; + iv->nr = 0; +} + +int io_vec_realloc(struct iou_vec *iv, unsigned nr_entries) +{ + gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; + struct iovec *iov; + + iov = kmalloc_array(nr_entries, sizeof(iov[0]), gfp); + if (!iov) + return -ENOMEM; + + io_vec_free(iv); + iv->iovec = iov; + iv->nr = nr_entries; + return 0; +} + +static int io_vec_fill_bvec(int ddir, struct iov_iter *iter, + struct io_mapped_ubuf *imu, + struct iovec *iovec, unsigned nr_iovs, + struct iou_vec *vec) +{ + unsigned long folio_size = 1 << imu->folio_shift; + unsigned long folio_mask = folio_size - 1; + struct bio_vec *res_bvec = vec->bvec; + size_t total_len = 0; + unsigned bvec_idx = 0; + unsigned iov_idx; + + for (iov_idx = 0; iov_idx < nr_iovs; iov_idx++) { + size_t iov_len = iovec[iov_idx].iov_len; + u64 buf_addr = (u64)(uintptr_t)iovec[iov_idx].iov_base; + struct bio_vec *src_bvec; + size_t offset; + int ret; + + ret = validate_fixed_range(buf_addr, iov_len, imu); + if (unlikely(ret)) + return ret; + + if (unlikely(!iov_len)) + return -EFAULT; + if (unlikely(check_add_overflow(total_len, iov_len, &total_len))) + return -EOVERFLOW; + + offset = buf_addr - imu->ubuf; + /* + * Only the first bvec can have non zero bv_offset, account it + * here and work with full folios below. 
+ */ + offset += imu->bvec[0].bv_offset; + + src_bvec = imu->bvec + (offset >> imu->folio_shift); + offset &= folio_mask; + + for (; iov_len; offset = 0, bvec_idx++, src_bvec++) { + size_t seg_size = min_t(size_t, iov_len, + folio_size - offset); + + bvec_set_page(&res_bvec[bvec_idx], + src_bvec->bv_page, seg_size, offset); + iov_len -= seg_size; + } + } + if (total_len > MAX_RW_COUNT) + return -EINVAL; + + iov_iter_bvec(iter, ddir, res_bvec, bvec_idx, total_len); + return 0; +} + +static int io_estimate_bvec_size(struct iovec *iov, unsigned nr_iovs, + struct io_mapped_ubuf *imu) +{ + unsigned shift = imu->folio_shift; + size_t max_segs = 0; + unsigned i; + + for (i = 0; i < nr_iovs; i++) + max_segs += (iov[i].iov_len >> shift) + 2; + return max_segs; +} + +static int io_vec_fill_kern_bvec(int ddir, struct iov_iter *iter, + struct io_mapped_ubuf *imu, + struct iovec *iovec, unsigned nr_iovs, + struct iou_vec *vec) +{ + const struct bio_vec *src_bvec = imu->bvec; + struct bio_vec *res_bvec = vec->bvec; + unsigned res_idx = 0; + size_t total_len = 0; + unsigned iov_idx; + + for (iov_idx = 0; iov_idx < nr_iovs; iov_idx++) { + size_t offset = (size_t)(uintptr_t)iovec[iov_idx].iov_base; + size_t iov_len = iovec[iov_idx].iov_len; + struct bvec_iter bi = { + .bi_size = offset + iov_len, + }; + struct bio_vec bv; + + bvec_iter_advance(src_bvec, &bi, offset); + for_each_mp_bvec(bv, src_bvec, bi, bi) + res_bvec[res_idx++] = bv; + total_len += iov_len; + } + iov_iter_bvec(iter, ddir, res_bvec, res_idx, total_len); + return 0; +} + +static int iov_kern_bvec_size(const struct iovec *iov, + const struct io_mapped_ubuf *imu, + unsigned int *nr_seg) +{ + size_t offset = (size_t)(uintptr_t)iov->iov_base; + const struct bio_vec *bvec = imu->bvec; + int start = 0, i = 0; + size_t off = 0; + int ret; + + ret = validate_fixed_range(offset, iov->iov_len, imu); + if (unlikely(ret)) + return ret; + + for (i = 0; off < offset + iov->iov_len && i < imu->nr_bvecs; + off += bvec[i].bv_len, i++) { + if (offset >= off && offset < off + bvec[i].bv_len) + start = i; + } + *nr_seg = i - start; + return 0; +} + +static int io_kern_bvec_size(struct iovec *iov, unsigned nr_iovs, + struct io_mapped_ubuf *imu, unsigned *nr_segs) +{ + unsigned max_segs = 0; + size_t total_len = 0; + unsigned i; + int ret; + + *nr_segs = 0; + for (i = 0; i < nr_iovs; i++) { + if (unlikely(!iov[i].iov_len)) + return -EFAULT; + if (unlikely(check_add_overflow(total_len, iov[i].iov_len, + &total_len))) + return -EOVERFLOW; + ret = iov_kern_bvec_size(&iov[i], imu, &max_segs); + if (unlikely(ret)) + return ret; + *nr_segs += max_segs; + } + if (total_len > MAX_RW_COUNT) + return -EINVAL; + return 0; +} + +int io_import_reg_vec(int ddir, struct iov_iter *iter, + struct io_kiocb *req, struct iou_vec *vec, + unsigned nr_iovs, unsigned issue_flags) +{ + struct io_rsrc_node *node; + struct io_mapped_ubuf *imu; + unsigned iovec_off; + struct iovec *iov; + unsigned nr_segs; + + node = io_find_buf_node(req, issue_flags); + if (!node) + return -EFAULT; + imu = node->buf; + if (!(imu->dir & (1 << ddir))) + return -EFAULT; + + iovec_off = vec->nr - nr_iovs; + iov = vec->iovec + iovec_off; + + if (imu->is_kbuf) { + int ret = io_kern_bvec_size(iov, nr_iovs, imu, &nr_segs); + + if (unlikely(ret)) + return ret; + } else { + nr_segs = io_estimate_bvec_size(iov, nr_iovs, imu); + } + + if (sizeof(struct bio_vec) > sizeof(struct iovec)) { + size_t bvec_bytes; + + bvec_bytes = nr_segs * sizeof(struct bio_vec); + nr_segs = (bvec_bytes + sizeof(*iov) - 1) / sizeof(*iov); 
+ nr_segs += nr_iovs; + } + + if (nr_segs > vec->nr) { + struct iou_vec tmp_vec = {}; + int ret; + + ret = io_vec_realloc(&tmp_vec, nr_segs); + if (ret) + return ret; + + iovec_off = tmp_vec.nr - nr_iovs; + memcpy(tmp_vec.iovec + iovec_off, iov, sizeof(*iov) * nr_iovs); + io_vec_free(vec); + + *vec = tmp_vec; + iov = vec->iovec + iovec_off; + req->flags |= REQ_F_NEED_CLEANUP; + } + + if (imu->is_kbuf) + return io_vec_fill_kern_bvec(ddir, iter, imu, iov, nr_iovs, vec); + + return io_vec_fill_bvec(ddir, iter, imu, iov, nr_iovs, vec); +} + +int io_prep_reg_iovec(struct io_kiocb *req, struct iou_vec *iv, + const struct iovec __user *uvec, size_t uvec_segs) +{ + struct iovec *iov; + int iovec_off, ret; + void *res; + + if (uvec_segs > iv->nr) { + ret = io_vec_realloc(iv, uvec_segs); + if (ret) + return ret; + req->flags |= REQ_F_NEED_CLEANUP; + } + + /* pad iovec to the right */ + iovec_off = iv->nr - uvec_segs; + iov = iv->iovec + iovec_off; + res = iovec_from_user(uvec, uvec_segs, uvec_segs, iov, + io_is_compat(req->ctx)); + if (IS_ERR(res)) + return PTR_ERR(res); + + req->flags |= REQ_F_IMPORT_BUFFER; + return 0; +} |
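
The hunks above add and export io_buffer_register_bvec() and io_buffer_unregister_bvec(), which let a uring_cmd driver publish an in-flight block request's bio_vecs as a kernel-backed fixed buffer (imu->is_kbuf) in the ring's buffer table and later drop that registration. The following is a minimal sketch of how a caller might drive the pair; the demo_* driver names, callbacks, and surrounding flow are illustrative assumptions, and only the two exported functions and their signatures are taken from the diff (their declarations are assumed to live in <linux/io_uring/cmd.h>, the header this diff starts including).

/*
 * Illustrative sketch, not from the diff: a uring_cmd driver exposing an
 * in-flight request's pages as fixed buffer @index, then dropping the
 * registration when the command is torn down.  All demo_* names are
 * hypothetical; only io_buffer_register_bvec()/io_buffer_unregister_bvec()
 * come from the code above.
 */
#include <linux/blk-mq.h>
#include <linux/io_uring/cmd.h>

/* Runs once the last reference to the registered buffer is dropped. */
static void demo_buf_release(void *priv)
{
	struct request *rq = priv;

	blk_mq_end_request(rq, BLK_STS_OK);
}

/* Publish rq's bio_vecs in the ring's buffer table at slot @index. */
static int demo_attach_rq(struct io_uring_cmd *cmd, struct request *rq,
			  unsigned int index, unsigned int issue_flags)
{
	return io_buffer_register_bvec(cmd, rq, demo_buf_release, index,
				       issue_flags);
}

/*
 * Remove the buffer from the table again; demo_buf_release() fires as soon
 * as all fixed-buffer I/O against the slot has completed.
 */
static int demo_detach_rq(struct io_uring_cmd *cmd, unsigned int index,
			  unsigned int issue_flags)
{
	return io_buffer_unregister_bvec(cmd, index, issue_flags);
}

Between attach and detach, userspace can target the slot with fixed-buffer opcodes; imu->dir, set from rq_data_dir() in io_buffer_register_bvec(), restricts the allowed transfer direction.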