summaryrefslogtreecommitdiff
path: root/io_uring/rsrc.c
diff options
context:
space:
mode:
Diffstat (limited to 'io_uring/rsrc.c')
-rw-r--r--io_uring/rsrc.c138
1 files changed, 77 insertions, 61 deletions
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index cc58defd88d4..dbb94d93279b 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -118,7 +118,7 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
}
}
-struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type)
+struct io_rsrc_node *io_rsrc_node_alloc(int type)
{
struct io_rsrc_node *node;
@@ -130,6 +130,18 @@ struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type)
return node;
}
+static void io_clear_table_tags(struct io_rsrc_data *data)
+{
+ int i;
+
+ for (i = 0; i < data->nr; i++) {
+ struct io_rsrc_node *node = data->nodes[i];
+
+ if (node)
+ node->tag = 0;
+ }
+}
+
__cold void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data)
{
if (!data->nr)
@@ -203,7 +215,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
err = -EBADF;
break;
}
- node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
+ node = io_rsrc_node_alloc(IORING_RSRC_FILE);
if (!node) {
err = -ENOMEM;
fput(file);
@@ -444,8 +456,6 @@ int io_files_update(struct io_kiocb *req, unsigned int issue_flags)
void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node)
{
- lockdep_assert_held(&ctx->uring_lock);
-
if (node->tag)
io_post_aux_cqe(ctx, node->tag, 0, 0);
@@ -525,7 +535,7 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
goto fail;
}
ret = -ENOMEM;
- node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
+ node = io_rsrc_node_alloc(IORING_RSRC_FILE);
if (!node) {
fput(file);
goto fail;
@@ -541,6 +551,7 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
io_file_table_set_alloc_range(ctx, 0, ctx->file_table.data.nr);
return 0;
fail:
+ io_clear_table_tags(&ctx->file_table.data);
io_sqe_files_unregister(ctx);
return ret;
}
@@ -626,11 +637,12 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages,
return ret;
}
-static bool io_do_coalesce_buffer(struct page ***pages, int *nr_pages,
- struct io_imu_folio_data *data, int nr_folios)
+static bool io_coalesce_buffer(struct page ***pages, int *nr_pages,
+ struct io_imu_folio_data *data)
{
struct page **page_array = *pages, **new_array = NULL;
int nr_pages_left = *nr_pages, i, j;
+ int nr_folios = data->nr_folios;
/* Store head pages only*/
new_array = kvmalloc_array(nr_folios, sizeof(struct page *),
@@ -667,27 +679,21 @@ static bool io_do_coalesce_buffer(struct page ***pages, int *nr_pages,
return true;
}
-static bool io_try_coalesce_buffer(struct page ***pages, int *nr_pages,
- struct io_imu_folio_data *data)
+bool io_check_coalesce_buffer(struct page **page_array, int nr_pages,
+ struct io_imu_folio_data *data)
{
- struct page **page_array = *pages;
struct folio *folio = page_folio(page_array[0]);
unsigned int count = 1, nr_folios = 1;
int i;
- if (*nr_pages <= 1)
- return false;
-
data->nr_pages_mid = folio_nr_pages(folio);
- if (data->nr_pages_mid == 1)
- return false;
-
data->folio_shift = folio_shift(folio);
+
/*
* Check if pages are contiguous inside a folio, and all folios have
* the same page count except for the head and tail.
*/
- for (i = 1; i < *nr_pages; i++) {
+ for (i = 1; i < nr_pages; i++) {
if (page_folio(page_array[i]) == folio &&
page_array[i] == page_array[i-1] + 1) {
count++;
@@ -715,7 +721,8 @@ static bool io_try_coalesce_buffer(struct page ***pages, int *nr_pages,
if (nr_folios == 1)
data->nr_pages_head = count;
- return io_do_coalesce_buffer(pages, nr_pages, data, nr_folios);
+ data->nr_folios = nr_folios;
+ return true;
}
static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
@@ -729,12 +736,12 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
size_t size;
int ret, nr_pages, i;
struct io_imu_folio_data data;
- bool coalesced;
+ bool coalesced = false;
if (!iov->iov_base)
return NULL;
- node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
+ node = io_rsrc_node_alloc(IORING_RSRC_BUFFER);
if (!node)
return ERR_PTR(-ENOMEM);
node->buf = NULL;
@@ -749,7 +756,10 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
}
/* If it's huge page(s), try to coalesce them into fewer bvec entries */
- coalesced = io_try_coalesce_buffer(&pages, &nr_pages, &data);
+ if (nr_pages > 1 && io_check_coalesce_buffer(pages, nr_pages, &data)) {
+ if (data.nr_pages_mid != 1)
+ coalesced = io_coalesce_buffer(&pages, &nr_pages, &data);
+ }
imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL);
if (!imu)
@@ -858,8 +868,10 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
}
ctx->buf_table = data;
- if (ret)
+ if (ret) {
+ io_clear_table_tags(&ctx->buf_table);
io_sqe_buffers_unregister(ctx);
+ }
return ret;
}
@@ -883,7 +895,7 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
* and advance us to the beginning.
*/
offset = buf_addr - imu->ubuf;
- iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len);
+ iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, len);
if (offset) {
/*
@@ -905,7 +917,6 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
const struct bio_vec *bvec = imu->bvec;
if (offset < bvec->bv_len) {
- iter->count -= offset;
iter->iov_offset = offset;
} else {
unsigned long seg_skip;
@@ -916,7 +927,6 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
iter->bvec += seg_skip;
iter->nr_segs -= seg_skip;
- iter->count -= bvec->bv_len + offset;
iter->iov_offset = offset & ((1UL << imu->folio_shift) - 1);
}
}
@@ -924,6 +934,16 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
return 0;
}
+/* Lock two rings at once. The rings must be different! */
+static void lock_two_rings(struct io_ring_ctx *ctx1, struct io_ring_ctx *ctx2)
+{
+ if (ctx1 > ctx2)
+ swap(ctx1, ctx2);
+ mutex_lock(&ctx1->uring_lock);
+ mutex_lock_nested(&ctx2->uring_lock, SINGLE_DEPTH_NESTING);
+}
+
+/* Both rings are locked by the caller. */
static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx,
struct io_uring_clone_buffers *arg)
{
@@ -931,6 +951,9 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
int i, ret, off, nr;
unsigned int nbufs;
+ lockdep_assert_held(&ctx->uring_lock);
+ lockdep_assert_held(&src_ctx->uring_lock);
+
/*
* Accounting state is shared between the two rings; that only works if
* both rings are accounted towards the same counters.
@@ -945,7 +968,7 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
if (ctx->buf_table.nr && !(arg->flags & IORING_REGISTER_DST_REPLACE))
return -EBUSY;
- nbufs = READ_ONCE(src_ctx->buf_table.nr);
+ nbufs = src_ctx->buf_table.nr;
if (!arg->nr)
arg->nr = nbufs;
else if (arg->nr > nbufs)
@@ -969,27 +992,20 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
}
}
- /*
- * Drop our own lock here. We'll setup the data we need and reference
- * the source buffers, then re-grab, check, and assign at the end.
- */
- mutex_unlock(&ctx->uring_lock);
-
- mutex_lock(&src_ctx->uring_lock);
ret = -ENXIO;
nbufs = src_ctx->buf_table.nr;
if (!nbufs)
- goto out_unlock;
+ goto out_free;
ret = -EINVAL;
if (!arg->nr)
arg->nr = nbufs;
else if (arg->nr > nbufs)
- goto out_unlock;
+ goto out_free;
ret = -EOVERFLOW;
if (check_add_overflow(arg->nr, arg->src_off, &off))
- goto out_unlock;
+ goto out_free;
if (off > nbufs)
- goto out_unlock;
+ goto out_free;
off = arg->dst_off;
i = arg->src_off;
@@ -1001,10 +1017,10 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
if (!src_node) {
dst_node = NULL;
} else {
- dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
+ dst_node = io_rsrc_node_alloc(IORING_RSRC_BUFFER);
if (!dst_node) {
ret = -ENOMEM;
- goto out_unlock;
+ goto out_free;
}
refcount_inc(&src_node->buf->refs);
@@ -1014,10 +1030,6 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
i++;
}
- /* Have a ref on the bufs now, drop src lock and re-grab our own lock */
- mutex_unlock(&src_ctx->uring_lock);
- mutex_lock(&ctx->uring_lock);
-
/*
* If asked for replace, put the old table. data->nodes[] holds both
* old and new nodes at this point.
@@ -1026,24 +1038,17 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
io_rsrc_data_free(ctx, &ctx->buf_table);
/*
- * ctx->buf_table should be empty now - either the contents are being
- * replaced and we just freed the table, or someone raced setting up
- * a buffer table while the clone was happening. If not empty, fall
- * through to failure handling.
+ * ctx->buf_table must be empty now - either the contents are being
+ * replaced and we just freed the table, or the contents are being
+ * copied to a ring that does not have buffers yet (checked at function
+ * entry).
*/
- if (!ctx->buf_table.nr) {
- ctx->buf_table = data;
- return 0;
- }
+ WARN_ON_ONCE(ctx->buf_table.nr);
+ ctx->buf_table = data;
+ return 0;
- mutex_unlock(&ctx->uring_lock);
- mutex_lock(&src_ctx->uring_lock);
- /* someone raced setting up buffers, dump ours */
- ret = -EBUSY;
-out_unlock:
+out_free:
io_rsrc_data_free(ctx, &data);
- mutex_unlock(&src_ctx->uring_lock);
- mutex_lock(&ctx->uring_lock);
return ret;
}
@@ -1057,6 +1062,7 @@ out_unlock:
int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
{
struct io_uring_clone_buffers buf;
+ struct io_ring_ctx *src_ctx;
bool registered_src;
struct file *file;
int ret;
@@ -1074,8 +1080,18 @@ int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
file = io_uring_register_get_file(buf.src_fd, registered_src);
if (IS_ERR(file))
return PTR_ERR(file);
- ret = io_clone_buffers(ctx, file->private_data, &buf);
- if (!registered_src)
- fput(file);
+
+ src_ctx = file->private_data;
+ if (src_ctx != ctx) {
+ mutex_unlock(&ctx->uring_lock);
+ lock_two_rings(ctx, src_ctx);
+ }
+
+ ret = io_clone_buffers(ctx, src_ctx, &buf);
+
+ if (src_ctx != ctx)
+ mutex_unlock(&src_ctx->uring_lock);
+
+ fput(file);
return ret;
}