-rw-r--r--	io_uring/alloc_cache.h	2
-rw-r--r--	io_uring/futex.c	4
-rw-r--r--	io_uring/io_uring.c	3
-rw-r--r--	io_uring/memmap.c	46
-rw-r--r--	io_uring/poll.c	6
-rw-r--r--	io_uring/register.c	36
-rw-r--r--	io_uring/rsrc.c	5
-rw-r--r--	io_uring/rsrc.h	9
-rw-r--r--	io_uring/rw.c	4
-rw-r--r--	io_uring/tctx.c	15
-rw-r--r--	io_uring/zcrx.c	5
11 files changed, 111 insertions, 24 deletions
diff --git a/io_uring/alloc_cache.h b/io_uring/alloc_cache.h
index 45fcd8b3b824..962b6e2d04cc 100644
--- a/io_uring/alloc_cache.h
+++ b/io_uring/alloc_cache.h
@@ -64,7 +64,7 @@ static inline void *io_cache_alloc(struct io_alloc_cache *cache, gfp_t gfp)
static inline void io_cache_free(struct io_alloc_cache *cache, void *obj)
{
if (!io_alloc_cache_put(cache, obj))
- kfree(obj);
+ kvfree(obj);
}
#endif
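
The kfree() -> kvfree() switch is allocation-driven: objects entering this cache can come from kvmalloc(), which falls back to vmalloc() for larger sizes, and a vmalloc pointer must never be passed to kfree(). A minimal kernel-style sketch of the pairing rule (demo_alloc/demo_free are hypothetical names, not io_uring functions):

#include <linux/mm.h>
#include <linux/slab.h>

/* kvfree() checks is_vmalloc_addr() and routes to vfree() or kfree(),
 * so it is the only safe free for an object of unknown origin. */
static void *demo_alloc(size_t sz)
{
	return kvmalloc(sz, GFP_KERNEL);	/* slab or vmalloc memory */
}

static void demo_free(void *obj)
{
	kvfree(obj);	/* kfree(obj) is invalid if obj is vmalloc-backed */
}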
diff --git a/io_uring/futex.c b/io_uring/futex.c
index fd503c24b428..9cc1788ef4c6 100644
--- a/io_uring/futex.c
+++ b/io_uring/futex.c
@@ -159,8 +159,10 @@ static void io_futex_wakev_fn(struct wake_q_head *wake_q, struct futex_q *q)
struct io_kiocb *req = q->wake_data;
struct io_futexv_data *ifd = req->async_data;
- if (!io_futexv_claim(ifd))
+ if (!io_futexv_claim(ifd)) {
+ __futex_wake_mark(q);
return;
+ }
if (unlikely(!__futex_wake_mark(q)))
return;
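
The added __futex_wake_mark() call handles the claim-race loser: io_futexv_claim() lets exactly one futex in a vectored wait complete the request, but losing that race does not undo the wake the futex core already committed for this futex_q, and the hunk implies the waiter-side cleanup relies on a committed wake being marked. The fixed callback shape, with the invariant spelled out (names as in the hunk):

/* Invariant: once the core invokes this wake callback, the futex_q must
 * be marked woken on every exit path; only the request completion is
 * exclusive to the claim winner. */
if (!io_futexv_claim(ifd)) {
	__futex_wake_mark(q);		/* lost the claim, still woken */
	return;
}
if (unlikely(!__futex_wake_mark(q)))
	return;				/* already handled elsewhere */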
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index dd6326dc5f88..4ed998d60c09 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2575,7 +2575,8 @@ struct file *io_uring_ctx_get_file(unsigned int fd, bool registered)
return ERR_PTR(-EBADF);
if (io_is_uring_fops(file))
return file;
- fput(file);
+ if (!registered)
+ fput(file);
return ERR_PTR(-EOPNOTSUPP);
}
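
The conditional fput() encodes reference ownership: a plain fd lookup (fget()) hands the caller a file reference to drop on error, whereas a registered ring fd resolves through the task's registered-ring table and that reference stays owned by the registration. A sketch of the split, with registered_ring_file() as a hypothetical stand-in for the registered-lookup path:

struct file *file = registered ? registered_ring_file(fd) : fget(fd);

if (!file)
	return ERR_PTR(-EBADF);
if (io_is_uring_fops(file))
	return file;
if (!registered)
	fput(file);	/* drop only the reference fget() handed us */
return ERR_PTR(-EOPNOTSUPP);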
diff --git a/io_uring/memmap.c b/io_uring/memmap.c
index e6958968975a..4f9b439319c4 100644
--- a/io_uring/memmap.c
+++ b/io_uring/memmap.c
@@ -366,9 +366,53 @@ unsigned long io_uring_get_unmapped_area(struct file *filp, unsigned long addr,
#else /* !CONFIG_MMU */
+/*
+ * Drop the pages that were initially referenced and added in
+ * io_uring_mmap(). We cannot have had a mremap() as that isn't
+ * supported, so the vma must be identical to the one we initially
+ * referenced and mapped, and partial unmaps and splitting aren't
+ * possible on a file-backed mapping.
+ */
+static void io_uring_nommu_vm_close(struct vm_area_struct *vma)
+{
+ unsigned long index;
+
+ for (index = vma->vm_start; index < vma->vm_end; index += PAGE_SIZE)
+ put_page(virt_to_page((void *) index));
+}
+
+static const struct vm_operations_struct io_uring_nommu_vm_ops = {
+ .close = io_uring_nommu_vm_close,
+};
+
int io_uring_mmap(struct file *file, struct vm_area_struct *vma)
{
- return is_nommu_shared_mapping(vma->vm_flags) ? 0 : -EINVAL;
+ struct io_ring_ctx *ctx = file->private_data;
+ struct io_mapped_region *region;
+ unsigned long i;
+
+ if (!is_nommu_shared_mapping(vma->vm_flags))
+ return -EINVAL;
+
+ guard(mutex)(&ctx->mmap_lock);
+ region = io_mmap_get_region(ctx, vma->vm_pgoff);
+ if (!region || !io_region_is_set(region))
+ return -EINVAL;
+
+ if ((vma->vm_end - vma->vm_start) !=
+ (unsigned long) region->nr_pages << PAGE_SHIFT)
+ return -EINVAL;
+
+ /*
+ * Pin the pages so io_free_region()'s release_pages() does not
+ * drop the last reference while this VMA exists. delete_vma()
+ * in mm/nommu.c calls vma_close() which runs ->close above.
+ */
+ for (i = 0; i < region->nr_pages; i++)
+ get_page(region->pages[i]);
+
+ vma->vm_ops = &io_uring_nommu_vm_ops;
+ return 0;
}
unsigned int io_uring_nommu_mmap_capabilities(struct file *file)
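
Two details make the nommu scheme sound. First, without an MMU a shared file mapping aliases the backing pages directly, so every address in [vm_start, vm_end) is a kernel virtual address and virt_to_page() in the vm_close handler recovers the right struct page. Second, the references pair exactly, which is what the length check guards:

/* mmap:  region->nr_pages get_page() calls
 * close: (vm_end - vm_start) >> PAGE_SHIFT put_page() calls
 * The check below forces those counts to be equal up front, and the
 * no-mremap()/no-split rules keep them equal for the VMA's lifetime. */
if ((vma->vm_end - vma->vm_start) !=
    (unsigned long)region->nr_pages << PAGE_SHIFT)
	return -EINVAL;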
diff --git a/io_uring/poll.c b/io_uring/poll.c
index 74eef7884159..0204affdc308 100644
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -93,7 +93,7 @@ static bool io_poll_get_ownership_slowpath(struct io_kiocb *req)
*/
static inline bool io_poll_get_ownership(struct io_kiocb *req)
{
- if (unlikely(atomic_read(&req->poll_refs) >= IO_POLL_REF_BIAS))
+ if (unlikely((unsigned int)atomic_read(&req->poll_refs) >= IO_POLL_REF_BIAS))
return io_poll_get_ownership_slowpath(req);
return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK);
}
@@ -417,8 +417,10 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
* disable multishot as there is a circular dependency between
* CQ posting and triggering the event.
*/
- if (mask & EPOLL_URING_WAKE)
+ if (mask & EPOLL_URING_WAKE) {
poll->events |= EPOLLONESHOT;
+ req->apoll_events |= EPOLLONESHOT;
+ }
/* optional, saves extra locking for removal in tw handler */
if (mask && poll->events & EPOLLONESHOT) {
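
Two fixes here. The cast matters because poll_refs doubles as a bit field: the hunk implies flag bits (such as a cancel flag) live in the high bits of the counter, and once bit 31 is set atomic_read() yields a negative int, so the signed comparison against IO_POLL_REF_BIAS is false and the slowpath is wrongly skipped. A standalone demonstration of the signedness trap:

#include <assert.h>

int main(void)
{
	int refs = (int)0x80000001u;	/* counter with a bit-31 flag set */

	assert(!(refs >= 128));			/* signed: slowpath skipped */
	assert((unsigned int)refs >= 128u);	/* unsigned: slowpath taken */
	return 0;
}

The second hunk latches EPOLLONESHOT into req->apoll_events as well as poll->events, which suggests the live event mask can be rebuilt from apoll_events on re-arm; setting only poll->events would lose the one-shot downgrade at that point.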
diff --git a/io_uring/register.c b/io_uring/register.c
index 24e593332d1a..dce5e2f9cf77 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -599,10 +599,20 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
if (tail - old_head > p->sq_entries)
goto overflow;
for (i = old_head; i < tail; i++) {
- unsigned src_head = i & (ctx->sq_entries - 1);
- unsigned dst_head = i & (p->sq_entries - 1);
-
- n.sq_sqes[dst_head] = o.sq_sqes[src_head];
+ unsigned index, dst_mask, src_mask;
+ size_t sq_size;
+
+ index = i;
+ sq_size = sizeof(struct io_uring_sqe);
+ src_mask = ctx->sq_entries - 1;
+ dst_mask = p->sq_entries - 1;
+ if (ctx->flags & IORING_SETUP_SQE128) {
+ index <<= 1;
+ sq_size <<= 1;
+ src_mask = (ctx->sq_entries << 1) - 1;
+ dst_mask = (p->sq_entries << 1) - 1;
+ }
+ memcpy(&n.sq_sqes[index & dst_mask], &o.sq_sqes[index & src_mask], sq_size);
}
WRITE_ONCE(n.rings->sq.head, old_head);
WRITE_ONCE(n.rings->sq.tail, tail);
@@ -619,10 +629,20 @@ overflow:
goto out;
}
for (i = old_head; i < tail; i++) {
- unsigned src_head = i & (ctx->cq_entries - 1);
- unsigned dst_head = i & (p->cq_entries - 1);
-
- n.rings->cqes[dst_head] = o.rings->cqes[src_head];
+ unsigned index, dst_mask, src_mask;
+ size_t cq_size;
+
+ index = i;
+ cq_size = sizeof(struct io_uring_cqe);
+ src_mask = ctx->cq_entries - 1;
+ dst_mask = p->cq_entries - 1;
+ if (ctx->flags & IORING_SETUP_CQE32) {
+ index <<= 1;
+ cq_size <<= 1;
+ src_mask = (ctx->cq_entries << 1) - 1;
+ dst_mask = (p->cq_entries << 1) - 1;
+ }
+ memcpy(&n.rings->cqes[index & dst_mask], &o.rings->cqes[index & src_mask], cq_size);
}
WRITE_ONCE(n.rings->cq.head, old_head);
WRITE_ONCE(n.rings->cq.tail, tail);
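
Both copy loops previously indexed by logical entry, which breaks the big-entry ring formats: with IORING_SETUP_SQE128 each logical SQE spans two struct io_uring_sqe slots (128 bytes) and the slot array holds 2 * sq_entries slots, and IORING_SETUP_CQE32 doubles CQEs the same way. A worked example plus the size rule the hunks implement (sqe_copy_size() is a hypothetical helper mirroring the hunk's arithmetic, not an io_uring function):

/* sq_entries = 8 with SQE128: 16 slots of 64 bytes, slot mask 15.
 * Logical entry i = 5:
 *   index   = 5 << 1  = 10    first of its two slots
 *   sq_size = 64 << 1 = 128   bytes to memcpy
 *   slot    = 10 & 15 = 10
 * The old code copied 64 bytes from slot (5 & 7) = 5: the wrong slot,
 * and only half of the entry. */
static size_t sqe_copy_size(unsigned int flags)
{
	return sizeof(struct io_uring_sqe)
		<< !!(flags & IORING_SETUP_SQE128);
}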
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index fd36e0e319a2..650303626be6 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -168,7 +168,7 @@ bool io_rsrc_cache_init(struct io_ring_ctx *ctx)
void io_rsrc_cache_free(struct io_ring_ctx *ctx)
{
io_alloc_cache_free(&ctx->node_cache, kfree);
- io_alloc_cache_free(&ctx->imu_cache, kfree);
+ io_alloc_cache_free(&ctx->imu_cache, kvfree);
}
static void io_clear_table_tags(struct io_rsrc_data *data)
@@ -238,6 +238,9 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
continue;
i = up->offset + done;
+ if (i >= ctx->file_table.data.nr)
+ break;
+ i = array_index_nospec(i, ctx->file_table.data.nr);
if (io_reset_rsrc_node(ctx, &ctx->file_table.data, i))
io_file_bitmap_clear(&ctx->file_table, i);
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index cff0f8834c35..44e3386f7c1c 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -109,10 +109,15 @@ static inline void io_put_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node
}
static inline bool io_reset_rsrc_node(struct io_ring_ctx *ctx,
- struct io_rsrc_data *data, int index)
+ struct io_rsrc_data *data,
+ unsigned int index)
{
- struct io_rsrc_node *node = data->nodes[index];
+ struct io_rsrc_node *node;
+ if (index >= data->nr)
+ return false;
+ index = array_index_nospec(index, data->nr);
+ node = data->nodes[index];
if (!node)
return false;
io_put_rsrc_node(ctx, node);
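
Both rsrc hunks (rsrc.c above and rsrc.h here) apply the same Spectre-v1 hardening to a user-controlled index: the bounds check alone is not enough, because a mispredicted branch can still issue the out-of-bounds load speculatively, and array_index_nospec() clamps the index without a branch. A minimal sketch of the pattern (lookup_node is a hypothetical name):

#include <linux/nospec.h>

static struct io_rsrc_node *lookup_node(struct io_rsrc_data *data,
					unsigned int index)
{
	if (index >= data->nr)
		return NULL;
	/* force index into [0, data->nr) even under speculation */
	index = array_index_nospec(index, data->nr);
	return data->nodes[index];
}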
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 20654deff84d..e729e0e7657e 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -580,7 +580,7 @@ void io_req_rw_complete(struct io_tw_req tw_req, io_tw_token_t tw)
io_req_io_end(req);
if (req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))
- req->cqe.flags |= io_put_kbuf(req, req->cqe.res, NULL);
+ req->cqe.flags |= io_put_kbuf(req, max(req->cqe.res, 0), NULL);
io_req_rw_cleanup(req, 0);
io_req_task_complete(tw_req, tw);
@@ -1379,7 +1379,7 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
list_del(&req->iopoll_node);
wq_list_add_tail(&req->comp_list, &ctx->submit_state.compl_reqs);
nr_events++;
- req->cqe.flags = io_put_kbuf(req, req->cqe.res, NULL);
+ req->cqe.flags = io_put_kbuf(req, max(req->cqe.res, 0), NULL);
if (!io_is_uring_cmd(req))
io_req_rw_cleanup(req, 0);
}
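
Both io_put_kbuf() call sites gain a max(req->cqe.res, 0) clamp because cqe.res holds a negative errno on failure, while the length argument means "bytes transferred" and feeds provided-buffer accounting; an errno passed through unclamped would be accounted as if it were a length. What the clamp evaluates to:

#include <errno.h>

static int kbuf_len(int res)
{
	return res > 0 ? res : 0;	/* max(res, 0) */
}
/* kbuf_len(-EIO) == 0: a failed request consumed no buffer bytes */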
diff --git a/io_uring/tctx.c b/io_uring/tctx.c
index 61533f30494f..6af62ca9baba 100644
--- a/io_uring/tctx.c
+++ b/io_uring/tctx.c
@@ -146,9 +146,13 @@ int __io_uring_add_tctx_node(struct io_ring_ctx *ctx)
if (IS_ERR(tctx))
return PTR_ERR(tctx);
- if (ctx->int_flags & IO_RING_F_IOWQ_LIMITS_SET) {
- unsigned int limits[2] = { ctx->iowq_limits[0],
- ctx->iowq_limits[1], };
+ if (data_race(ctx->int_flags) & IO_RING_F_IOWQ_LIMITS_SET) {
+ unsigned int limits[2];
+
+ mutex_lock(&ctx->uring_lock);
+ limits[0] = ctx->iowq_limits[0];
+ limits[1] = ctx->iowq_limits[1];
+ mutex_unlock(&ctx->uring_lock);
ret = io_wq_max_workers(tctx->io_wq, limits);
if (ret)
@@ -171,7 +175,10 @@ int __io_uring_add_tctx_node(struct io_ring_ctx *ctx)
}
if (!current->io_uring) {
err_free:
- io_wq_put_and_exit(tctx->io_wq);
+ if (tctx->io_wq) {
+ io_wq_exit_start(tctx->io_wq);
+ io_wq_put_and_exit(tctx->io_wq);
+ }
percpu_counter_destroy(&tctx->inflight);
kfree(tctx);
}
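
Two hardenings in this file. The limits hunk splits the read into a lockless flag test, annotated data_race() because another task may update int_flags concurrently, and then copies both iowq_limits words under uring_lock so a concurrent limits update cannot be observed half-applied; the error-path hunk additionally tolerates a NULL io_wq and starts worker exit before the final put. The locked-snapshot idiom in isolation:

if (data_race(ctx->int_flags) & IO_RING_F_IOWQ_LIMITS_SET) {
	unsigned int limits[2];

	mutex_lock(&ctx->uring_lock);	/* writers hold this lock */
	limits[0] = ctx->iowq_limits[0];
	limits[1] = ctx->iowq_limits[1];
	mutex_unlock(&ctx->uring_lock);
	/* limits[] is now a coherent pair for io_wq_max_workers() */
}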
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index 9a83d7eb4210..7b93c87b8371 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -396,6 +396,7 @@ static int io_allocate_rbuf_ring(struct io_ring_ctx *ctx,
ifq->rq.ring = (struct io_uring *)ptr;
ifq->rq.rqes = (struct io_uring_zcrx_rqe *)(ptr + off);
+ memset(ifq->rq.ring, 0, sizeof(*ifq->rq.ring));
return 0;
}
@@ -579,13 +580,13 @@ static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq)
if (ifq->area)
io_zcrx_free_area(ifq, ifq->area);
- free_uid(ifq->user);
if (ifq->mm_account)
mmdrop(ifq->mm_account);
if (ifq->dev)
put_device(ifq->dev);
io_free_rbuf_ring(ifq);
+ free_uid(ifq->user);
mutex_destroy(&ifq->pp_lock);
kfree(ifq);
}
@@ -601,6 +602,8 @@ static void io_zcrx_return_niov_freelist(struct net_iov *niov)
struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
guard(spinlock_bh)(&area->freelist_lock);
+ if (WARN_ON_ONCE(area->free_count >= area->nia.num_niovs))
+ return;
area->freelist[area->free_count++] = net_iov_idx(niov);
}