Diffstat (limited to 'io_uring')
-rw-r--r--  io_uring/bpf_filter.c |  2
-rw-r--r--  io_uring/eventfd.c    | 10
-rw-r--r--  io_uring/fdinfo.c     |  4
-rw-r--r--  io_uring/io_uring.c   |  4
-rw-r--r--  io_uring/kbuf.c       | 27
-rw-r--r--  io_uring/poll.c       |  9
-rw-r--r--  io_uring/register.c   | 15
-rw-r--r--  io_uring/tw.c         | 22
8 files changed, 76 insertions(+), 17 deletions(-)
diff --git a/io_uring/bpf_filter.c b/io_uring/bpf_filter.c
index 6a98750e38b0..c0037632b7af 100644
--- a/io_uring/bpf_filter.c
+++ b/io_uring/bpf_filter.c
@@ -85,7 +85,7 @@ int __io_uring_run_bpf_filters(struct io_bpf_filter __rcu **filters,
do {
if (filter == &dummy_filter)
return -EACCES;
- ret = bpf_prog_run(filter->prog, &bpf_ctx);
+ ret = bpf_prog_run_pin_on_cpu(filter->prog, &bpf_ctx);
if (!ret)
return -EACCES;
filter = filter->next;
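
The one-line change above runs each filter via bpf_prog_run_pin_on_cpu(), which keeps the task on the current CPU for the duration of the program so any per-CPU state the BPF program touches stays coherent even from a preemptible caller. A stand-alone userspace sketch of the surrounding chain walk, with plain function pointers standing in for BPF programs and the dummy_filter sentinel modeled explicitly (all names here are illustrative, not the kernel's):

#include <errno.h>
#include <stdio.h>

struct filter {
        int (*prog)(void *ctx);         /* 0 denies, nonzero allows */
        struct filter *next;
};

static struct filter dummy_filter;      /* sentinel: poisoned chain, deny */

static int run_filters(struct filter *filter, void *ctx)
{
        do {
                if (filter == &dummy_filter)
                        return -EACCES;
                if (!filter->prog(ctx))
                        return -EACCES;
                filter = filter->next;
        } while (filter);
        return 0;
}

static int allow_all(void *ctx) { (void)ctx; return 1; }

int main(void)
{
        struct filter f = { .prog = allow_all };

        printf("verdict %d\n", run_filters(&f, NULL));  /* 0: allowed */
        return 0;
}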
diff --git a/io_uring/eventfd.c b/io_uring/eventfd.c
index cbea1c289485..7482a7dc6b38 100644
--- a/io_uring/eventfd.c
+++ b/io_uring/eventfd.c
@@ -76,11 +76,15 @@ void io_eventfd_signal(struct io_ring_ctx *ctx, bool cqe_event)
{
bool skip = false;
struct io_ev_fd *ev_fd;
-
- if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
- return;
+ struct io_rings *rings;
guard(rcu)();
+
+ rings = rcu_dereference(ctx->rings_rcu);
+ if (!rings)
+ return;
+ if (READ_ONCE(rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED)
+ return;
ev_fd = rcu_dereference(ctx->io_ev_fd);
/*
* Check again if ev_fd exists in case an io_eventfd_unregister call
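
The rewritten function now fetches the rings through the RCU-protected ctx->rings_rcu pointer inside the guard(rcu)() section and tolerates NULL, since a concurrent ring resize can swap or retire the structure underneath it. A minimal userspace model of that read side, approximating rcu_dereference() with a C11 acquire load (structure layout and names are illustrative):

#include <stdatomic.h>
#include <stdio.h>

struct rings { unsigned int cq_flags; };

#define CQ_EVENTFD_DISABLED (1u << 0)

static _Atomic(struct rings *) rings_rcu;

static void eventfd_signal(void)
{
        /* fetch the pointer once; an acquire load plays the role of
         * rcu_dereference() in this model */
        struct rings *rings = atomic_load_explicit(&rings_rcu,
                                                   memory_order_acquire);

        if (!rings)
                return;         /* rings torn down or mid-resize */
        if (rings->cq_flags & CQ_EVENTFD_DISABLED)
                return;
        printf("signal eventfd\n");
}

int main(void)
{
        static struct rings r;

        atomic_store_explicit(&rings_rcu, &r, memory_order_release);
        eventfd_signal();
        return 0;
}

The important property is that the pointer is loaded exactly once per section; re-reading it mid-function could observe a different ring.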
diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
index 80178b69e05a..c2d3e45544bb 100644
--- a/io_uring/fdinfo.c
+++ b/io_uring/fdinfo.c
@@ -119,12 +119,14 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
sq_idx);
break;
}
- if ((++sq_head & sq_mask) == 0) {
+ if (sq_idx == sq_mask) {
seq_printf(m,
"%5u: corrupted sqe, wrapping 128B entry\n",
sq_idx);
break;
}
+ sq_head++;
+ i++;
sqe128 = true;
}
seq_printf(m, "%5u: opcode:%s, fd:%d, flags:%x, off:%llu, "
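
The fdinfo fix above stops advancing sq_head before the wrap test: a 128B SQE occupies two consecutive slots, so an entry whose first slot is the final index (sq_idx == sq_mask) cannot have a valid second half, and the old pre-increment both skewed the head and tested the wrong slot. A simplified stand-alone version of the iteration, collapsed to a single counter and with hypothetical names:

#include <stdbool.h>
#include <stdio.h>

static void dump_sq(const bool *is_128b, unsigned int sq_head,
                    unsigned int sq_tail, unsigned int sq_mask)
{
        unsigned int i;

        for (i = sq_head; i < sq_tail; i++) {
                unsigned int sq_idx = i & sq_mask;

                if (is_128b[sq_idx]) {
                        /* second half would wrap: the entry is bogus */
                        if (sq_idx == sq_mask) {
                                printf("%5u: corrupted sqe, wrapping 128B entry\n",
                                       sq_idx);
                                break;
                        }
                        i++;    /* consume the second slot as well */
                }
                printf("%5u: %s entry\n", sq_idx,
                       is_128b[sq_idx] ? "128B" : "64B");
        }
}

int main(void)
{
        /* slot 7 starts a 128B entry in an 8-slot ring: must be flagged */
        bool is_128b[8] = { false, true, false, false,
                            false, false, false, true };

        dump_sq(is_128b, 0, 8, 7);
        return 0;
}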
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index aa95703165f1..9a37035e76c0 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -1745,7 +1745,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
* well as 2 contiguous entries.
*/
if (!(ctx->flags & IORING_SETUP_SQE_MIXED) || *left < 2 ||
- !(ctx->cached_sq_head & (ctx->sq_entries - 1)))
+ (unsigned)(sqe - ctx->sq_sqes) >= ctx->sq_entries - 1)
return io_init_fail_req(req, -EINVAL);
/*
* A 128b operation on a mixed SQ uses two entries, so we have
@@ -2066,6 +2066,7 @@ static void io_rings_free(struct io_ring_ctx *ctx)
io_free_region(ctx->user, &ctx->sq_region);
io_free_region(ctx->user, &ctx->ring_region);
ctx->rings = NULL;
+ RCU_INIT_POINTER(ctx->rings_rcu, NULL);
ctx->sq_sqes = NULL;
}
@@ -2703,6 +2704,7 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
if (ret)
return ret;
ctx->rings = rings = io_region_get_ptr(&ctx->ring_region);
+ rcu_assign_pointer(ctx->rings_rcu, rings);
if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
ctx->sq_array = (u32 *)((char *)rings + rl->sq_array_offset);
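
The first hunk swaps a check based on the masked cached_sq_head for one derived from the sqe pointer itself, which stays correct regardless of how the head maps onto the array; the later hunks pair the plain ctx->rings assignment with an RCU-visible copy in ctx->rings_rcu at both setup and teardown. A small sketch of the pointer-derived bounds test (types and names invented for the example):

#include <stdbool.h>
#include <stdio.h>

struct sqe { unsigned char data[64]; };

static bool big_sqe_fits(const struct sqe *sqe, const struct sqe *sq_sqes,
                         unsigned int sq_entries)
{
        /* slot index recovered from the pointer, not from a masked head */
        unsigned int idx = (unsigned int)(sqe - sq_sqes);

        return idx < sq_entries - 1;    /* the second slot must exist */
}

int main(void)
{
        struct sqe ring[8];

        printf("%d %d\n", big_sqe_fits(&ring[3], ring, 8),      /* 1 */
               big_sqe_fits(&ring[7], ring, 8));                /* 0 */
        return 0;
}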
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index 2ffa95b1c601..5257b3aad395 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -34,6 +34,10 @@ struct io_provide_buf {
static bool io_kbuf_inc_commit(struct io_buffer_list *bl, int len)
{
+ /* No data consumed, return false early to avoid consuming the buffer */
+ if (!len)
+ return false;
+
while (len) {
struct io_uring_buf *buf;
u32 buf_len, this_len;
@@ -111,9 +115,18 @@ bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags)
buf = req->kbuf;
bl = io_buffer_get_list(ctx, buf->bgid);
- list_add(&buf->list, &bl->buf_list);
- bl->nbufs++;
+ /*
+ * If the buffer list was upgraded to a ring-based one, or removed,
+ * while the request was in-flight in io-wq, drop it.
+ */
+ if (bl && !(bl->flags & IOBL_BUF_RING)) {
+ list_add(&buf->list, &bl->buf_list);
+ bl->nbufs++;
+ } else {
+ kfree(buf);
+ }
req->flags &= ~REQ_F_BUFFER_SELECTED;
+ req->kbuf = NULL;
io_ring_submit_unlock(ctx, issue_flags);
return true;
@@ -203,7 +216,8 @@ static struct io_br_sel io_ring_buffer_select(struct io_kiocb *req, size_t *len,
sel.addr = u64_to_user_ptr(READ_ONCE(buf->addr));
if (io_should_commit(req, issue_flags)) {
- io_kbuf_commit(req, sel.buf_list, *len, 1);
+ if (!io_kbuf_commit(req, sel.buf_list, *len, 1))
+ req->flags |= REQ_F_BUF_MORE;
sel.buf_list = NULL;
}
return sel;
@@ -336,7 +350,8 @@ int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg,
*/
if (ret > 0) {
req->flags |= REQ_F_BUFFERS_COMMIT | REQ_F_BL_NO_RECYCLE;
- io_kbuf_commit(req, sel->buf_list, arg->out_len, ret);
+ if (!io_kbuf_commit(req, sel->buf_list, arg->out_len, ret))
+ req->flags |= REQ_F_BUF_MORE;
}
} else {
ret = io_provided_buffers_select(req, &arg->out_len, sel->buf_list, arg->iovs);
@@ -382,8 +397,10 @@ static inline bool __io_put_kbuf_ring(struct io_kiocb *req,
if (bl)
ret = io_kbuf_commit(req, bl, len, nr);
- req->flags &= ~REQ_F_BUFFER_RING;
+ if (ret && (req->flags & REQ_F_BUF_MORE))
+ ret = false;
+ req->flags &= ~(REQ_F_BUFFER_RING | REQ_F_BUF_MORE);
return ret;
}
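
The kbuf changes all revolve around partial consumption: a zero-length commit must not consume a buffer, a recycled legacy buffer is dropped if its list was removed or upgraded to a ring while the request sat in io-wq, and a partial commit now raises REQ_F_BUF_MORE so __io_put_kbuf_ring() reports the buffer as still live. A userspace model of the incremental commit with the zero-length early return (semantics as read from the hunk; names are illustrative):

#include <stdbool.h>
#include <stdio.h>

struct buf { unsigned int len; };

/* true when every touched buffer was fully consumed */
static bool inc_commit(struct buf *bufs, unsigned int *head, int len)
{
        if (!len)       /* nothing consumed: keep the current buffer */
                return false;

        while (len) {
                struct buf *b = &bufs[*head];
                unsigned int this_len = b->len < (unsigned int)len ?
                                        b->len : (unsigned int)len;

                len -= this_len;
                b->len -= this_len;
                if (b->len)     /* partially consumed: stop, keep buffer */
                        return false;
                (*head)++;
        }
        return true;
}

int main(void)
{
        struct buf bufs[2] = { { .len = 8 }, { .len = 8 } };
        unsigned int head = 0;

        printf("%d\n", inc_commit(bufs, &head, 0));                 /* 0 */
        printf("%d head=%u\n", inc_commit(bufs, &head, 12), head);  /* 0 head=1 */
        return 0;
}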
diff --git a/io_uring/poll.c b/io_uring/poll.c
index b671b84657d9..2e9ee47d74bf 100644
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -272,6 +272,7 @@ static int io_poll_check_events(struct io_kiocb *req, io_tw_token_t tw)
atomic_andnot(IO_POLL_RETRY_FLAG, &req->poll_refs);
v &= ~IO_POLL_RETRY_FLAG;
}
+ v &= IO_POLL_REF_MASK;
}
/* the mask was stashed in __io_poll_execute */
@@ -304,8 +305,13 @@ static int io_poll_check_events(struct io_kiocb *req, io_tw_token_t tw)
return IOU_POLL_REMOVE_POLL_USE_RES;
}
} else {
- int ret = io_poll_issue(req, tw);
+ int ret;
+ /* multiple refs and HUP, ensure we loop once more */
+ if ((req->cqe.res & (POLLHUP | POLLRDHUP)) && v != 1)
+ v--;
+
+ ret = io_poll_issue(req, tw);
if (ret == IOU_COMPLETE)
return IOU_POLL_REMOVE_POLL_USE_RES;
else if (ret == IOU_REQUEUE)
@@ -321,7 +327,6 @@ static int io_poll_check_events(struct io_kiocb *req, io_tw_token_t tw)
* Release all references, retry if someone tried to restart
* task_work while we were executing it.
*/
- v &= IO_POLL_REF_MASK;
} while (atomic_sub_return(v, &req->poll_refs) & IO_POLL_REF_MASK);
io_napi_add(req);
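
Two things change in the reference loop: v is masked down to IO_POLL_REF_MASK as soon as the flag bits have been handled, so stale flag bits are never subtracted back into poll_refs, and a HUP with multiple references deliberately under-drops by one so the loop runs once more. A toy version of the drop loop using C11 atomics (constants and names are made up for illustration):

#include <stdatomic.h>
#include <stdio.h>

#define POLL_REF_MASK   ((1 << 20) - 1) /* low bits: references */
#define POLL_RETRY_FLAG (1 << 30)       /* upper bits: state flags */

static atomic_int poll_refs = 1 | POLL_RETRY_FLAG;

static void poll_check_events(void)
{
        int v;

        do {
                v = atomic_load(&poll_refs);
                /* flag bits must never be subtracted back, or a
                 * concurrent flag-setter's update would be wiped out */
                v &= POLL_REF_MASK;
                /* ... process events on behalf of v references ... */
        } while ((atomic_fetch_sub(&poll_refs, v) - v) & POLL_REF_MASK);
}

int main(void)
{
        poll_check_events();
        /* only the masked references were dropped; the flag survives */
        printf("poll_refs %#x\n", atomic_load(&poll_refs));
        return 0;
}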
diff --git a/io_uring/register.c b/io_uring/register.c
index 6015a3e9ce69..0148735f7711 100644
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -202,7 +202,7 @@ static int io_register_restrictions_task(void __user *arg, unsigned int nr_args)
return -EPERM;
/*
* Similar to seccomp, disallow setting a filter if task_no_new_privs
- * is true and we're not CAP_SYS_ADMIN.
+ * is false and we're not CAP_SYS_ADMIN.
*/
if (!task_no_new_privs(current) &&
!ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN))
@@ -238,7 +238,7 @@ static int io_register_bpf_filter_task(void __user *arg, unsigned int nr_args)
/*
* Similar to seccomp, disallow setting a filter if task_no_new_privs
- * is true and we're not CAP_SYS_ADMIN.
+ * is false and we're not CAP_SYS_ADMIN.
*/
if (!task_no_new_privs(current) &&
!ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN))
@@ -633,7 +633,15 @@ overflow:
ctx->sq_entries = p->sq_entries;
ctx->cq_entries = p->cq_entries;
+ /*
+ * Just mark any flag we may have missed and that the application
+ * should act on unconditionally. Worst case it'll be an extra
+ * syscall.
+ */
+ atomic_or(IORING_SQ_TASKRUN | IORING_SQ_NEED_WAKEUP, &n.rings->sq_flags);
ctx->rings = n.rings;
+ rcu_assign_pointer(ctx->rings_rcu, n.rings);
+
ctx->sq_sqes = n.sq_sqes;
swap_old(ctx, o, n, ring_region);
swap_old(ctx, o, n, sq_region);
@@ -642,6 +650,9 @@ overflow:
out:
spin_unlock(&ctx->completion_lock);
mutex_unlock(&ctx->mmap_lock);
+ /* Wait for concurrent io_ctx_mark_taskrun() */
+ if (to_free == &o)
+ synchronize_rcu_expedited();
io_register_free_rings(ctx, to_free);
if (ctx->sq_data)
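
The resize path now publishes the new rings with rcu_assign_pointer() and, when the old set is being retired, waits out a grace period before freeing it, so an io_ctx_mark_taskrun() racing in from task_work can never write into freed memory; pre-setting TASKRUN and NEED_WAKEUP covers any flag update lost around the swap, at worst costing one extra syscall. A userspace sketch of the swap-then-wait ordering using liburcu's classic API (link with -lurcu; plain synchronize_rcu() stands in for the kernel's expedited variant, and structure and function names are invented):

#include <stdio.h>
#include <stdlib.h>
#include <urcu.h>

struct rings { unsigned int sq_flags; };

#define SQ_NEED_WAKEUP (1u << 0)
#define SQ_TASKRUN     (1u << 2)

static struct rings *rings_rcu;

static void resize_rings(void)
{
        struct rings *old = rings_rcu;
        struct rings *fresh = calloc(1, sizeof(*fresh));

        /* mark any flag a racing updater might have set on the old
         * rings; worst case the application makes one extra syscall */
        fresh->sq_flags = old->sq_flags | SQ_TASKRUN | SQ_NEED_WAKEUP;
        rcu_assign_pointer(rings_rcu, fresh);
        synchronize_rcu();      /* no reader can still hold 'old' now */
        free(old);
}

int main(void)
{
        rcu_register_thread();
        rings_rcu = calloc(1, sizeof(struct rings));
        resize_rings();
        printf("sq_flags %#x\n", rings_rcu->sq_flags);
        free(rings_rcu);
        rcu_unregister_thread();
        return 0;
}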
diff --git a/io_uring/tw.c b/io_uring/tw.c
index 1ee2b8ab07c8..2f2b4ac4b126 100644
--- a/io_uring/tw.c
+++ b/io_uring/tw.c
@@ -152,6 +152,21 @@ void tctx_task_work(struct callback_head *cb)
WARN_ON_ONCE(ret);
}
+/*
+ * Sets IORING_SQ_TASKRUN in the sq_flags shared with userspace, using the
+ * RCU protected rings pointer to be safe against concurrent ring resizing.
+ */
+static void io_ctx_mark_taskrun(struct io_ring_ctx *ctx)
+{
+ lockdep_assert_in_rcu_read_lock();
+
+ if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) {
+ struct io_rings *rings = rcu_dereference(ctx->rings_rcu);
+
+ atomic_or(IORING_SQ_TASKRUN, &rings->sq_flags);
+ }
+}
+
void io_req_local_work_add(struct io_kiocb *req, unsigned flags)
{
struct io_ring_ctx *ctx = req->ctx;
@@ -206,8 +221,7 @@ void io_req_local_work_add(struct io_kiocb *req, unsigned flags)
*/
if (!head) {
- if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
- atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
+ io_ctx_mark_taskrun(ctx);
if (ctx->has_evfd)
io_eventfd_signal(ctx, false);
}
@@ -231,6 +245,10 @@ void io_req_normal_work_add(struct io_kiocb *req)
if (!llist_add(&req->io_task_work.node, &tctx->task_list))
return;
+ /*
+ * Doesn't need to use ->rings_rcu, as resizing isn't supported for
+ * !DEFER_TASKRUN.
+ */
if (ctx->flags & IORING_SETUP_TASKRUN_FLAG)
atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags);
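
io_ctx_mark_taskrun() is the reader side of that scheme: under the RCU read lock it snapshots rings_rcu once and ORs the flag through that snapshot, while the !DEFER_TASKRUN path keeps using ctx->rings directly because such rings are never resized. A matching liburcu reader sketch (names again illustrative; the NULL check is an assumption for this userspace model, where the kernel variant relies on ctx lifetime instead):

#include <stdatomic.h>
#include <stdio.h>
#include <urcu.h>

struct rings { atomic_uint sq_flags; };

#define SQ_TASKRUN (1u << 2)

static struct rings *rings_rcu;

static void mark_taskrun(void)
{
        struct rings *rings;

        rcu_read_lock();
        rings = rcu_dereference(rings_rcu);
        if (rings)      /* NULL check is for this model only */
                atomic_fetch_or(&rings->sq_flags, SQ_TASKRUN);
        rcu_read_unlock();
}

int main(void)
{
        static struct rings r;

        rcu_register_thread();
        rcu_assign_pointer(rings_rcu, &r);
        mark_taskrun();
        printf("sq_flags %#x\n", atomic_load(&r.sq_flags));
        rcu_unregister_thread();
        return 0;
}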