summaryrefslogtreecommitdiff
path: root/io_uring/io_uring.c
diff options
context:
space:
mode:
Diffstat (limited to 'io_uring/io_uring.c')
-rw-r--r--io_uring/io_uring.c183
1 files changed, 86 insertions, 97 deletions
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 06ff41484e29..f7acae5f7e1d 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -115,7 +115,7 @@
REQ_F_ASYNC_DATA)
#define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | REQ_F_LINK | REQ_F_HARDLINK |\
- IO_REQ_CLEAN_FLAGS)
+ REQ_F_REISSUE | IO_REQ_CLEAN_FLAGS)
#define IO_TCTX_REFS_CACHE_NR (1U << 10)
@@ -143,7 +143,8 @@ struct io_defer_entry {
static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
struct io_uring_task *tctx,
- bool cancel_all);
+ bool cancel_all,
+ bool is_sqpoll_thread);
static void io_queue_sqe(struct io_kiocb *req);
@@ -156,7 +157,7 @@ static int __read_mostly sysctl_io_uring_disabled;
static int __read_mostly sysctl_io_uring_group = -1;
#ifdef CONFIG_SYSCTL
-static struct ctl_table kernel_io_uring_disabled_table[] = {
+static const struct ctl_table kernel_io_uring_disabled_table[] = {
{
.procname = "io_uring_disabled",
.data = &sysctl_io_uring_disabled,
@@ -215,9 +216,9 @@ bool io_match_task_safe(struct io_kiocb *head, struct io_uring_task *tctx,
struct io_ring_ctx *ctx = head->ctx;
/* protect against races with linked timeouts */
- spin_lock_irq(&ctx->timeout_lock);
+ raw_spin_lock_irq(&ctx->timeout_lock);
matched = io_match_linked(head);
- spin_unlock_irq(&ctx->timeout_lock);
+ raw_spin_unlock_irq(&ctx->timeout_lock);
} else {
matched = io_match_linked(head);
}
@@ -314,16 +315,18 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
INIT_LIST_HEAD(&ctx->cq_overflow_list);
INIT_LIST_HEAD(&ctx->io_buffers_cache);
ret = io_alloc_cache_init(&ctx->apoll_cache, IO_POLL_ALLOC_CACHE_MAX,
- sizeof(struct async_poll));
+ sizeof(struct async_poll), 0);
ret |= io_alloc_cache_init(&ctx->netmsg_cache, IO_ALLOC_CACHE_MAX,
- sizeof(struct io_async_msghdr));
+ sizeof(struct io_async_msghdr),
+ offsetof(struct io_async_msghdr, clear));
ret |= io_alloc_cache_init(&ctx->rw_cache, IO_ALLOC_CACHE_MAX,
- sizeof(struct io_async_rw));
+ sizeof(struct io_async_rw),
+ offsetof(struct io_async_rw, clear));
ret |= io_alloc_cache_init(&ctx->uring_cache, IO_ALLOC_CACHE_MAX,
- sizeof(struct uring_cache));
+ sizeof(struct io_uring_cmd_data), 0);
spin_lock_init(&ctx->msg_lock);
ret |= io_alloc_cache_init(&ctx->msg_cache, IO_ALLOC_CACHE_MAX,
- sizeof(struct io_kiocb));
+ sizeof(struct io_kiocb), 0);
ret |= io_futex_cache_init(ctx);
if (ret)
goto free_ref;
@@ -333,7 +336,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
init_waitqueue_head(&ctx->cq_wait);
init_waitqueue_head(&ctx->poll_wq);
spin_lock_init(&ctx->completion_lock);
- spin_lock_init(&ctx->timeout_lock);
+ raw_spin_lock_init(&ctx->timeout_lock);
INIT_WQ_LIST(&ctx->iopoll_list);
INIT_LIST_HEAD(&ctx->io_buffers_comp);
INIT_LIST_HEAD(&ctx->defer_list);
@@ -350,7 +353,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
INIT_WQ_LIST(&ctx->submit_state.compl_reqs);
INIT_HLIST_HEAD(&ctx->cancelable_uring_cmd);
io_napi_init(ctx);
- mutex_init(&ctx->resize_lock);
+ mutex_init(&ctx->mmap_lock);
return ctx;
@@ -361,7 +364,7 @@ err:
io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free);
io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free);
io_alloc_cache_free(&ctx->uring_cache, kfree);
- io_alloc_cache_free(&ctx->msg_cache, io_msg_cache_free);
+ io_alloc_cache_free(&ctx->msg_cache, kfree);
io_futex_cache_free(ctx);
kvfree(ctx->cancel_table.hbs);
xa_destroy(&ctx->io_bl_xa);
@@ -498,10 +501,10 @@ static void io_prep_async_link(struct io_kiocb *req)
if (req->flags & REQ_F_LINK_TIMEOUT) {
struct io_ring_ctx *ctx = req->ctx;
- spin_lock_irq(&ctx->timeout_lock);
+ raw_spin_lock_irq(&ctx->timeout_lock);
io_for_each_link(cur, req)
io_prep_async_work(cur);
- spin_unlock_irq(&ctx->timeout_lock);
+ raw_spin_unlock_irq(&ctx->timeout_lock);
} else {
io_for_each_link(cur, req)
io_prep_async_work(cur);
@@ -514,7 +517,11 @@ static void io_queue_iowq(struct io_kiocb *req)
struct io_uring_task *tctx = req->tctx;
BUG_ON(!tctx);
- BUG_ON(!tctx->io_wq);
+
+ if ((current->flags & PF_KTHREAD) || !tctx->io_wq) {
+ io_req_task_queue_fail(req, -ECANCELED);
+ return;
+ }
/* init ->work of the whole link before punting */
io_prep_async_link(req);
@@ -546,8 +553,9 @@ void io_req_queue_iowq(struct io_kiocb *req)
io_req_task_work_add(req);
}
-static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
+static __cold noinline void io_queue_deferred(struct io_ring_ctx *ctx)
{
+ spin_lock(&ctx->completion_lock);
while (!list_empty(&ctx->defer_list)) {
struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
struct io_defer_entry, list);
@@ -558,6 +566,7 @@ static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
io_req_task_queue(de->req);
kfree(de);
}
+ spin_unlock(&ctx->completion_lock);
}
void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
@@ -566,11 +575,8 @@ void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
io_poll_wq_wake(ctx);
if (ctx->off_timeout_used)
io_flush_timeouts(ctx);
- if (ctx->drain_active) {
- spin_lock(&ctx->completion_lock);
+ if (ctx->drain_active)
io_queue_deferred(ctx);
- spin_unlock(&ctx->completion_lock);
- }
if (ctx->has_evfd)
io_eventfd_flush_signal(ctx);
}
@@ -1222,10 +1228,7 @@ static void io_req_normal_work_add(struct io_kiocb *req)
/* SQPOLL doesn't need the task_work added, it'll run it itself */
if (ctx->flags & IORING_SETUP_SQPOLL) {
- struct io_sq_data *sqd = ctx->sq_data;
-
- if (sqd->thread)
- __set_notify_signal(sqd->thread);
+ __set_notify_signal(tctx->task);
return;
}
@@ -1400,6 +1403,12 @@ static void io_free_batch_list(struct io_ring_ctx *ctx,
comp_list);
if (unlikely(req->flags & IO_REQ_CLEAN_SLOW_FLAGS)) {
+ if (req->flags & REQ_F_REISSUE) {
+ node = req->comp_list.next;
+ req->flags &= ~REQ_F_REISSUE;
+ io_queue_iowq(req);
+ continue;
+ }
if (req->flags & REQ_F_REFCOUNT) {
node = req->comp_list.next;
if (!req_ref_put_and_test(req))
@@ -1439,7 +1448,12 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx)
struct io_kiocb *req = container_of(node, struct io_kiocb,
comp_list);
- if (!(req->flags & REQ_F_CQE_SKIP) &&
+ /*
+ * Requests marked with REQUEUE should not post a CQE, they
+ * will go through the io-wq retry machinery and post one
+ * later.
+ */
+ if (!(req->flags & (REQ_F_CQE_SKIP | REQ_F_REISSUE)) &&
unlikely(!io_fill_cqe_req(ctx, req))) {
if (ctx->lockless_cq) {
spin_lock(&ctx->completion_lock);
@@ -1639,19 +1653,6 @@ io_req_flags_t io_file_get_flags(struct file *file)
return res;
}
-bool io_alloc_async_data(struct io_kiocb *req)
-{
- const struct io_issue_def *def = &io_issue_defs[req->opcode];
-
- WARN_ON_ONCE(!def->async_size);
- req->async_data = kmalloc(def->async_size, GFP_KERNEL);
- if (req->async_data) {
- req->flags |= REQ_F_ASYNC_DATA;
- return false;
- }
- return true;
-}
-
static u32 io_get_sequence(struct io_kiocb *req)
{
u32 seq = req->ctx->cached_sq_head;
@@ -2044,6 +2045,8 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
req->opcode = 0;
return io_init_fail_req(req, -EINVAL);
}
+ opcode = array_index_nospec(opcode, IORING_OP_LAST);
+
def = &io_issue_defs[opcode];
if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) {
/* enforce forwards compatibility on users */
@@ -2630,36 +2633,10 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? ret : 0;
}
-static void *io_rings_map(struct io_ring_ctx *ctx, unsigned long uaddr,
- size_t size)
-{
- return __io_uaddr_map(&ctx->ring_pages, &ctx->n_ring_pages, uaddr,
- size);
-}
-
-static void *io_sqes_map(struct io_ring_ctx *ctx, unsigned long uaddr,
- size_t size)
-{
- return __io_uaddr_map(&ctx->sqe_pages, &ctx->n_sqe_pages, uaddr,
- size);
-}
-
static void io_rings_free(struct io_ring_ctx *ctx)
{
- if (!(ctx->flags & IORING_SETUP_NO_MMAP)) {
- io_pages_unmap(ctx->rings, &ctx->ring_pages, &ctx->n_ring_pages,
- true);
- io_pages_unmap(ctx->sq_sqes, &ctx->sqe_pages, &ctx->n_sqe_pages,
- true);
- } else {
- io_pages_free(&ctx->ring_pages, ctx->n_ring_pages);
- ctx->n_ring_pages = 0;
- io_pages_free(&ctx->sqe_pages, ctx->n_sqe_pages);
- ctx->n_sqe_pages = 0;
- vunmap(ctx->rings);
- vunmap(ctx->sq_sqes);
- }
-
+ io_free_region(ctx, &ctx->sq_region);
+ io_free_region(ctx, &ctx->ring_region);
ctx->rings = NULL;
ctx->sq_sqes = NULL;
}
@@ -2731,7 +2708,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free);
io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free);
io_alloc_cache_free(&ctx->uring_cache, kfree);
- io_alloc_cache_free(&ctx->msg_cache, io_msg_cache_free);
+ io_alloc_cache_free(&ctx->msg_cache, kfree);
io_futex_cache_free(ctx);
io_destroy_buffers(ctx);
io_free_region(ctx, &ctx->param_region);
@@ -2809,13 +2786,12 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait)
if (unlikely(!ctx->poll_activated))
io_activate_pollwq(ctx);
-
- poll_wait(file, &ctx->poll_wq, wait);
/*
- * synchronizes with barrier from wq_has_sleeper call in
- * io_commit_cqring
+ * provides mb() which pairs with barrier from wq_has_sleeper
+ * call in io_commit_cqring
*/
- smp_rmb();
+ poll_wait(file, &ctx->poll_wq, wait);
+
if (!io_sqring_full(ctx))
mask |= EPOLLOUT | EPOLLWRNORM;
@@ -2894,7 +2870,8 @@ static __cold void io_ring_exit_work(struct work_struct *work)
if (ctx->flags & IORING_SETUP_DEFER_TASKRUN)
io_move_task_work_from_local(ctx);
- while (io_uring_try_cancel_requests(ctx, NULL, true))
+ /* The SQPOLL thread never reaches this path */
+ while (io_uring_try_cancel_requests(ctx, NULL, true, false))
cond_resched();
if (ctx->sq_data) {
@@ -3062,7 +3039,8 @@ static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
struct io_uring_task *tctx,
- bool cancel_all)
+ bool cancel_all,
+ bool is_sqpoll_thread)
{
struct io_task_cancel cancel = { .tctx = tctx, .all = cancel_all, };
enum io_wq_cancel cret;
@@ -3092,7 +3070,7 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
/* SQPOLL thread does its own polling */
if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) ||
- (ctx->sq_data && ctx->sq_data->thread == current)) {
+ is_sqpoll_thread) {
while (!wq_list_empty(&ctx->iopoll_list)) {
io_iopoll_try_reap_events(ctx);
ret = true;
@@ -3165,13 +3143,15 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd)
continue;
loop |= io_uring_try_cancel_requests(node->ctx,
current->io_uring,
- cancel_all);
+ cancel_all,
+ false);
}
} else {
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
loop |= io_uring_try_cancel_requests(ctx,
current->io_uring,
- cancel_all);
+ cancel_all,
+ true);
}
if (loop) {
@@ -3214,6 +3194,7 @@ end_wait:
void __io_uring_cancel(bool cancel_all)
{
+ io_uring_unreg_ringfd();
io_uring_cancel_generic(cancel_all, NULL);
}
@@ -3232,6 +3213,7 @@ static struct io_uring_reg_wait *io_get_ext_arg_reg(struct io_ring_ctx *ctx,
end > ctx->cq_wait_size))
return ERR_PTR(-EFAULT);
+ offset = array_index_nospec(offset, ctx->cq_wait_size - size);
return ctx->cq_wait_arg + offset;
}
@@ -3476,9 +3458,10 @@ bool io_is_uring_fops(struct file *file)
static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
struct io_uring_params *p)
{
+ struct io_uring_region_desc rd;
struct io_rings *rings;
size_t size, sq_array_offset;
- void *ptr;
+ int ret;
/* make sure these are sane, as we already accounted them */
ctx->sq_entries = p->sq_entries;
@@ -3489,15 +3472,17 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
if (size == SIZE_MAX)
return -EOVERFLOW;
- if (!(ctx->flags & IORING_SETUP_NO_MMAP))
- rings = io_pages_map(&ctx->ring_pages, &ctx->n_ring_pages, size);
- else
- rings = io_rings_map(ctx, p->cq_off.user_addr, size);
-
- if (IS_ERR(rings))
- return PTR_ERR(rings);
+ memset(&rd, 0, sizeof(rd));
+ rd.size = PAGE_ALIGN(size);
+ if (ctx->flags & IORING_SETUP_NO_MMAP) {
+ rd.user_addr = p->cq_off.user_addr;
+ rd.flags |= IORING_MEM_REGION_TYPE_USER;
+ }
+ ret = io_create_region(ctx, &ctx->ring_region, &rd, IORING_OFF_CQ_RING);
+ if (ret)
+ return ret;
+ ctx->rings = rings = io_region_get_ptr(&ctx->ring_region);
- ctx->rings = rings;
if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
ctx->sq_array = (u32 *)((char *)rings + sq_array_offset);
rings->sq_ring_mask = p->sq_entries - 1;
@@ -3514,17 +3499,18 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
return -EOVERFLOW;
}
- if (!(ctx->flags & IORING_SETUP_NO_MMAP))
- ptr = io_pages_map(&ctx->sqe_pages, &ctx->n_sqe_pages, size);
- else
- ptr = io_sqes_map(ctx, p->sq_off.user_addr, size);
-
- if (IS_ERR(ptr)) {
+ memset(&rd, 0, sizeof(rd));
+ rd.size = PAGE_ALIGN(size);
+ if (ctx->flags & IORING_SETUP_NO_MMAP) {
+ rd.user_addr = p->sq_off.user_addr;
+ rd.flags |= IORING_MEM_REGION_TYPE_USER;
+ }
+ ret = io_create_region(ctx, &ctx->sq_region, &rd, IORING_OFF_SQES);
+ if (ret) {
io_rings_free(ctx);
- return PTR_ERR(ptr);
+ return ret;
}
-
- ctx->sq_sqes = ptr;
+ ctx->sq_sqes = io_region_get_ptr(&ctx->sq_region);
return 0;
}
@@ -3732,7 +3718,8 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS |
IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP |
IORING_FEAT_LINKED_FILE | IORING_FEAT_REG_REG_RING |
- IORING_FEAT_RECVSEND_BUNDLE | IORING_FEAT_MIN_TIMEOUT;
+ IORING_FEAT_RECVSEND_BUNDLE | IORING_FEAT_MIN_TIMEOUT |
+ IORING_FEAT_RW_ATTR;
if (copy_to_user(params, p, sizeof(*p))) {
ret = -EFAULT;
@@ -3893,6 +3880,8 @@ static int __init io_uring_init(void)
BUILD_BUG_SQE_ELEM(46, __u16, __pad3[0]);
BUILD_BUG_SQE_ELEM(48, __u64, addr3);
BUILD_BUG_SQE_ELEM_SIZE(48, 0, cmd);
+ BUILD_BUG_SQE_ELEM(48, __u64, attr_ptr);
+ BUILD_BUG_SQE_ELEM(56, __u64, attr_type_mask);
BUILD_BUG_SQE_ELEM(56, __u64, __pad2);
BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=