summaryrefslogtreecommitdiff
path: root/fs/io_uring.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/io_uring.c')
-rw-r--r--fs/io_uring.c94
1 files changed, 73 insertions, 21 deletions
diff --git a/fs/io_uring.c b/fs/io_uring.c
index a78201b96179..ca8abde48b6c 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -890,6 +890,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
struct io_uring_files_update *ip,
unsigned nr_args);
static int io_grab_files(struct io_kiocb *req);
+static void io_complete_rw_common(struct kiocb *kiocb, long res);
static void io_cleanup_req(struct io_kiocb *req);
static int io_file_get(struct io_submit_state *state, struct io_kiocb *req,
int fd, struct file **out_file, bool fixed);
@@ -1273,6 +1274,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
if (cqe) {
clear_bit(0, &ctx->sq_check_overflow);
clear_bit(0, &ctx->cq_check_overflow);
+ ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW;
}
spin_unlock_irqrestore(&ctx->completion_lock, flags);
io_cqring_ev_posted(ctx);
@@ -1310,6 +1312,7 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
if (list_empty(&ctx->cq_overflow_list)) {
set_bit(0, &ctx->sq_check_overflow);
set_bit(0, &ctx->cq_check_overflow);
+ ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW;
}
req->flags |= REQ_F_OVERFLOW;
refcount_inc(&req->refs);
@@ -1749,6 +1752,14 @@ static void io_iopoll_queue(struct list_head *again)
do {
req = list_first_entry(again, struct io_kiocb, list);
list_del(&req->list);
+
+ /* shouldn't happen unless io_uring is dying, cancel reqs */
+ if (unlikely(!current->mm)) {
+ io_complete_rw_common(&req->rw.kiocb, -EAGAIN);
+ io_put_req(req);
+ continue;
+ }
+
refcount_inc(&req->refs);
io_queue_async_work(req);
} while (!list_empty(again));
@@ -1994,10 +2005,8 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
WRITE_ONCE(req->result, res);
/* order with io_poll_complete() checking ->result */
- if (res != -EAGAIN) {
- smp_wmb();
- WRITE_ONCE(req->iopoll_completed, 1);
- }
+ smp_wmb();
+ WRITE_ONCE(req->iopoll_completed, 1);
}
/*
@@ -4065,6 +4074,29 @@ struct io_poll_table {
int error;
};
+static int io_req_task_work_add(struct io_kiocb *req, struct callback_head *cb)
+{
+ struct task_struct *tsk = req->task;
+ struct io_ring_ctx *ctx = req->ctx;
+ int ret, notify = TWA_RESUME;
+
+ /*
+ * SQPOLL kernel thread doesn't need notification, just a wakeup.
+ * If we're not using an eventfd, then TWA_RESUME is always fine,
+ * as we won't have dependencies between request completions for
+ * other kernel wait conditions.
+ */
+ if (ctx->flags & IORING_SETUP_SQPOLL)
+ notify = 0;
+ else if (ctx->cq_ev_fd)
+ notify = TWA_SIGNAL;
+
+ ret = task_work_add(tsk, cb, notify);
+ if (!ret)
+ wake_up_process(tsk);
+ return ret;
+}
+
static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
__poll_t mask, task_work_func_t func)
{
@@ -4088,13 +4120,13 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
* of executing it. We can't safely execute it anyway, as we may not
* have the needed state needed for it anyway.
*/
- ret = task_work_add(tsk, &req->task_work, true);
+ ret = io_req_task_work_add(req, &req->task_work);
if (unlikely(ret)) {
WRITE_ONCE(poll->canceled, true);
tsk = io_wq_get_task(req->ctx->io_wq);
- task_work_add(tsk, &req->task_work, true);
+ task_work_add(tsk, &req->task_work, 0);
+ wake_up_process(tsk);
}
- wake_up_process(tsk);
return 1;
}
@@ -5353,9 +5385,6 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if ((ctx->flags & IORING_SETUP_IOPOLL) && req->file) {
const bool in_async = io_wq_current_is_worker();
- if (req->result == -EAGAIN)
- return -EAGAIN;
-
/* workqueue context doesn't hold uring_lock, grab it now */
if (in_async)
mutex_lock(&ctx->uring_lock);
@@ -6011,7 +6040,7 @@ static int io_sq_thread(void *data)
* If submit got -EBUSY, flag us as needing the application
* to enter the kernel to reap and flush events.
*/
- if (!to_submit || ret == -EBUSY) {
+ if (!to_submit || ret == -EBUSY || need_resched()) {
/*
* Drop cur_mm before scheduling, we can't hold it for
* long periods (or over schedule()). Do this before
@@ -6027,7 +6056,7 @@ static int io_sq_thread(void *data)
* more IO, we should wait for the application to
* reap events and wake us up.
*/
- if (!list_empty(&ctx->poll_list) ||
+ if (!list_empty(&ctx->poll_list) || need_resched() ||
(!time_after(jiffies, timeout) && ret != -EBUSY &&
!percpu_ref_is_dying(&ctx->refs))) {
if (current->task_works)
@@ -6053,9 +6082,9 @@ static int io_sq_thread(void *data)
}
/* Tell userspace we may need a wakeup call */
+ spin_lock_irq(&ctx->completion_lock);
ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
- /* make sure to read SQ tail after writing flags */
- smp_mb();
+ spin_unlock_irq(&ctx->completion_lock);
to_submit = io_sqring_entries(ctx);
if (!to_submit || ret == -EBUSY) {
@@ -6073,13 +6102,17 @@ static int io_sq_thread(void *data)
schedule();
finish_wait(&ctx->sqo_wait, &wait);
+ spin_lock_irq(&ctx->completion_lock);
ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
+ spin_unlock_irq(&ctx->completion_lock);
ret = 0;
continue;
}
finish_wait(&ctx->sqo_wait, &wait);
+ spin_lock_irq(&ctx->completion_lock);
ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
+ spin_unlock_irq(&ctx->completion_lock);
}
mutex_lock(&ctx->uring_lock);
@@ -6178,15 +6211,23 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
do {
prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
TASK_INTERRUPTIBLE);
+ /* make sure we run task_work before checking for signals */
if (current->task_works)
task_work_run();
- if (io_should_wake(&iowq, false))
- break;
- schedule();
if (signal_pending(current)) {
+ if (current->jobctl & JOBCTL_TASK_WORK) {
+ spin_lock_irq(&current->sighand->siglock);
+ current->jobctl &= ~JOBCTL_TASK_WORK;
+ recalc_sigpending();
+ spin_unlock_irq(&current->sighand->siglock);
+ continue;
+ }
ret = -EINTR;
break;
}
+ if (io_should_wake(&iowq, false))
+ break;
+ schedule();
} while (1);
finish_wait(&ctx->wait, &iowq.wq);
@@ -6658,6 +6699,7 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
for (i = 0; i < nr_tables; i++)
kfree(ctx->file_data->table[i].files);
+ percpu_ref_exit(&ctx->file_data->refs);
kfree(ctx->file_data->table);
kfree(ctx->file_data);
ctx->file_data = NULL;
@@ -6810,8 +6852,10 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
}
table->files[index] = file;
err = io_sqe_file_register(ctx, file, i);
- if (err)
+ if (err) {
+ fput(file);
break;
+ }
}
nr_args--;
done++;
@@ -7307,9 +7351,6 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
io_mem_free(ctx->sq_sqes);
percpu_ref_exit(&ctx->refs);
- if (ctx->account_mem)
- io_unaccount_mem(ctx->user,
- ring_pages(ctx->sq_entries, ctx->cq_entries));
free_uid(ctx->user);
put_cred(ctx->creds);
kfree(ctx->cancel_hash);
@@ -7394,6 +7435,16 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
if (ctx->rings)
io_cqring_overflow_flush(ctx, true);
idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx);
+
+ /*
+ * Do this upfront, so we won't have a grace period where the ring
+ * is closed but resources aren't reaped yet. This can cause
+ * spurious failure in setting up a new ring.
+ */
+ if (ctx->account_mem)
+ io_unaccount_mem(ctx->user,
+ ring_pages(ctx->sq_entries, ctx->cq_entries));
+
INIT_WORK(&ctx->exit_work, io_ring_exit_work);
queue_work(system_wq, &ctx->exit_work);
}
@@ -7453,6 +7504,7 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx,
if (list_empty(&ctx->cq_overflow_list)) {
clear_bit(0, &ctx->sq_check_overflow);
clear_bit(0, &ctx->cq_check_overflow);
+ ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW;
}
spin_unlock_irq(&ctx->completion_lock);