Diffstat (limited to 'io_uring/poll.c')
-rw-r--r-- | io_uring/poll.c | 156
1 file changed, 92 insertions, 64 deletions
diff --git a/io_uring/poll.c b/io_uring/poll.c
index 055632e9092a..ee7da6150ec4 100644
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -40,7 +40,14 @@ struct io_poll_table {
 };
 
 #define IO_POLL_CANCEL_FLAG	BIT(31)
-#define IO_POLL_REF_MASK	GENMASK(30, 0)
+#define IO_POLL_RETRY_FLAG	BIT(30)
+#define IO_POLL_REF_MASK	GENMASK(29, 0)
+
+/*
+ * We usually have 1-2 refs taken, 128 is more than enough and we want to
+ * maximise the margin between this amount and the moment when it overflows.
+ */
+#define IO_POLL_REF_BIAS 128
 
 #define IO_WQE_F_DOUBLE		1
 
@@ -58,6 +65,21 @@ static inline bool wqe_is_double(struct wait_queue_entry *wqe)
 	return priv & IO_WQE_F_DOUBLE;
 }
 
+static bool io_poll_get_ownership_slowpath(struct io_kiocb *req)
+{
+	int v;
+
+	/*
+	 * poll_refs are already elevated and we don't have much hope for
+	 * grabbing the ownership. Instead of incrementing set a retry flag
+	 * to notify the loop that there might have been some change.
+	 */
+	v = atomic_fetch_or(IO_POLL_RETRY_FLAG, &req->poll_refs);
+	if (v & IO_POLL_REF_MASK)
+		return false;
+	return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK);
+}
+
 /*
  * If refs part of ->poll_refs (see IO_POLL_REF_MASK) is 0, it's free. We can
  * bump it and acquire ownership. It's disallowed to modify requests while not
@@ -66,6 +88,8 @@ static inline bool wqe_is_double(struct wait_queue_entry *wqe)
  */
 static inline bool io_poll_get_ownership(struct io_kiocb *req)
 {
+	if (unlikely(atomic_read(&req->poll_refs) >= IO_POLL_REF_BIAS))
+		return io_poll_get_ownership_slowpath(req);
 	return !(atomic_fetch_inc(&req->poll_refs) & IO_POLL_REF_MASK);
 }
 
@@ -213,7 +237,6 @@ enum {
  */
 static int io_poll_check_events(struct io_kiocb *req, bool *locked)
 {
-	struct io_ring_ctx *ctx = req->ctx;
 	int v, ret;
 
 	/* req->task == current here, checking PF_EXITING is safe */
@@ -223,18 +246,31 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked)
 	do {
 		v = atomic_read(&req->poll_refs);
 
-		/* tw handler should be the owner, and so have some references */
-		if (WARN_ON_ONCE(!(v & IO_POLL_REF_MASK)))
-			return IOU_POLL_DONE;
-		if (v & IO_POLL_CANCEL_FLAG)
-			return -ECANCELED;
-		/*
-		 * cqe.res contains only events of the first wake up
-		 * and all others are be lost. Redo vfs_poll() to get
-		 * up to date state.
-		 */
-		if ((v & IO_POLL_REF_MASK) != 1)
-			req->cqe.res = 0;
+		if (unlikely(v != 1)) {
+			/* tw should be the owner and so have some refs */
+			if (WARN_ON_ONCE(!(v & IO_POLL_REF_MASK)))
+				return IOU_POLL_NO_ACTION;
+			if (v & IO_POLL_CANCEL_FLAG)
+				return -ECANCELED;
+			/*
+			 * cqe.res contains only events of the first wake up
+			 * and all others are to be lost. Redo vfs_poll() to get
+			 * up to date state.
+			 */
+			if ((v & IO_POLL_REF_MASK) != 1)
+				req->cqe.res = 0;
+
+			if (v & IO_POLL_RETRY_FLAG) {
+				req->cqe.res = 0;
+				/*
+				 * We won't find new events that came in between
+				 * vfs_poll and the ref put unless we clear the
+				 * flag in advance.
+				 */
+				atomic_andnot(IO_POLL_RETRY_FLAG, &req->poll_refs);
+				v &= ~IO_POLL_RETRY_FLAG;
+			}
+		}
 
 		/* the mask was stashed in __io_poll_execute */
 		if (!req->cqe.res) {
@@ -246,16 +282,14 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked)
 			continue;
 		if (req->apoll_events & EPOLLONESHOT)
 			return IOU_POLL_DONE;
-		if (io_is_uring_fops(req->file))
-			return IOU_POLL_DONE;
 
 		/* multishot, just fill a CQE and proceed */
 		if (!(req->flags & REQ_F_APOLL_MULTISHOT)) {
 			__poll_t mask = mangle_poll(req->cqe.res &
 						    req->apoll_events);
 
-			if (!io_post_aux_cqe(ctx, req->cqe.user_data,
-					     mask, IORING_CQE_F_MORE, false)) {
+			if (!io_aux_cqe(req->ctx, *locked, req->cqe.user_data,
+					mask, IORING_CQE_F_MORE, false)) {
 				io_req_set_res(req, mask, 0);
 				return IOU_POLL_REMOVE_POLL_USE_RES;
 			}
@@ -274,7 +308,8 @@ static int io_poll_check_events(struct io_kiocb *req, bool *locked)
 		 * Release all references, retry if someone tried to restart
 		 * task_work while we were executing it.
 		 */
-	} while (atomic_sub_return(v & IO_POLL_REF_MASK, &req->poll_refs));
+	} while (atomic_sub_return(v & IO_POLL_REF_MASK, &req->poll_refs) &
+		 IO_POLL_REF_MASK);
 
 	return IOU_POLL_NO_ACTION;
 }
@@ -286,54 +321,38 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked)
 	ret = io_poll_check_events(req, locked);
 	if (ret == IOU_POLL_NO_ACTION)
 		return;
-
-	if (ret == IOU_POLL_DONE) {
-		struct io_poll *poll = io_kiocb_to_cmd(req, struct io_poll);
-		req->cqe.res = mangle_poll(req->cqe.res & poll->events);
-	} else if (ret != IOU_POLL_REMOVE_POLL_USE_RES) {
-		req->cqe.res = ret;
-		req_set_fail(req);
-	}
-
 	io_poll_remove_entries(req);
 	io_poll_tw_hash_eject(req, locked);
 
-	io_req_set_res(req, req->cqe.res, 0);
-	io_req_task_complete(req, locked);
-}
-
-static void io_apoll_task_func(struct io_kiocb *req, bool *locked)
-{
-	int ret;
+	if (req->opcode == IORING_OP_POLL_ADD) {
+		if (ret == IOU_POLL_DONE) {
+			struct io_poll *poll;
 
-	ret = io_poll_check_events(req, locked);
-	if (ret == IOU_POLL_NO_ACTION)
-		return;
-
-	io_poll_remove_entries(req);
-	io_poll_tw_hash_eject(req, locked);
+			poll = io_kiocb_to_cmd(req, struct io_poll);
+			req->cqe.res = mangle_poll(req->cqe.res & poll->events);
+		} else if (ret != IOU_POLL_REMOVE_POLL_USE_RES) {
+			req->cqe.res = ret;
+			req_set_fail(req);
+		}
 
-	if (ret == IOU_POLL_REMOVE_POLL_USE_RES)
-		io_req_complete_post(req);
-	else if (ret == IOU_POLL_DONE)
-		io_req_task_submit(req, locked);
-	else
-		io_req_complete_failed(req, ret);
+		io_req_set_res(req, req->cqe.res, 0);
+		io_req_task_complete(req, locked);
+	} else {
+		io_tw_lock(req->ctx, locked);
+
+		if (ret == IOU_POLL_REMOVE_POLL_USE_RES)
+			io_req_task_complete(req, locked);
+		else if (ret == IOU_POLL_DONE)
+			io_req_task_submit(req, locked);
+		else
+			io_req_defer_failed(req, ret);
+	}
 }
 
 static void __io_poll_execute(struct io_kiocb *req, int mask)
 {
 	io_req_set_res(req, mask, 0);
-	/*
-	 * This is useful for poll that is armed on behalf of another
-	 * request, and where the wakeup path could be on a different
-	 * CPU. We want to avoid pulling in req->apoll->events for that
-	 * case.
-	 */
-	if (req->opcode == IORING_OP_POLL_ADD)
-		req->io_task_work.func = io_poll_task_func;
-	else
-		req->io_task_work.func = io_apoll_task_func;
+	req->io_task_work.func = io_poll_task_func;
 
 	trace_io_uring_task_add(req, mask);
 	io_req_task_work_add(req);
@@ -394,6 +413,14 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
 		return 0;
 
 	if (io_poll_get_ownership(req)) {
+		/*
+		 * If we trigger a multishot poll off our own wakeup path,
+		 * disable multishot as there is a circular dependency between
+		 * CQ posting and triggering the event.
+		 */
+		if (mask & EPOLL_URING_WAKE)
+			poll->events |= EPOLLONESHOT;
+
 		/* optional, saves extra locking for removal in tw handler */
 		if (mask && poll->events & EPOLLONESHOT) {
 			list_del_init(&poll->wait.entry);
@@ -518,7 +545,6 @@ static int __io_arm_poll_handler(struct io_kiocb *req,
 				 unsigned issue_flags)
 {
 	struct io_ring_ctx *ctx = req->ctx;
-	int v;
 
 	INIT_HLIST_NODE(&req->hash_node);
 	req->work.cancel_seq = atomic_read(&ctx->cancel_seq);
@@ -586,11 +612,10 @@ static int __io_arm_poll_handler(struct io_kiocb *req,
 
 	if (ipt->owning) {
 		/*
-		 * Release ownership. If someone tried to queue a tw while it was
-		 * locked, kick it off for them.
+		 * Try to release ownership. If we see a change of state, e.g.
+		 * poll was waken up, queue up a tw, it'll deal with it.
 		 */
-		v = atomic_dec_return(&req->poll_refs);
-		if (unlikely(v & IO_POLL_REF_MASK))
+		if (atomic_cmpxchg(&req->poll_refs, 1, 0) != 1)
 			__io_poll_execute(req, 0);
 	}
 	return 0;
@@ -615,10 +640,13 @@ static struct async_poll *io_req_alloc_apoll(struct io_kiocb *req,
 	if (req->flags & REQ_F_POLLED) {
 		apoll = req->apoll;
 		kfree(apoll->double_poll);
-	} else if (!(issue_flags & IO_URING_F_UNLOCKED) &&
-		   (entry = io_alloc_cache_get(&ctx->apoll_cache)) != NULL) {
+	} else if (!(issue_flags & IO_URING_F_UNLOCKED)) {
+		entry = io_alloc_cache_get(&ctx->apoll_cache);
+		if (entry == NULL)
+			goto alloc_apoll;
 		apoll = container_of(entry, struct async_poll, cache);
 	} else {
+alloc_apoll:
 		apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC);
 		if (unlikely(!apoll))
 			return NULL;
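A note for readers following the refcounting changes above: the sketch below is a minimal, userspace-only model of the poll_refs scheme as it looks after this diff, covering the biased fast/slow ownership split (IO_POLL_REF_BIAS, IO_POLL_RETRY_FLAG), the masked ref drop at the end of io_poll_check_events(), and the cmpxchg release in __io_arm_poll_handler(). It is written against C11 <stdatomic.h> rather than the kernel's atomic_t API, and every demo_*/DEMO_* name is made up for illustration; none of it is io_uring code.

/*
 * Illustrative, userspace-only model of the poll_refs scheme after this
 * diff, using C11 atomics rather than the kernel's atomic_t helpers.
 * All demo_* / DEMO_* names are hypothetical, not io_uring identifiers.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define DEMO_POLL_CANCEL_FLAG	(1u << 31)	/* unused here, kept for completeness */
#define DEMO_POLL_RETRY_FLAG	(1u << 30)
#define DEMO_POLL_REF_MASK	((1u << 30) - 1)	/* GENMASK(29, 0) */
#define DEMO_POLL_REF_BIAS	128

struct demo_req {
	atomic_uint poll_refs;
};

/* Slow path: the counter looks suspiciously high, so only record that a
 * wakeup happened (retry flag) instead of bumping it even further. */
static bool demo_get_ownership_slowpath(struct demo_req *req)
{
	unsigned int v = atomic_fetch_or(&req->poll_refs, DEMO_POLL_RETRY_FLAG);

	if (v & DEMO_POLL_REF_MASK)
		return false;	/* someone else already owns the request */
	return !(atomic_fetch_add(&req->poll_refs, 1) & DEMO_POLL_REF_MASK);
}

/* Fast path: whoever moves the ref part of the counter from 0 to 1 becomes
 * the owner; values at or above the bias divert to the slow path. */
static bool demo_get_ownership(struct demo_req *req)
{
	if (atomic_load(&req->poll_refs) >= DEMO_POLL_REF_BIAS)
		return demo_get_ownership_slowpath(req);
	return !(atomic_fetch_add(&req->poll_refs, 1) & DEMO_POLL_REF_MASK);
}

/* Tail of the io_poll_check_events() loop: drop the references we consumed
 * and go around again only if the remaining ref bits (not the flag bits)
 * are non-zero. */
static bool demo_drop_refs_and_retry(struct demo_req *req, unsigned int v)
{
	unsigned int taken = v & DEMO_POLL_REF_MASK;
	unsigned int left = atomic_fetch_sub(&req->poll_refs, taken) - taken;

	return left & DEMO_POLL_REF_MASK;
}

/* Tail of __io_arm_poll_handler(): release ownership only if poll_refs is
 * still exactly 1; any other value (an extra reference or a set retry flag)
 * means a wakeup raced in and task_work should be queued instead. */
static bool demo_release_ownership(struct demo_req *req)
{
	unsigned int expected = 1;

	return atomic_compare_exchange_strong(&req->poll_refs, &expected, 0);
}

int main(void)
{
	struct demo_req req;

	atomic_init(&req.poll_refs, 0);
	printf("first waker owns:  %d\n", demo_get_ownership(&req));	/* 1 */
	printf("second waker owns: %d\n", demo_get_ownership(&req));	/* 0 */
	printf("clean release:     %d\n", demo_release_ownership(&req));	/* 0: second waker left a ref */
	printf("loop again:        %d\n", demo_drop_refs_and_retry(&req, 2));	/* 0: both refs now dropped */
	return 0;
}

The design point the bias protects, as described in the diff's own comments: the common case stays a single atomic_fetch_inc, while a counter that has drifted unusually high makes further wakeups only set the retry flag, keeping poll_refs well clear of overflowing into the IO_POLL_RETRY_FLAG and IO_POLL_CANCEL_FLAG bits.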