Diffstat (limited to 'fs/io_uring.c')
-rw-r--r--  fs/io_uring.c  | 1356
1 file changed, 957 insertions(+), 399 deletions(-)
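Among the structural changes in this patch, the rbtree previously used to look up poll requests for cancellation (ctx->cancel_tree) is replaced by a hash table keyed on user_data and sized from the CQ ring: hash_bits = ilog2(cq_entries) - 5, clamped to a minimum of 1, which works out to roughly 32 requests per bucket if the ring is completely full and uniformly spread. Below is a minimal userspace sketch of that sizing arithmetic; ilog2_u32(), cancel_hash_bits() and bucket() are stand-ins written for this example, not kernel helpers (the kernel uses ilog2() and hash_long()).

    /*
     * Userspace sketch (not kernel code) of how the patch sizes the
     * poll-cancellation hash table from the CQ ring size.
     */
    #include <stdio.h>
    #include <stdint.h>

    static unsigned ilog2_u32(uint32_t v)      /* floor(log2(v)), v > 0 */
    {
        unsigned r = 0;

        while (v >>= 1)
            r++;
        return r;
    }

    static unsigned cancel_hash_bits(uint32_t cq_entries)
    {
        int bits = (int)ilog2_u32(cq_entries) - 5; /* ~32 requests per bucket */

        return bits <= 0 ? 1 : bits;
    }

    /* stand-in for the kernel's hash_long(): Fibonacci hashing on 64 bits */
    static unsigned bucket(uint64_t user_data, unsigned bits)
    {
        return (unsigned)((user_data * 0x61C8864680B583EBull) >> (64 - bits));
    }

    int main(void)
    {
        unsigned cq[] = { 16, 256, 4096, 65536 };

        for (unsigned i = 0; i < sizeof(cq) / sizeof(cq[0]); i++) {
            unsigned bits = cancel_hash_bits(cq[i]);

            printf("cq_entries=%5u -> %u bits, %u buckets (user_data 0x1234 -> bucket %u)\n",
                   cq[i], bits, 1u << bits, bucket(0x1234, bits));
        }
        return 0;
    }

The raw diff follows.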
diff --git a/fs/io_uring.c b/fs/io_uring.c index ec53aa7cdc94..6f084e3cf835 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -145,7 +145,7 @@ struct io_rings { /* * Number of completion events lost because the queue was full; * this should be avoided by the application by making sure - * there are not more requests pending thatn there is space in + * there are not more requests pending than there is space in * the completion queue. * * Written by the kernel, shouldn't be modified by the @@ -238,7 +238,7 @@ struct io_ring_ctx { struct user_struct *user; - struct cred *creds; + const struct cred *creds; /* 0 is for ctx quiesce/reinit/free, 1 is for sqo_thread started */ struct completion *completions; @@ -275,7 +275,8 @@ struct io_ring_ctx { * manipulate the list, hence no extra locking is needed there. */ struct list_head poll_list; - struct rb_root cancel_tree; + struct hlist_head *cancel_hash; + unsigned cancel_hash_bits; spinlock_t inflight_lock; struct list_head inflight_list; @@ -288,11 +289,14 @@ struct io_ring_ctx { */ struct io_poll_iocb { struct file *file; - struct wait_queue_head *head; + union { + struct wait_queue_head *head; + u64 addr; + }; __poll_t events; bool done; bool canceled; - struct wait_queue_entry *wait; + struct wait_queue_entry wait; }; struct io_timeout_data { @@ -303,9 +307,57 @@ struct io_timeout_data { u32 seq_offset; }; +struct io_accept { + struct file *file; + struct sockaddr __user *addr; + int __user *addr_len; + int flags; +}; + +struct io_sync { + struct file *file; + loff_t len; + loff_t off; + int flags; +}; + +struct io_cancel { + struct file *file; + u64 addr; +}; + struct io_timeout { struct file *file; - struct io_timeout_data *data; + u64 addr; + int flags; +}; + +struct io_async_connect { + struct sockaddr_storage address; +}; + +struct io_async_msghdr { + struct iovec fast_iov[UIO_FASTIOV]; + struct iovec *iov; + struct sockaddr __user *uaddr; + struct msghdr msg; +}; + +struct io_async_rw { + struct iovec fast_iov[UIO_FASTIOV]; + struct iovec *iov; + ssize_t nr_segs; + ssize_t size; +}; + +struct io_async_ctx { + struct io_uring_sqe sqe; + union { + struct io_async_rw rw; + struct io_async_msghdr msg; + struct io_async_connect connect; + struct io_timeout_data timeout; + }; }; /* @@ -319,20 +371,25 @@ struct io_kiocb { struct file *file; struct kiocb rw; struct io_poll_iocb poll; + struct io_accept accept; + struct io_sync sync; + struct io_cancel cancel; struct io_timeout timeout; }; const struct io_uring_sqe *sqe; + struct io_async_ctx *io; struct file *ring_file; int ring_fd; bool has_user; bool in_async; bool needs_fixed_file; + u8 opcode; struct io_ring_ctx *ctx; union { struct list_head list; - struct rb_node rb_node; + struct hlist_node hash_node; }; struct list_head link_list; unsigned int flags; @@ -353,7 +410,8 @@ struct io_kiocb { #define REQ_F_TIMEOUT_NOSEQ 8192 /* no timeout sequence */ #define REQ_F_INFLIGHT 16384 /* on inflight list */ #define REQ_F_COMP_LOCKED 32768 /* completion under lock */ -#define REQ_F_FREE_SQE 65536 /* free sqe if not async queued */ +#define REQ_F_HARDLINK 65536 /* doesn't sever on completion < 0 */ +#define REQ_F_PREPPED 131072 /* request already opcode prepared */ u64 user_data; u32 result; u32 sequence; @@ -422,6 +480,7 @@ static void io_ring_ctx_ref_free(struct percpu_ref *ref) static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) { struct io_ring_ctx *ctx; + int hash_bits; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) @@ -435,6 +494,21 @@ static struct io_ring_ctx 
*io_ring_ctx_alloc(struct io_uring_params *p) if (!ctx->completions) goto err; + /* + * Use 5 bits less than the max cq entries, that should give us around + * 32 entries per hash list if totally full and uniformly spread. + */ + hash_bits = ilog2(p->cq_entries); + hash_bits -= 5; + if (hash_bits <= 0) + hash_bits = 1; + ctx->cancel_hash_bits = hash_bits; + ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head), + GFP_KERNEL); + if (!ctx->cancel_hash) + goto err; + __hash_init(ctx->cancel_hash, 1U << hash_bits); + if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) goto err; @@ -448,7 +522,6 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) init_waitqueue_head(&ctx->wait); spin_lock_init(&ctx->completion_lock); INIT_LIST_HEAD(&ctx->poll_list); - ctx->cancel_tree = RB_ROOT; INIT_LIST_HEAD(&ctx->defer_list); INIT_LIST_HEAD(&ctx->timeout_list); init_waitqueue_head(&ctx->inflight_wait); @@ -459,6 +532,7 @@ err: if (ctx->fallback_req) kmem_cache_free(req_cachep, ctx->fallback_req); kfree(ctx->completions); + kfree(ctx->cancel_hash); kfree(ctx); return NULL; } @@ -524,12 +598,10 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx) } } -static inline bool io_sqe_needs_user(const struct io_uring_sqe *sqe) +static inline bool io_req_needs_user(struct io_kiocb *req) { - u8 opcode = READ_ONCE(sqe->opcode); - - return !(opcode == IORING_OP_READ_FIXED || - opcode == IORING_OP_WRITE_FIXED); + return !(req->opcode == IORING_OP_READ_FIXED || + req->opcode == IORING_OP_WRITE_FIXED); } static inline bool io_prep_async_work(struct io_kiocb *req, @@ -538,10 +610,12 @@ static inline bool io_prep_async_work(struct io_kiocb *req, bool do_hashed = false; if (req->sqe) { - switch (req->sqe->opcode) { + switch (req->opcode) { case IORING_OP_WRITEV: case IORING_OP_WRITE_FIXED: - do_hashed = true; + /* only regular files should be hashed for writes */ + if (req->flags & REQ_F_ISREG) + do_hashed = true; /* fall-through */ case IORING_OP_READV: case IORING_OP_READ_FIXED: @@ -559,7 +633,7 @@ static inline bool io_prep_async_work(struct io_kiocb *req, req->work.flags |= IO_WQ_WORK_UNBOUND; break; } - if (io_sqe_needs_user(req->sqe)) + if (io_req_needs_user(req)) req->work.flags |= IO_WQ_WORK_NEEDS_USER; } @@ -592,7 +666,7 @@ static void io_kill_timeout(struct io_kiocb *req) { int ret; - ret = hrtimer_try_to_cancel(&req->timeout.data->timer); + ret = hrtimer_try_to_cancel(&req->io->timeout.timer); if (ret != -1) { atomic_inc(&req->ctx->cq_timeouts); list_del_init(&req->list); @@ -806,6 +880,7 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx, } got_it: + req->io = NULL; req->ring_file = NULL; req->file = NULL; req->ctx = ctx; @@ -836,8 +911,8 @@ static void __io_free_req(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - if (req->flags & REQ_F_FREE_SQE) - kfree(req->sqe); + if (req->io) + kfree(req->io); if (req->file && !(req->flags & REQ_F_FIXED_FILE)) fput(req->file); if (req->flags & REQ_F_INFLIGHT) { @@ -849,8 +924,6 @@ static void __io_free_req(struct io_kiocb *req) wake_up(&ctx->inflight_wait); spin_unlock_irqrestore(&ctx->inflight_lock, flags); } - if (req->flags & REQ_F_TIMEOUT) - kfree(req->timeout.data); percpu_ref_put(&ctx->refs); if (likely(!io_is_fallback_req(req))) kmem_cache_free(req_cachep, req); @@ -863,7 +936,7 @@ static bool io_link_cancel_timeout(struct io_kiocb *req) struct io_ring_ctx *ctx = req->ctx; int ret; - ret = hrtimer_try_to_cancel(&req->timeout.data->timer); + ret = 
hrtimer_try_to_cancel(&req->io->timeout.timer); if (ret != -1) { io_cqring_fill_event(req, -ECANCELED); io_commit_cqring(ctx); @@ -878,7 +951,6 @@ static bool io_link_cancel_timeout(struct io_kiocb *req) static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr) { struct io_ring_ctx *ctx = req->ctx; - struct io_kiocb *nxt; bool wake_ev = false; /* Already got next link */ @@ -890,24 +962,21 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr) * potentially happen if the chain is messed up, check to be on the * safe side. */ - nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, list); - while (nxt) { - list_del_init(&nxt->list); + while (!list_empty(&req->link_list)) { + struct io_kiocb *nxt = list_first_entry(&req->link_list, + struct io_kiocb, link_list); - if ((req->flags & REQ_F_LINK_TIMEOUT) && - (nxt->flags & REQ_F_TIMEOUT)) { + if (unlikely((req->flags & REQ_F_LINK_TIMEOUT) && + (nxt->flags & REQ_F_TIMEOUT))) { + list_del_init(&nxt->link_list); wake_ev |= io_link_cancel_timeout(nxt); - nxt = list_first_entry_or_null(&req->link_list, - struct io_kiocb, list); req->flags &= ~REQ_F_LINK_TIMEOUT; continue; } - if (!list_empty(&req->link_list)) { - INIT_LIST_HEAD(&nxt->link_list); - list_splice(&req->link_list, &nxt->link_list); - nxt->flags |= REQ_F_LINK; - } + list_del_init(&req->link_list); + if (!list_empty(&nxt->link_list)) + nxt->flags |= REQ_F_LINK; *nxtptr = nxt; break; } @@ -923,19 +992,19 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr) static void io_fail_links(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - struct io_kiocb *link; unsigned long flags; spin_lock_irqsave(&ctx->completion_lock, flags); while (!list_empty(&req->link_list)) { - link = list_first_entry(&req->link_list, struct io_kiocb, list); - list_del_init(&link->list); + struct io_kiocb *link = list_first_entry(&req->link_list, + struct io_kiocb, link_list); + list_del_init(&link->link_list); trace_io_uring_fail_link(req, link); if ((req->flags & REQ_F_LINK_TIMEOUT) && - link->sqe->opcode == IORING_OP_LINK_TIMEOUT) { + link->opcode == IORING_OP_LINK_TIMEOUT) { io_link_cancel_timeout(link); } else { io_cqring_fill_event(link, -ECANCELED); @@ -1079,9 +1148,9 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, * completions for those, only batch free for fixed * file and non-linked commands. */ - if (((req->flags & - (REQ_F_FIXED_FILE|REQ_F_LINK|REQ_F_FREE_SQE)) == - REQ_F_FIXED_FILE) && !io_is_fallback_req(req)) { + if (((req->flags & (REQ_F_FIXED_FILE|REQ_F_LINK)) == + REQ_F_FIXED_FILE) && !io_is_fallback_req(req) && + !req->io) { reqs[to_free++] = req; if (to_free == ARRAY_SIZE(reqs)) io_free_req_many(ctx, reqs, &to_free); @@ -1141,7 +1210,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, } /* - * Poll for a mininum of 'min' events. Note that if min == 0 we consider that a + * Poll for a minimum of 'min' events. Note that if min == 0 we consider that a * non-spinning poll check - we'll still enter the driver poll loop, but only * as a non-spinning completion check. 
*/ @@ -1258,6 +1327,12 @@ static void kiocb_end_write(struct io_kiocb *req) file_end_write(req->file); } +static inline void req_set_fail_links(struct io_kiocb *req) +{ + if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK) + req->flags |= REQ_F_FAIL_LINK; +} + static void io_complete_rw_common(struct kiocb *kiocb, long res) { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw); @@ -1265,8 +1340,8 @@ static void io_complete_rw_common(struct kiocb *kiocb, long res) if (kiocb->ki_flags & IOCB_WRITE) kiocb_end_write(req); - if ((req->flags & REQ_F_LINK) && res != req->result) - req->flags |= REQ_F_FAIL_LINK; + if (res != req->result) + req_set_fail_links(req); io_cqring_add_event(req, res); } @@ -1296,8 +1371,8 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) if (kiocb->ki_flags & IOCB_WRITE) kiocb_end_write(req); - if ((req->flags & REQ_F_LINK) && res != req->result) - req->flags |= REQ_F_FAIL_LINK; + if (res != req->result) + req_set_fail_links(req); req->result = res; if (res != -EAGAIN) req->flags |= REQ_F_IOPOLL_COMPLETED; @@ -1388,7 +1463,7 @@ static bool io_file_supports_async(struct file *file) { umode_t mode = file_inode(file)->i_mode; - if (S_ISBLK(mode) || S_ISCHR(mode)) + if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISSOCK(mode)) return true; if (S_ISREG(mode) && file->f_op != &io_uring_fops) return true; @@ -1410,15 +1485,6 @@ static int io_prep_rw(struct io_kiocb *req, bool force_nonblock) if (S_ISREG(file_inode(req->file)->i_mode)) req->flags |= REQ_F_ISREG; - /* - * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so - * we know to async punt it even if it was opened O_NONBLOCK - */ - if (force_nonblock && !io_file_supports_async(req->file)) { - req->flags |= REQ_F_MUST_PUNT; - return -EAGAIN; - } - kiocb->ki_pos = READ_ONCE(sqe->off); kiocb->ki_flags = iocb_flags(kiocb->ki_filp); kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp)); @@ -1581,12 +1647,22 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req, * for that purpose and instead let the caller pass in the read/write * flag. 
*/ - opcode = READ_ONCE(sqe->opcode); + opcode = req->opcode; if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) { *iovec = NULL; return io_import_fixed(req->ctx, rw, sqe, iter); } + if (req->io) { + struct io_async_rw *iorw = &req->io->rw; + + *iovec = iorw->iov; + iov_iter_init(iter, rw, *iovec, iorw->nr_segs, iorw->size); + if (iorw->iov == iorw->fast_iov) + *iovec = NULL; + return iorw->size; + } + if (!req->has_user) return -EFAULT; @@ -1657,6 +1733,70 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb, return ret; } +static void io_req_map_rw(struct io_kiocb *req, ssize_t io_size, + struct iovec *iovec, struct iovec *fast_iov, + struct iov_iter *iter) +{ + req->io->rw.nr_segs = iter->nr_segs; + req->io->rw.size = io_size; + req->io->rw.iov = iovec; + if (!req->io->rw.iov) { + req->io->rw.iov = req->io->rw.fast_iov; + memcpy(req->io->rw.iov, fast_iov, + sizeof(struct iovec) * iter->nr_segs); + } +} + +static int io_alloc_async_ctx(struct io_kiocb *req) +{ + req->io = kmalloc(sizeof(*req->io), GFP_KERNEL); + if (req->io) { + memcpy(&req->io->sqe, req->sqe, sizeof(req->io->sqe)); + req->sqe = &req->io->sqe; + return 0; + } + + return 1; +} + +static void io_rw_async(struct io_wq_work **workptr) +{ + struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); + struct iovec *iov = NULL; + + if (req->io->rw.iov != req->io->rw.fast_iov) + iov = req->io->rw.iov; + io_wq_submit_work(workptr); + kfree(iov); +} + +static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size, + struct iovec *iovec, struct iovec *fast_iov, + struct iov_iter *iter) +{ + if (!req->io && io_alloc_async_ctx(req)) + return -ENOMEM; + + io_req_map_rw(req, io_size, iovec, fast_iov, iter); + req->work.func = io_rw_async; + return 0; +} + +static int io_read_prep(struct io_kiocb *req, struct iovec **iovec, + struct iov_iter *iter, bool force_nonblock) +{ + ssize_t ret; + + ret = io_prep_rw(req, force_nonblock); + if (ret) + return ret; + + if (unlikely(!(req->file->f_mode & FMODE_READ))) + return -EBADF; + + return io_import_iovec(READ, req, iovec, iter); +} + static int io_read(struct io_kiocb *req, struct io_kiocb **nxt, bool force_nonblock) { @@ -1665,23 +1805,35 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt, struct iov_iter iter; struct file *file; size_t iov_count; - ssize_t read_size, ret; + ssize_t io_size, ret; - ret = io_prep_rw(req, force_nonblock); - if (ret) - return ret; - file = kiocb->ki_filp; - - if (unlikely(!(file->f_mode & FMODE_READ))) - return -EBADF; + if (!req->io) { + ret = io_read_prep(req, &iovec, &iter, force_nonblock); + if (ret < 0) + return ret; + } else { + ret = io_import_iovec(READ, req, &iovec, &iter); + if (ret < 0) + return ret; + } - ret = io_import_iovec(READ, req, &iovec, &iter); - if (ret < 0) - return ret; + /* Ensure we clear previously set non-block flag */ + if (!force_nonblock) + req->rw.ki_flags &= ~IOCB_NOWAIT; - read_size = ret; + file = req->file; + io_size = ret; if (req->flags & REQ_F_LINK) - req->result = read_size; + req->result = io_size; + + /* + * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so + * we know to async punt it even if it was opened O_NONBLOCK + */ + if (force_nonblock && !io_file_supports_async(file)) { + req->flags |= REQ_F_MUST_PUNT; + goto copy_iov; + } iov_count = iov_iter_count(&iter); ret = rw_verify_area(READ, file, &kiocb->ki_pos, iov_count); @@ -1703,18 +1855,41 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt, */ if 
(force_nonblock && !(req->flags & REQ_F_NOWAIT) && (req->flags & REQ_F_ISREG) && - ret2 > 0 && ret2 < read_size) + ret2 > 0 && ret2 < io_size) ret2 = -EAGAIN; /* Catch -EAGAIN return for forced non-blocking submission */ - if (!force_nonblock || ret2 != -EAGAIN) + if (!force_nonblock || ret2 != -EAGAIN) { kiocb_done(kiocb, ret2, nxt, req->in_async); - else - ret = -EAGAIN; + } else { +copy_iov: + ret = io_setup_async_rw(req, io_size, iovec, + inline_vecs, &iter); + if (ret) + goto out_free; + return -EAGAIN; + } } - kfree(iovec); +out_free: + if (!io_wq_current_is_worker()) + kfree(iovec); return ret; } +static int io_write_prep(struct io_kiocb *req, struct iovec **iovec, + struct iov_iter *iter, bool force_nonblock) +{ + ssize_t ret; + + ret = io_prep_rw(req, force_nonblock); + if (ret) + return ret; + + if (unlikely(!(req->file->f_mode & FMODE_WRITE))) + return -EBADF; + + return io_import_iovec(WRITE, req, iovec, iter); +} + static int io_write(struct io_kiocb *req, struct io_kiocb **nxt, bool force_nonblock) { @@ -1723,29 +1898,42 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt, struct iov_iter iter; struct file *file; size_t iov_count; - ssize_t ret; - - ret = io_prep_rw(req, force_nonblock); - if (ret) - return ret; + ssize_t ret, io_size; - file = kiocb->ki_filp; - if (unlikely(!(file->f_mode & FMODE_WRITE))) - return -EBADF; + if (!req->io) { + ret = io_write_prep(req, &iovec, &iter, force_nonblock); + if (ret < 0) + return ret; + } else { + ret = io_import_iovec(WRITE, req, &iovec, &iter); + if (ret < 0) + return ret; + } - ret = io_import_iovec(WRITE, req, &iovec, &iter); - if (ret < 0) - return ret; + /* Ensure we clear previously set non-block flag */ + if (!force_nonblock) + req->rw.ki_flags &= ~IOCB_NOWAIT; + file = kiocb->ki_filp; + io_size = ret; if (req->flags & REQ_F_LINK) - req->result = ret; + req->result = io_size; - iov_count = iov_iter_count(&iter); + /* + * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so + * we know to async punt it even if it was opened O_NONBLOCK + */ + if (force_nonblock && !io_file_supports_async(req->file)) { + req->flags |= REQ_F_MUST_PUNT; + goto copy_iov; + } - ret = -EAGAIN; - if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT)) - goto out_free; + /* file path doesn't support NOWAIT for non-direct_IO */ + if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) && + (req->flags & REQ_F_ISREG)) + goto copy_iov; + iov_count = iov_iter_count(&iter); ret = rw_verify_area(WRITE, file, &kiocb->ki_pos, iov_count); if (!ret) { ssize_t ret2; @@ -1769,13 +1957,20 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt, ret2 = call_write_iter(file, kiocb, &iter); else ret2 = loop_rw_iter(WRITE, file, kiocb, &iter); - if (!force_nonblock || ret2 != -EAGAIN) + if (!force_nonblock || ret2 != -EAGAIN) { kiocb_done(kiocb, ret2, nxt, req->in_async); - else - ret = -EAGAIN; + } else { +copy_iov: + ret = io_setup_async_rw(req, io_size, iovec, + inline_vecs, &iter); + if (ret) + goto out_free; + return -EAGAIN; + } } out_free: - kfree(iovec); + if (!io_wq_current_is_worker()) + kfree(iovec); return ret; } @@ -1794,10 +1989,13 @@ static int io_nop(struct io_kiocb *req) return 0; } -static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe) +static int io_prep_fsync(struct io_kiocb *req) { + const struct io_uring_sqe *sqe = req->sqe; struct io_ring_ctx *ctx = req->ctx; + if (req->flags & REQ_F_PREPPED) + return 0; if (!req->file) return -EBADF; @@ -1806,46 +2004,80 @@ static int 
io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index)) return -EINVAL; + req->sync.flags = READ_ONCE(sqe->fsync_flags); + if (unlikely(req->sync.flags & ~IORING_FSYNC_DATASYNC)) + return -EINVAL; + + req->sync.off = READ_ONCE(sqe->off); + req->sync.len = READ_ONCE(sqe->len); + req->flags |= REQ_F_PREPPED; return 0; } -static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe, - struct io_kiocb **nxt, bool force_nonblock) +static bool io_req_cancelled(struct io_kiocb *req) +{ + if (req->work.flags & IO_WQ_WORK_CANCEL) { + req_set_fail_links(req); + io_cqring_add_event(req, -ECANCELED); + io_put_req(req); + return true; + } + + return false; +} + +static void io_fsync_finish(struct io_wq_work **workptr) { - loff_t sqe_off = READ_ONCE(sqe->off); - loff_t sqe_len = READ_ONCE(sqe->len); - loff_t end = sqe_off + sqe_len; - unsigned fsync_flags; + struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); + loff_t end = req->sync.off + req->sync.len; + struct io_kiocb *nxt = NULL; int ret; - fsync_flags = READ_ONCE(sqe->fsync_flags); - if (unlikely(fsync_flags & ~IORING_FSYNC_DATASYNC)) - return -EINVAL; + if (io_req_cancelled(req)) + return; + + ret = vfs_fsync_range(req->rw.ki_filp, req->sync.off, + end > 0 ? end : LLONG_MAX, + req->sync.flags & IORING_FSYNC_DATASYNC); + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + io_put_req_find_next(req, &nxt); + if (nxt) + *workptr = &nxt->work; +} - ret = io_prep_fsync(req, sqe); +static int io_fsync(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ + struct io_wq_work *work, *old_work; + int ret; + + ret = io_prep_fsync(req); if (ret) return ret; /* fsync always requires a blocking context */ - if (force_nonblock) + if (force_nonblock) { + io_put_req(req); + req->work.func = io_fsync_finish; return -EAGAIN; + } - ret = vfs_fsync_range(req->rw.ki_filp, sqe_off, - end > 0 ? 
end : LLONG_MAX, - fsync_flags & IORING_FSYNC_DATASYNC); - - if (ret < 0 && (req->flags & REQ_F_LINK)) - req->flags |= REQ_F_FAIL_LINK; - io_cqring_add_event(req, ret); - io_put_req_find_next(req, nxt); + work = old_work = &req->work; + io_fsync_finish(&work); + if (work && work != old_work) + *nxt = container_of(work, struct io_kiocb, work); return 0; } -static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe) +static int io_prep_sfr(struct io_kiocb *req) { + const struct io_uring_sqe *sqe = req->sqe; struct io_ring_ctx *ctx = req->ctx; - int ret = 0; + if (req->flags & REQ_F_PREPPED) + return 0; if (!req->file) return -EBADF; @@ -1854,46 +2086,91 @@ static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index)) return -EINVAL; - return ret; + req->sync.off = READ_ONCE(sqe->off); + req->sync.len = READ_ONCE(sqe->len); + req->sync.flags = READ_ONCE(sqe->sync_range_flags); + req->flags |= REQ_F_PREPPED; + return 0; +} + +static void io_sync_file_range_finish(struct io_wq_work **workptr) +{ + struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); + struct io_kiocb *nxt = NULL; + int ret; + + if (io_req_cancelled(req)) + return; + + ret = sync_file_range(req->rw.ki_filp, req->sync.off, req->sync.len, + req->sync.flags); + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + io_put_req_find_next(req, &nxt); + if (nxt) + *workptr = &nxt->work; } -static int io_sync_file_range(struct io_kiocb *req, - const struct io_uring_sqe *sqe, - struct io_kiocb **nxt, +static int io_sync_file_range(struct io_kiocb *req, struct io_kiocb **nxt, bool force_nonblock) { - loff_t sqe_off; - loff_t sqe_len; - unsigned flags; + struct io_wq_work *work, *old_work; int ret; - ret = io_prep_sfr(req, sqe); + ret = io_prep_sfr(req); if (ret) return ret; /* sync_file_range always requires a blocking context */ - if (force_nonblock) + if (force_nonblock) { + io_put_req(req); + req->work.func = io_sync_file_range_finish; return -EAGAIN; + } + + work = old_work = &req->work; + io_sync_file_range_finish(&work); + if (work && work != old_work) + *nxt = container_of(work, struct io_kiocb, work); + return 0; +} - sqe_off = READ_ONCE(sqe->off); - sqe_len = READ_ONCE(sqe->len); - flags = READ_ONCE(sqe->sync_range_flags); +#if defined(CONFIG_NET) +static void io_sendrecv_async(struct io_wq_work **workptr) +{ + struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); + struct iovec *iov = NULL; - ret = sync_file_range(req->rw.ki_filp, sqe_off, sqe_len, flags); + if (req->io->rw.iov != req->io->rw.fast_iov) + iov = req->io->msg.iov; + io_wq_submit_work(workptr); + kfree(iov); +} +#endif - if (ret < 0 && (req->flags & REQ_F_LINK)) - req->flags |= REQ_F_FAIL_LINK; - io_cqring_add_event(req, ret); - io_put_req_find_next(req, nxt); +static int io_sendmsg_prep(struct io_kiocb *req, struct io_async_ctx *io) +{ +#if defined(CONFIG_NET) + const struct io_uring_sqe *sqe = req->sqe; + struct user_msghdr __user *msg; + unsigned flags; + + flags = READ_ONCE(sqe->msg_flags); + msg = (struct user_msghdr __user *)(unsigned long) READ_ONCE(sqe->addr); + io->msg.iov = io->msg.fast_iov; + return sendmsg_copy_msghdr(&io->msg.msg, msg, flags, &io->msg.iov); +#else return 0; +#endif } -#if defined(CONFIG_NET) -static int io_send_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, - struct io_kiocb **nxt, bool force_nonblock, - long (*fn)(struct socket *, struct user_msghdr __user *, - unsigned int)) 
+static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) { +#if defined(CONFIG_NET) + const struct io_uring_sqe *sqe = req->sqe; + struct io_async_msghdr *kmsg = NULL; struct socket *sock; int ret; @@ -1902,7 +2179,8 @@ static int io_send_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, sock = sock_from_file(req->file, &ret); if (sock) { - struct user_msghdr __user *msg; + struct io_async_ctx io; + struct sockaddr_storage addr; unsigned flags; flags = READ_ONCE(sqe->msg_flags); @@ -1911,85 +2189,238 @@ static int io_send_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, else if (force_nonblock) flags |= MSG_DONTWAIT; - msg = (struct user_msghdr __user *) (unsigned long) - READ_ONCE(sqe->addr); + if (req->io) { + kmsg = &req->io->msg; + kmsg->msg.msg_name = &addr; + /* if iov is set, it's allocated already */ + if (!kmsg->iov) + kmsg->iov = kmsg->fast_iov; + kmsg->msg.msg_iter.iov = kmsg->iov; + } else { + kmsg = &io.msg; + kmsg->msg.msg_name = &addr; + ret = io_sendmsg_prep(req, &io); + if (ret) + goto out; + } - ret = fn(sock, msg, flags); - if (force_nonblock && ret == -EAGAIN) - return ret; + ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags); + if (force_nonblock && ret == -EAGAIN) { + if (req->io) + return -EAGAIN; + if (io_alloc_async_ctx(req)) + return -ENOMEM; + memcpy(&req->io->msg, &io.msg, sizeof(io.msg)); + req->work.func = io_sendrecv_async; + return -EAGAIN; + } + if (ret == -ERESTARTSYS) + ret = -EINTR; } +out: + if (!io_wq_current_is_worker() && kmsg && kmsg->iov != kmsg->fast_iov) + kfree(kmsg->iov); io_cqring_add_event(req, ret); - if (ret < 0 && (req->flags & REQ_F_LINK)) - req->flags |= REQ_F_FAIL_LINK; + if (ret < 0) + req_set_fail_links(req); io_put_req_find_next(req, nxt); return 0; -} +#else + return -EOPNOTSUPP; #endif +} -static int io_sendmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, - struct io_kiocb **nxt, bool force_nonblock) +static int io_recvmsg_prep(struct io_kiocb *req, struct io_async_ctx *io) { #if defined(CONFIG_NET) - return io_send_recvmsg(req, sqe, nxt, force_nonblock, - __sys_sendmsg_sock); + const struct io_uring_sqe *sqe = req->sqe; + struct user_msghdr __user *msg; + unsigned flags; + + flags = READ_ONCE(sqe->msg_flags); + msg = (struct user_msghdr __user *)(unsigned long) READ_ONCE(sqe->addr); + io->msg.iov = io->msg.fast_iov; + return recvmsg_copy_msghdr(&io->msg.msg, msg, flags, &io->msg.uaddr, + &io->msg.iov); #else - return -EOPNOTSUPP; + return 0; #endif } -static int io_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, - struct io_kiocb **nxt, bool force_nonblock) +static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) { #if defined(CONFIG_NET) - return io_send_recvmsg(req, sqe, nxt, force_nonblock, - __sys_recvmsg_sock); + const struct io_uring_sqe *sqe = req->sqe; + struct io_async_msghdr *kmsg = NULL; + struct socket *sock; + int ret; + + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + + sock = sock_from_file(req->file, &ret); + if (sock) { + struct user_msghdr __user *msg; + struct io_async_ctx io; + struct sockaddr_storage addr; + unsigned flags; + + flags = READ_ONCE(sqe->msg_flags); + if (flags & MSG_DONTWAIT) + req->flags |= REQ_F_NOWAIT; + else if (force_nonblock) + flags |= MSG_DONTWAIT; + + msg = (struct user_msghdr __user *) (unsigned long) + READ_ONCE(sqe->addr); + if (req->io) { + kmsg = &req->io->msg; + kmsg->msg.msg_name = &addr; + /* if iov is set, it's allocated already */ + if 
(!kmsg->iov) + kmsg->iov = kmsg->fast_iov; + kmsg->msg.msg_iter.iov = kmsg->iov; + } else { + kmsg = &io.msg; + kmsg->msg.msg_name = &addr; + ret = io_recvmsg_prep(req, &io); + if (ret) + goto out; + } + + ret = __sys_recvmsg_sock(sock, &kmsg->msg, msg, kmsg->uaddr, flags); + if (force_nonblock && ret == -EAGAIN) { + if (req->io) + return -EAGAIN; + if (io_alloc_async_ctx(req)) + return -ENOMEM; + memcpy(&req->io->msg, &io.msg, sizeof(io.msg)); + req->work.func = io_sendrecv_async; + return -EAGAIN; + } + if (ret == -ERESTARTSYS) + ret = -EINTR; + } + +out: + if (!io_wq_current_is_worker() && kmsg && kmsg->iov != kmsg->fast_iov) + kfree(kmsg->iov); + io_cqring_add_event(req, ret); + if (ret < 0) + req_set_fail_links(req); + io_put_req_find_next(req, nxt); + return 0; #else return -EOPNOTSUPP; #endif } -static int io_accept(struct io_kiocb *req, const struct io_uring_sqe *sqe, - struct io_kiocb **nxt, bool force_nonblock) +static int io_accept_prep(struct io_kiocb *req) { #if defined(CONFIG_NET) - struct sockaddr __user *addr; - int __user *addr_len; - unsigned file_flags; - int flags, ret; + const struct io_uring_sqe *sqe = req->sqe; + struct io_accept *accept = &req->accept; + + if (req->flags & REQ_F_PREPPED) + return 0; if (unlikely(req->ctx->flags & (IORING_SETUP_IOPOLL|IORING_SETUP_SQPOLL))) return -EINVAL; if (sqe->ioprio || sqe->len || sqe->buf_index) return -EINVAL; - addr = (struct sockaddr __user *) (unsigned long) READ_ONCE(sqe->addr); - addr_len = (int __user *) (unsigned long) READ_ONCE(sqe->addr2); - flags = READ_ONCE(sqe->accept_flags); - file_flags = force_nonblock ? O_NONBLOCK : 0; + accept->addr = (struct sockaddr __user *) + (unsigned long) READ_ONCE(sqe->addr); + accept->addr_len = (int __user *) (unsigned long) READ_ONCE(sqe->addr2); + accept->flags = READ_ONCE(sqe->accept_flags); + req->flags |= REQ_F_PREPPED; + return 0; +#else + return -EOPNOTSUPP; +#endif +} - ret = __sys_accept4_file(req->file, file_flags, addr, addr_len, flags); - if (ret == -EAGAIN && force_nonblock) { - req->work.flags |= IO_WQ_WORK_NEEDS_FILES; +#if defined(CONFIG_NET) +static int __io_accept(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ + struct io_accept *accept = &req->accept; + unsigned file_flags; + int ret; + + file_flags = force_nonblock ? 
O_NONBLOCK : 0; + ret = __sys_accept4_file(req->file, file_flags, accept->addr, + accept->addr_len, accept->flags); + if (ret == -EAGAIN && force_nonblock) return -EAGAIN; - } if (ret == -ERESTARTSYS) ret = -EINTR; - if (ret < 0 && (req->flags & REQ_F_LINK)) - req->flags |= REQ_F_FAIL_LINK; + if (ret < 0) + req_set_fail_links(req); io_cqring_add_event(req, ret); io_put_req_find_next(req, nxt); return 0; +} + +static void io_accept_finish(struct io_wq_work **workptr) +{ + struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); + struct io_kiocb *nxt = NULL; + + if (io_req_cancelled(req)) + return; + __io_accept(req, &nxt, false); + if (nxt) + *workptr = &nxt->work; +} +#endif + +static int io_accept(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ +#if defined(CONFIG_NET) + int ret; + + ret = io_accept_prep(req); + if (ret) + return ret; + + ret = __io_accept(req, nxt, force_nonblock); + if (ret == -EAGAIN && force_nonblock) { + req->work.func = io_accept_finish; + req->work.flags |= IO_WQ_WORK_NEEDS_FILES; + io_put_req(req); + return -EAGAIN; + } + return 0; #else return -EOPNOTSUPP; #endif } -static int io_connect(struct io_kiocb *req, const struct io_uring_sqe *sqe, - struct io_kiocb **nxt, bool force_nonblock) +static int io_connect_prep(struct io_kiocb *req, struct io_async_ctx *io) { #if defined(CONFIG_NET) + const struct io_uring_sqe *sqe = req->sqe; struct sockaddr __user *addr; + int addr_len; + + addr = (struct sockaddr __user *) (unsigned long) READ_ONCE(sqe->addr); + addr_len = READ_ONCE(sqe->addr2); + return move_addr_to_kernel(addr, addr_len, &io->connect.address); +#else + return 0; +#endif +} + +static int io_connect(struct io_kiocb *req, struct io_kiocb **nxt, + bool force_nonblock) +{ +#if defined(CONFIG_NET) + const struct io_uring_sqe *sqe = req->sqe; + struct io_async_ctx __io, *io; unsigned file_flags; int addr_len, ret; @@ -1998,17 +2429,35 @@ static int io_connect(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags) return -EINVAL; - addr = (struct sockaddr __user *) (unsigned long) READ_ONCE(sqe->addr); addr_len = READ_ONCE(sqe->addr2); file_flags = force_nonblock ? 
O_NONBLOCK : 0; - ret = __sys_connect_file(req->file, addr, addr_len, file_flags); - if (ret == -EAGAIN && force_nonblock) + if (req->io) { + io = req->io; + } else { + ret = io_connect_prep(req, &__io); + if (ret) + goto out; + io = &__io; + } + + ret = __sys_connect_file(req->file, &io->connect.address, addr_len, + file_flags); + if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) { + if (req->io) + return -EAGAIN; + if (io_alloc_async_ctx(req)) { + ret = -ENOMEM; + goto out; + } + memcpy(&req->io->connect, &__io.connect, sizeof(__io.connect)); return -EAGAIN; + } if (ret == -ERESTARTSYS) ret = -EINTR; - if (ret < 0 && (req->flags & REQ_F_LINK)) - req->flags |= REQ_F_FAIL_LINK; +out: + if (ret < 0) + req_set_fail_links(req); io_cqring_add_event(req, ret); io_put_req_find_next(req, nxt); return 0; @@ -2017,55 +2466,45 @@ static int io_connect(struct io_kiocb *req, const struct io_uring_sqe *sqe, #endif } -static inline void io_poll_remove_req(struct io_kiocb *req) -{ - if (!RB_EMPTY_NODE(&req->rb_node)) { - rb_erase(&req->rb_node, &req->ctx->cancel_tree); - RB_CLEAR_NODE(&req->rb_node); - } -} - static void io_poll_remove_one(struct io_kiocb *req) { struct io_poll_iocb *poll = &req->poll; spin_lock(&poll->head->lock); WRITE_ONCE(poll->canceled, true); - if (!list_empty(&poll->wait->entry)) { - list_del_init(&poll->wait->entry); + if (!list_empty(&poll->wait.entry)) { + list_del_init(&poll->wait.entry); io_queue_async_work(req); } spin_unlock(&poll->head->lock); - io_poll_remove_req(req); + hash_del(&req->hash_node); } static void io_poll_remove_all(struct io_ring_ctx *ctx) { - struct rb_node *node; + struct hlist_node *tmp; struct io_kiocb *req; + int i; spin_lock_irq(&ctx->completion_lock); - while ((node = rb_first(&ctx->cancel_tree)) != NULL) { - req = rb_entry(node, struct io_kiocb, rb_node); - io_poll_remove_one(req); + for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { + struct hlist_head *list; + + list = &ctx->cancel_hash[i]; + hlist_for_each_entry_safe(req, tmp, list, hash_node) + io_poll_remove_one(req); } spin_unlock_irq(&ctx->completion_lock); } static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr) { - struct rb_node *p, *parent = NULL; + struct hlist_head *list; struct io_kiocb *req; - p = ctx->cancel_tree.rb_node; - while (p) { - parent = p; - req = rb_entry(parent, struct io_kiocb, rb_node); - if (sqe_addr < req->user_data) { - p = p->rb_left; - } else if (sqe_addr > req->user_data) { - p = p->rb_right; - } else { + list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)]; + hlist_for_each_entry(req, list, hash_node) { + if (sqe_addr == req->user_data) { io_poll_remove_one(req); return 0; } @@ -2074,28 +2513,45 @@ static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr) return -ENOENT; } +static int io_poll_remove_prep(struct io_kiocb *req) +{ + const struct io_uring_sqe *sqe = req->sqe; + + if (req->flags & REQ_F_PREPPED) + return 0; + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index || + sqe->poll_events) + return -EINVAL; + + req->poll.addr = READ_ONCE(sqe->addr); + req->flags |= REQ_F_PREPPED; + return 0; +} + /* * Find a running poll command that matches one specified in sqe->addr, * and remove it if found. 
*/ -static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe) +static int io_poll_remove(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; + u64 addr; int ret; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index || - sqe->poll_events) - return -EINVAL; + ret = io_poll_remove_prep(req); + if (ret) + return ret; + addr = req->poll.addr; spin_lock_irq(&ctx->completion_lock); - ret = io_poll_cancel(ctx, READ_ONCE(sqe->addr)); + ret = io_poll_cancel(ctx, addr); spin_unlock_irq(&ctx->completion_lock); io_cqring_add_event(req, ret); - if (ret < 0 && (req->flags & REQ_F_LINK)) - req->flags |= REQ_F_FAIL_LINK; + if (ret < 0) + req_set_fail_links(req); io_put_req(req); return 0; } @@ -2105,7 +2561,6 @@ static void io_poll_complete(struct io_kiocb *req, __poll_t mask, int error) struct io_ring_ctx *ctx = req->ctx; req->poll.done = true; - kfree(req->poll.wait); if (error) io_cqring_fill_event(req, error); else @@ -2143,18 +2598,18 @@ static void io_poll_complete_work(struct io_wq_work **workptr) */ spin_lock_irq(&ctx->completion_lock); if (!mask && ret != -ECANCELED) { - add_wait_queue(poll->head, poll->wait); + add_wait_queue(poll->head, &poll->wait); spin_unlock_irq(&ctx->completion_lock); return; } - io_poll_remove_req(req); + hash_del(&req->hash_node); io_poll_complete(req, mask, ret); spin_unlock_irq(&ctx->completion_lock); io_cqring_ev_posted(ctx); - if (ret < 0 && req->flags & REQ_F_LINK) - req->flags |= REQ_F_FAIL_LINK; + if (ret < 0) + req_set_fail_links(req); io_put_req_find_next(req, &nxt); if (nxt) *workptr = &nxt->work; @@ -2173,7 +2628,7 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, if (mask && !(mask & poll->events)) return 0; - list_del_init(&poll->wait->entry); + list_del_init(&poll->wait.entry); /* * Run completion inline if we can. We're using trylock here because @@ -2182,7 +2637,7 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, * for finalizing the request, mark us as having grabbed that already. 
*/ if (mask && spin_trylock_irqsave(&ctx->completion_lock, flags)) { - io_poll_remove_req(req); + hash_del(&req->hash_node); io_poll_complete(req, mask, 0); req->flags |= REQ_F_COMP_LOCKED; io_put_req(req); @@ -2214,38 +2669,26 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, pt->error = 0; pt->req->poll.head = head; - add_wait_queue(head, pt->req->poll.wait); + add_wait_queue(head, &pt->req->poll.wait); } static void io_poll_req_insert(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - struct rb_node **p = &ctx->cancel_tree.rb_node; - struct rb_node *parent = NULL; - struct io_kiocb *tmp; - - while (*p) { - parent = *p; - tmp = rb_entry(parent, struct io_kiocb, rb_node); - if (req->user_data < tmp->user_data) - p = &(*p)->rb_left; - else - p = &(*p)->rb_right; - } - rb_link_node(&req->rb_node, parent, p); - rb_insert_color(&req->rb_node, &ctx->cancel_tree); + struct hlist_head *list; + + list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)]; + hlist_add_head(&req->hash_node, list); } -static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe, - struct io_kiocb **nxt) +static int io_poll_add_prep(struct io_kiocb *req) { + const struct io_uring_sqe *sqe = req->sqe; struct io_poll_iocb *poll = &req->poll; - struct io_ring_ctx *ctx = req->ctx; - struct io_poll_table ipt; - bool cancel = false; - __poll_t mask; u16 events; + if (req->flags & REQ_F_PREPPED) + return 0; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index) @@ -2253,15 +2696,27 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (!poll->file) return -EBADF; - poll->wait = kmalloc(sizeof(*poll->wait), GFP_KERNEL); - if (!poll->wait) - return -ENOMEM; - - req->sqe = NULL; - INIT_IO_WORK(&req->work, io_poll_complete_work); + req->flags |= REQ_F_PREPPED; events = READ_ONCE(sqe->poll_events); poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP; - RB_CLEAR_NODE(&req->rb_node); + return 0; +} + +static int io_poll_add(struct io_kiocb *req, struct io_kiocb **nxt) +{ + struct io_poll_iocb *poll = &req->poll; + struct io_ring_ctx *ctx = req->ctx; + struct io_poll_table ipt; + bool cancel = false; + __poll_t mask; + int ret; + + ret = io_poll_add_prep(req); + if (ret) + return ret; + + INIT_IO_WORK(&req->work, io_poll_complete_work); + INIT_HLIST_NODE(&req->hash_node); poll->head = NULL; poll->done = false; @@ -2273,9 +2728,9 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe, ipt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */ /* initialized the list so that we can do list_empty checks */ - INIT_LIST_HEAD(&poll->wait->entry); - init_waitqueue_func_entry(poll->wait, io_poll_wake); - poll->wait->private = poll; + INIT_LIST_HEAD(&poll->wait.entry); + init_waitqueue_func_entry(&poll->wait, io_poll_wake); + poll->wait.private = poll; INIT_LIST_HEAD(&req->list); @@ -2284,14 +2739,14 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe, spin_lock_irq(&ctx->completion_lock); if (likely(poll->head)) { spin_lock(&poll->head->lock); - if (unlikely(list_empty(&poll->wait->entry))) { + if (unlikely(list_empty(&poll->wait.entry))) { if (ipt.error) cancel = true; ipt.error = 0; mask = 0; } if (mask || ipt.error) - list_del_init(&poll->wait->entry); + list_del_init(&poll->wait.entry); else if (cancel) WRITE_ONCE(poll->canceled, true); else if (!poll->done) /* actually waiting for an 
event */ @@ -2331,7 +2786,7 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) /* * Adjust the reqs sequence before the current one because it - * will consume a slot in the cq_ring and the the cq_tail + * will consume a slot in the cq_ring and the cq_tail * pointer will be increased, otherwise other timeout reqs may * return in advance without waiting for enough wait_nr. */ @@ -2346,8 +2801,7 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) spin_unlock_irqrestore(&ctx->completion_lock, flags); io_cqring_ev_posted(ctx); - if (req->flags & REQ_F_LINK) - req->flags |= REQ_F_FAIL_LINK; + req_set_fail_links(req); io_put_req(req); return HRTIMER_NORESTART; } @@ -2368,49 +2822,63 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) if (ret == -ENOENT) return ret; - ret = hrtimer_try_to_cancel(&req->timeout.data->timer); + ret = hrtimer_try_to_cancel(&req->io->timeout.timer); if (ret == -1) return -EALREADY; - if (req->flags & REQ_F_LINK) - req->flags |= REQ_F_FAIL_LINK; + req_set_fail_links(req); io_cqring_fill_event(req, -ECANCELED); io_put_req(req); return 0; } +static int io_timeout_remove_prep(struct io_kiocb *req) +{ + const struct io_uring_sqe *sqe = req->sqe; + + if (req->flags & REQ_F_PREPPED) + return 0; + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->len) + return -EINVAL; + + req->timeout.addr = READ_ONCE(sqe->addr); + req->timeout.flags = READ_ONCE(sqe->timeout_flags); + if (req->timeout.flags) + return -EINVAL; + + req->flags |= REQ_F_PREPPED; + return 0; +} + /* * Remove or update an existing timeout command */ -static int io_timeout_remove(struct io_kiocb *req, - const struct io_uring_sqe *sqe) +static int io_timeout_remove(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - unsigned flags; int ret; - if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->len) - return -EINVAL; - flags = READ_ONCE(sqe->timeout_flags); - if (flags) - return -EINVAL; + ret = io_timeout_remove_prep(req); + if (ret) + return ret; spin_lock_irq(&ctx->completion_lock); - ret = io_timeout_cancel(ctx, READ_ONCE(sqe->addr)); + ret = io_timeout_cancel(ctx, req->timeout.addr); io_cqring_fill_event(req, ret); io_commit_cqring(ctx); spin_unlock_irq(&ctx->completion_lock); io_cqring_ev_posted(ctx); - if (ret < 0 && req->flags & REQ_F_LINK) - req->flags |= REQ_F_FAIL_LINK; + if (ret < 0) + req_set_fail_links(req); io_put_req(req); return 0; } -static int io_timeout_setup(struct io_kiocb *req) +static int io_timeout_prep(struct io_kiocb *req, struct io_async_ctx *io, + bool is_timeout_link) { const struct io_uring_sqe *sqe = req->sqe; struct io_timeout_data *data; @@ -2420,15 +2888,14 @@ static int io_timeout_setup(struct io_kiocb *req) return -EINVAL; if (sqe->ioprio || sqe->buf_index || sqe->len != 1) return -EINVAL; + if (sqe->off && is_timeout_link) + return -EINVAL; flags = READ_ONCE(sqe->timeout_flags); if (flags & ~IORING_TIMEOUT_ABS) return -EINVAL; - data = kzalloc(sizeof(struct io_timeout_data), GFP_KERNEL); - if (!data) - return -ENOMEM; + data = &io->timeout; data->req = req; - req->timeout.data = data; req->flags |= REQ_F_TIMEOUT; if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr))) @@ -2443,8 +2910,9 @@ static int io_timeout_setup(struct io_kiocb *req) return 0; } -static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) +static int io_timeout(struct io_kiocb 
*req) { + const struct io_uring_sqe *sqe = req->sqe; unsigned count; struct io_ring_ctx *ctx = req->ctx; struct io_timeout_data *data; @@ -2452,12 +2920,14 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) unsigned span = 0; int ret; - ret = io_timeout_setup(req); - /* common setup allows flags (like links) set, we don't */ - if (!ret && sqe->flags) - ret = -EINVAL; - if (ret) - return ret; + if (!req->io) { + if (io_alloc_async_ctx(req)) + return -ENOMEM; + ret = io_timeout_prep(req, req->io, false); + if (ret) + return ret; + } + data = &req->io->timeout; /* * sqe->off holds how many events that need to occur for this @@ -2473,7 +2943,7 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) } req->sequence = ctx->cached_sq_head + count - 1; - req->timeout.data->seq_offset = count; + data->seq_offset = count; /* * Insertion sort, ensuring the first entry in the list is always @@ -2484,7 +2954,7 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list); unsigned nxt_sq_head; long long tmp, tmp_nxt; - u32 nxt_offset = nxt->timeout.data->seq_offset; + u32 nxt_offset = nxt->io->timeout.seq_offset; if (nxt->flags & REQ_F_TIMEOUT_NOSEQ) continue; @@ -2517,7 +2987,6 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) req->sequence -= span; add: list_add(&req->list, entry); - data = req->timeout.data; data->timer.function = io_timeout_fn; hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); spin_unlock_irq(&ctx->completion_lock); @@ -2578,50 +3047,141 @@ done: spin_unlock_irqrestore(&ctx->completion_lock, flags); io_cqring_ev_posted(ctx); - if (ret < 0 && (req->flags & REQ_F_LINK)) - req->flags |= REQ_F_FAIL_LINK; + if (ret < 0) + req_set_fail_links(req); io_put_req_find_next(req, nxt); } -static int io_async_cancel(struct io_kiocb *req, const struct io_uring_sqe *sqe, - struct io_kiocb **nxt) +static int io_async_cancel_prep(struct io_kiocb *req) { - struct io_ring_ctx *ctx = req->ctx; + const struct io_uring_sqe *sqe = req->sqe; - if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) + if (req->flags & REQ_F_PREPPED) + return 0; + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; if (sqe->flags || sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags) return -EINVAL; - io_async_find_and_cancel(ctx, req, READ_ONCE(sqe->addr), nxt, 0); + req->flags |= REQ_F_PREPPED; + req->cancel.addr = READ_ONCE(sqe->addr); + return 0; +} + +static int io_async_cancel(struct io_kiocb *req, struct io_kiocb **nxt) +{ + struct io_ring_ctx *ctx = req->ctx; + int ret; + + ret = io_async_cancel_prep(req); + if (ret) + return ret; + + io_async_find_and_cancel(ctx, req, req->cancel.addr, nxt, 0); return 0; } +static int io_req_defer_prep(struct io_kiocb *req) +{ + struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; + struct io_async_ctx *io = req->io; + struct iov_iter iter; + ssize_t ret = 0; + + switch (req->opcode) { + case IORING_OP_NOP: + break; + case IORING_OP_READV: + case IORING_OP_READ_FIXED: + /* ensure prep does right import */ + req->io = NULL; + ret = io_read_prep(req, &iovec, &iter, true); + req->io = io; + if (ret < 0) + break; + io_req_map_rw(req, ret, iovec, inline_vecs, &iter); + ret = 0; + break; + case IORING_OP_WRITEV: + case IORING_OP_WRITE_FIXED: + /* ensure prep does right import */ + req->io = NULL; + ret = io_write_prep(req, &iovec, &iter, true); + req->io = io; + if (ret < 0) + break; + 
io_req_map_rw(req, ret, iovec, inline_vecs, &iter); + ret = 0; + break; + case IORING_OP_POLL_ADD: + ret = io_poll_add_prep(req); + break; + case IORING_OP_POLL_REMOVE: + ret = io_poll_remove_prep(req); + break; + case IORING_OP_FSYNC: + ret = io_prep_fsync(req); + break; + case IORING_OP_SYNC_FILE_RANGE: + ret = io_prep_sfr(req); + break; + case IORING_OP_SENDMSG: + ret = io_sendmsg_prep(req, io); + break; + case IORING_OP_RECVMSG: + ret = io_recvmsg_prep(req, io); + break; + case IORING_OP_CONNECT: + ret = io_connect_prep(req, io); + break; + case IORING_OP_TIMEOUT: + ret = io_timeout_prep(req, io, false); + break; + case IORING_OP_TIMEOUT_REMOVE: + ret = io_timeout_remove_prep(req); + break; + case IORING_OP_ASYNC_CANCEL: + ret = io_async_cancel_prep(req); + break; + case IORING_OP_LINK_TIMEOUT: + ret = io_timeout_prep(req, io, true); + break; + case IORING_OP_ACCEPT: + ret = io_accept_prep(req); + break; + default: + printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", + req->opcode); + ret = -EINVAL; + break; + } + + return ret; +} + static int io_req_defer(struct io_kiocb *req) { - struct io_uring_sqe *sqe_copy; struct io_ring_ctx *ctx = req->ctx; + int ret; /* Still need defer if there is pending req in defer list. */ if (!req_need_defer(req) && list_empty(&ctx->defer_list)) return 0; - sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL); - if (!sqe_copy) + if (io_alloc_async_ctx(req)) return -EAGAIN; + ret = io_req_defer_prep(req); + if (ret < 0) + return ret; + spin_lock_irq(&ctx->completion_lock); if (!req_need_defer(req) && list_empty(&ctx->defer_list)) { spin_unlock_irq(&ctx->completion_lock); - kfree(sqe_copy); return 0; } - memcpy(sqe_copy, req->sqe, sizeof(*sqe_copy)); - req->flags |= REQ_F_FREE_SQE; - req->sqe = sqe_copy; - trace_io_uring_defer(ctx, req, req->user_data); list_add_tail(&req->list, &ctx->defer_list); spin_unlock_irq(&ctx->completion_lock); @@ -2632,11 +3192,10 @@ __attribute__((nonnull)) static int io_issue_sqe(struct io_kiocb *req, struct io_kiocb **nxt, bool force_nonblock) { - int ret, opcode; struct io_ring_ctx *ctx = req->ctx; + int ret; - opcode = READ_ONCE(req->sqe->opcode); - switch (opcode) { + switch (req->opcode) { case IORING_OP_NOP: ret = io_nop(req); break; @@ -2657,37 +3216,37 @@ static int io_issue_sqe(struct io_kiocb *req, struct io_kiocb **nxt, ret = io_write(req, nxt, force_nonblock); break; case IORING_OP_FSYNC: - ret = io_fsync(req, req->sqe, nxt, force_nonblock); + ret = io_fsync(req, nxt, force_nonblock); break; case IORING_OP_POLL_ADD: - ret = io_poll_add(req, req->sqe, nxt); + ret = io_poll_add(req, nxt); break; case IORING_OP_POLL_REMOVE: - ret = io_poll_remove(req, req->sqe); + ret = io_poll_remove(req); break; case IORING_OP_SYNC_FILE_RANGE: - ret = io_sync_file_range(req, req->sqe, nxt, force_nonblock); + ret = io_sync_file_range(req, nxt, force_nonblock); break; case IORING_OP_SENDMSG: - ret = io_sendmsg(req, req->sqe, nxt, force_nonblock); + ret = io_sendmsg(req, nxt, force_nonblock); break; case IORING_OP_RECVMSG: - ret = io_recvmsg(req, req->sqe, nxt, force_nonblock); + ret = io_recvmsg(req, nxt, force_nonblock); break; case IORING_OP_TIMEOUT: - ret = io_timeout(req, req->sqe); + ret = io_timeout(req); break; case IORING_OP_TIMEOUT_REMOVE: - ret = io_timeout_remove(req, req->sqe); + ret = io_timeout_remove(req); break; case IORING_OP_ACCEPT: - ret = io_accept(req, req->sqe, nxt, force_nonblock); + ret = io_accept(req, nxt, force_nonblock); break; case IORING_OP_CONNECT: - ret = io_connect(req, req->sqe, nxt, 
force_nonblock);
+		ret = io_connect(req, nxt, force_nonblock);
 		break;
 	case IORING_OP_ASYNC_CANCEL:
-		ret = io_async_cancel(req, req->sqe, nxt);
+		ret = io_async_cancel(req, nxt);
 		break;
 	default:
 		ret = -EINVAL;
@@ -2701,12 +3260,7 @@ static int io_issue_sqe(struct io_kiocb *req, struct io_kiocb **nxt,
 		if (req->result == -EAGAIN)
 			return -EAGAIN;
 
-		/* workqueue context doesn't hold uring_lock, grab it now */
-		if (req->in_async)
-			mutex_lock(&ctx->uring_lock);
 		io_iopoll_req_issued(req);
-		if (req->in_async)
-			mutex_unlock(&ctx->uring_lock);
 	}
 
 	return 0;
@@ -2728,9 +3282,6 @@ static void io_wq_submit_work(struct io_wq_work **workptr)
 	struct io_kiocb *nxt = NULL;
 	int ret = 0;
 
-	/* Ensure we clear previously set non-block flag */
-	req->rw.ki_flags &= ~IOCB_NOWAIT;
-
 	if (work->flags & IO_WQ_WORK_CANCEL)
 		ret = -ECANCELED;
 
@@ -2754,8 +3305,7 @@ static void io_wq_submit_work(struct io_wq_work **workptr)
 		io_put_req(req);
 
 	if (ret) {
-		if (req->flags & REQ_F_LINK)
-			req->flags |= REQ_F_FAIL_LINK;
+		req_set_fail_links(req);
 		io_cqring_add_event(req, ret);
 		io_put_req(req);
 	}
@@ -2774,20 +3324,25 @@ static void io_wq_submit_work(struct io_wq_work **workptr)
 	}
 }
 
-static bool io_op_needs_file(const struct io_uring_sqe *sqe)
+static bool io_req_op_valid(int op)
 {
-	int op = READ_ONCE(sqe->opcode);
+	return op >= IORING_OP_NOP && op < IORING_OP_LAST;
+}
 
-	switch (op) {
+static int io_req_needs_file(struct io_kiocb *req)
+{
+	switch (req->opcode) {
 	case IORING_OP_NOP:
 	case IORING_OP_POLL_REMOVE:
 	case IORING_OP_TIMEOUT:
 	case IORING_OP_TIMEOUT_REMOVE:
 	case IORING_OP_ASYNC_CANCEL:
 	case IORING_OP_LINK_TIMEOUT:
-		return false;
+		return 0;
 	default:
-		return true;
+		if (io_req_op_valid(req->opcode))
+			return 1;
+		return -EINVAL;
 	}
 }
 
@@ -2804,7 +3359,7 @@ static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req)
 {
 	struct io_ring_ctx *ctx = req->ctx;
 	unsigned flags;
-	int fd;
+	int fd, ret;
 
 	flags = READ_ONCE(req->sqe->flags);
 	fd = READ_ONCE(req->sqe->fd);
@@ -2812,8 +3367,9 @@ static int io_req_set_file(struct io_submit_state *state, struct io_kiocb *req)
 	if (flags & IOSQE_IO_DRAIN)
 		req->flags |= REQ_F_IO_DRAIN;
 
-	if (!io_op_needs_file(req->sqe))
-		return 0;
+	ret = io_req_needs_file(req);
+	if (ret <= 0)
+		return ret;
 
 	if (flags & IOSQE_FIXED_FILE) {
 		if (unlikely(!ctx->file_table ||
@@ -2876,10 +3432,11 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
 	 * We don't expect the list to be empty, that will only happen if we
 	 * race with the completion of the linked work.
 	 */
-	if (!list_empty(&req->list)) {
-		prev = list_entry(req->list.prev, struct io_kiocb, link_list);
+	if (!list_empty(&req->link_list)) {
+		prev = list_entry(req->link_list.prev, struct io_kiocb,
+				  link_list);
 		if (refcount_inc_not_zero(&prev->refs)) {
-			list_del_init(&req->list);
+			list_del_init(&req->link_list);
 			prev->flags &= ~REQ_F_LINK_TIMEOUT;
 		} else
 			prev = NULL;
@@ -2888,8 +3445,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
 	spin_unlock_irqrestore(&ctx->completion_lock, flags);
 
 	if (prev) {
-		if (prev->flags & REQ_F_LINK)
-			prev->flags |= REQ_F_FAIL_LINK;
+		req_set_fail_links(prev);
 		io_async_find_and_cancel(ctx, req, prev->user_data, NULL,
 						-ETIME);
 		io_put_req(prev);
@@ -2909,8 +3465,8 @@ static void io_queue_linked_timeout(struct io_kiocb *req)
 	 * we got a chance to setup the timer
 	 */
 	spin_lock_irq(&ctx->completion_lock);
-	if (!list_empty(&req->list)) {
-		struct io_timeout_data *data = req->timeout.data;
+	if (!list_empty(&req->link_list)) {
+		struct io_timeout_data *data = &req->io->timeout;
 
 		data->timer.function = io_link_timeout_fn;
 		hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
@@ -2929,8 +3485,9 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
 	if (!(req->flags & REQ_F_LINK))
 		return NULL;
 
-	nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, list);
-	if (!nxt || nxt->sqe->opcode != IORING_OP_LINK_TIMEOUT)
+	nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb,
+					link_list);
+	if (!nxt || nxt->opcode != IORING_OP_LINK_TIMEOUT)
 		return NULL;
 
 	req->flags |= REQ_F_LINK_TIMEOUT;
@@ -2939,13 +3496,14 @@ static void __io_queue_sqe(struct io_kiocb *req)
 {
-	struct io_kiocb *linked_timeout = io_prep_linked_timeout(req);
+	struct io_kiocb *linked_timeout;
 	struct io_kiocb *nxt = NULL;
 	int ret;
 
+again:
+	linked_timeout = io_prep_linked_timeout(req);
+
 	ret = io_issue_sqe(req, &nxt, true);
-	if (nxt)
-		io_queue_async_work(nxt);
 
 	/*
 	 * We async punt it if the file wasn't marked NOWAIT, or if the file
@@ -2953,15 +3511,6 @@ static void __io_queue_sqe(struct io_kiocb *req)
 	if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) ||
 	    (req->flags & REQ_F_MUST_PUNT))) {
-		struct io_uring_sqe *sqe_copy;
-
-		sqe_copy = kmemdup(req->sqe, sizeof(*sqe_copy), GFP_KERNEL);
-		if (!sqe_copy)
-			goto err;
-
-		req->sqe = sqe_copy;
-		req->flags |= REQ_F_FREE_SQE;
-
 		if (req->work.flags & IO_WQ_WORK_NEEDS_FILES) {
 			ret = io_grab_files(req);
 			if (ret)
@@ -2973,7 +3522,7 @@ static void __io_queue_sqe(struct io_kiocb *req)
 		 * submit reference when the iocb is actually submitted.
 		 */
 		io_queue_async_work(req);
-		return;
+		goto done_req;
 	}
 
 err:
@@ -2990,10 +3539,15 @@ err:
 	/* and drop final reference, if we failed */
 	if (ret) {
 		io_cqring_add_event(req, ret);
-		if (req->flags & REQ_F_LINK)
-			req->flags |= REQ_F_FAIL_LINK;
+		req_set_fail_links(req);
 		io_put_req(req);
 	}
+done_req:
+	if (nxt) {
+		req = nxt;
+		nxt = NULL;
+		goto again;
+	}
 }
 
 static void io_queue_sqe(struct io_kiocb *req)
@@ -3010,8 +3564,7 @@ static void io_queue_sqe(struct io_kiocb *req)
 	if (ret) {
 		if (ret != -EIOCBQUEUED) {
 			io_cqring_add_event(req, ret);
-			if (req->flags & REQ_F_LINK)
-				req->flags |= REQ_F_FAIL_LINK;
+			req_set_fail_links(req);
 			io_double_put_req(req);
 		}
 	} else
@@ -3027,17 +3580,15 @@ static inline void io_queue_link_head(struct io_kiocb *req)
 		io_queue_sqe(req);
 }
 
+#define SQE_VALID_FLAGS	(IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \
+				IOSQE_IO_HARDLINK)
 
-#define SQE_VALID_FLAGS	(IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK)
-
-static void io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state,
+static bool io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state,
 			  struct io_kiocb **link)
 {
 	struct io_ring_ctx *ctx = req->ctx;
 	int ret;
 
-	req->user_data = req->sqe->user_data;
-
 	/* enforce forwards compatibility on users */
 	if (unlikely(req->sqe->flags & ~SQE_VALID_FLAGS)) {
 		ret = -EINVAL;
@@ -3049,7 +3600,7 @@ static void io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state,
 err_req:
 		io_cqring_add_event(req, ret);
 		io_double_put_req(req);
-		return;
+		return false;
 	}
 
 	/*
@@ -3061,40 +3612,38 @@ err_req:
 	 */
 	if (*link) {
 		struct io_kiocb *prev = *link;
-		struct io_uring_sqe *sqe_copy;
 
 		if (req->sqe->flags & IOSQE_IO_DRAIN)
 			(*link)->flags |= REQ_F_DRAIN_LINK | REQ_F_IO_DRAIN;
 
-		if (READ_ONCE(req->sqe->opcode) == IORING_OP_LINK_TIMEOUT) {
-			ret = io_timeout_setup(req);
-			/* common setup allows offset being set, we don't */
-			if (!ret && req->sqe->off)
-				ret = -EINVAL;
-			if (ret) {
-				prev->flags |= REQ_F_FAIL_LINK;
-				goto err_req;
-			}
-		}
+		if (req->sqe->flags & IOSQE_IO_HARDLINK)
+			req->flags |= REQ_F_HARDLINK;
 
-		sqe_copy = kmemdup(req->sqe, sizeof(*sqe_copy), GFP_KERNEL);
-		if (!sqe_copy) {
+		if (io_alloc_async_ctx(req)) {
 			ret = -EAGAIN;
 			goto err_req;
 		}
 
-		req->sqe = sqe_copy;
-		req->flags |= REQ_F_FREE_SQE;
+		ret = io_req_defer_prep(req);
+		if (ret) {
+			/* fail even hard links since we don't submit */
+			prev->flags |= REQ_F_FAIL_LINK;
+			goto err_req;
+		}
 		trace_io_uring_link(ctx, req, prev);
-		list_add_tail(&req->list, &prev->link_list);
-	} else if (req->sqe->flags & IOSQE_IO_LINK) {
+		list_add_tail(&req->link_list, &prev->link_list);
+	} else if (req->sqe->flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) {
		req->flags |= REQ_F_LINK;
+		if (req->sqe->flags & IOSQE_IO_HARDLINK)
+			req->flags |= REQ_F_HARDLINK;
 		INIT_LIST_HEAD(&req->link_list);
 		*link = req;
 	} else {
 		io_queue_sqe(req);
 	}
+
+	return true;
 }
 
 /*
@@ -3113,7 +3662,7 @@ static void io_submit_state_end(struct io_submit_state *state)
  * Start submission side cache.
  */
 static void io_submit_state_start(struct io_submit_state *state,
-				  struct io_ring_ctx *ctx, unsigned max_ios)
+				  unsigned int max_ios)
 {
 	blk_start_plug(&state->plug);
 	state->free_reqs = 0;
@@ -3136,7 +3685,7 @@ static void io_commit_sqring(struct io_ring_ctx *ctx)
 }
 
 /*
- * Fetch an sqe, if one is available. Note that s->sqe will point to memory
+ * Fetch an sqe, if one is available. Note that req->sqe will point to memory
  * that is mapped by userspace. This means that care needs to be taken to
  * ensure that reads are stable, as we cannot rely on userspace always
  * being a good citizen. If members of the sqe are validated and then later
@@ -3171,6 +3720,8 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct io_kiocb *req)
 		 */
 		req->sequence = ctx->cached_sq_head;
 		req->sqe = &ctx->sq_sqes[head];
+		req->opcode = READ_ONCE(req->sqe->opcode);
+		req->user_data = READ_ONCE(req->sqe->user_data);
 		ctx->cached_sq_head++;
 		return true;
 	}
@@ -3197,7 +3748,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 		return -EBUSY;
 
 	if (nr > IO_PLUG_THRESHOLD) {
-		io_submit_state_start(&state, ctx, nr);
+		io_submit_state_start(&state, nr);
 		statep = &state;
 	}
 
@@ -3216,7 +3767,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 			break;
 		}
 
-		if (io_sqe_needs_user(req->sqe) && !*mm) {
+		if (io_req_needs_user(req) && !*mm) {
 			mm_fault = mm_fault || !mmget_not_zero(ctx->sqo_mm);
 			if (!mm_fault) {
 				use_mm(ctx->sqo_mm);
@@ -3224,6 +3775,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 			}
 		}
 
+		submitted++;
 		sqe_flags = req->sqe->flags;
 
 		req->ring_file = ring_file;
@@ -3231,16 +3783,14 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 		req->has_user = *mm != NULL;
 		req->in_async = async;
 		req->needs_fixed_file = async;
-		trace_io_uring_submit_sqe(ctx, req->sqe->user_data,
-					  true, async);
-		io_submit_sqe(req, statep, &link);
-		submitted++;
-
+		trace_io_uring_submit_sqe(ctx, req->user_data, true, async);
+		if (!io_submit_sqe(req, statep, &link))
+			break;
 		/*
 		 * If previous wasn't linked and we have a linked command,
 		 * that's the end of the chain. Submit the previous link.
 		 */
-		if (!(sqe_flags & IOSQE_IO_LINK) && link) {
+		if (!(sqe_flags & (IOSQE_IO_LINK|IOSQE_IO_HARDLINK)) && link) {
 			io_queue_link_head(link);
 			link = NULL;
 		}
@@ -3369,7 +3919,9 @@ static int io_sq_thread(void *data)
 		}
 
 		to_submit = min(to_submit, ctx->sq_entries);
+		mutex_lock(&ctx->uring_lock);
 		ret = io_submit_sqes(ctx, to_submit, NULL, -1, &cur_mm, true);
+		mutex_unlock(&ctx->uring_lock);
 		if (ret > 0)
 			inflight += ret;
 	}
@@ -3398,7 +3950,7 @@ static inline bool io_should_wake(struct io_wait_queue *iowq, bool noflush)
 	struct io_ring_ctx *ctx = iowq->ctx;
 
 	/*
-	 * Wake up if we have enough events, or if a timeout occured since we
+	 * Wake up if we have enough events, or if a timeout occurred since we
 	 * started waiting. For timeouts, we always want to return to userspace,
 	 * regardless of event count.
 	 */
@@ -4363,6 +4915,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
 	free_uid(ctx->user);
 	put_cred(ctx->creds);
 	kfree(ctx->completions);
+	kfree(ctx->cancel_hash);
 	kmem_cache_free(req_cachep, ctx->fallback_req);
 	kfree(ctx);
 }
@@ -4587,6 +5140,9 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 		submitted = io_submit_sqes(ctx, to_submit, f.file, fd,
 					   &cur_mm, false);
 		mutex_unlock(&ctx->uring_lock);
+
+		if (submitted != to_submit)
+			goto out;
 	}
 	if (flags & IORING_ENTER_GETEVENTS) {
 		unsigned nr_events = 0;
@@ -4600,6 +5156,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 		}
 	}
 
+out:
 	percpu_ref_put(&ctx->refs);
 out_fput:
 	fdput(f);
@@ -4759,7 +5316,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p)
 	ctx->compat = in_compat_syscall();
 	ctx->account_mem = account_mem;
 	ctx->user = user;
-	ctx->creds = prepare_creds();
+	ctx->creds = get_current_cred();
 
 	ret = io_allocate_scq_urings(ctx, p);
 	if (ret)
@@ -4794,7 +5351,8 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p)
 	if (ret < 0)
 		goto err;
 
-	p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP;
+	p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
+			IORING_FEAT_SUBMIT_STABLE;
 	trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
 	return ret;
 err:
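Illustrative userspace sketch (not part of the patch above): the SQE_VALID_FLAGS, io_submit_sqe() and io_uring_create() hunks expose two userspace-visible additions, the IOSQE_IO_HARDLINK sqe flag and the IORING_FEAT_SUBMIT_STABLE feature bit. A minimal liburing-based example of how an application might probe the feature bit and queue a hard-linked pair of requests could look as follows; the nop requests, the ring size and the error handling are assumptions chosen purely for illustration.

/*
 * Illustrative sketch only, not part of the kernel patch above.
 * Assumes liburing headers/library are available; build with -luring.
 *
 * Shows the two userspace-visible additions from this diff:
 *  - IORING_FEAT_SUBMIT_STABLE: sqe contents may be reused by the
 *    application as soon as submission returns
 *  - IOSQE_IO_HARDLINK: link to the next sqe that is not severed when
 *    this request completes with an error
 */
#include <stdio.h>
#include <string.h>
#include <liburing.h>

int main(void)
{
	struct io_uring_params p;
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	int i, ret;

	memset(&p, 0, sizeof(p));
	ret = io_uring_queue_init_params(8, &ring, &p);
	if (ret < 0) {
		fprintf(stderr, "queue_init: %s\n", strerror(-ret));
		return 1;
	}

	if (!(p.features & IORING_FEAT_SUBMIT_STABLE))
		printf("kernel predates IORING_FEAT_SUBMIT_STABLE\n");

	/* first request, hard-linked to the one queued after it */
	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_nop(sqe);
	sqe->flags |= IOSQE_IO_HARDLINK;
	sqe->user_data = 1;

	/*
	 * second request; with a hard link it would still be issued
	 * even if the first request failed
	 */
	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_nop(sqe);
	sqe->user_data = 2;

	io_uring_submit(&ring);

	for (i = 0; i < 2; i++) {
		if (io_uring_wait_cqe(&ring, &cqe) < 0)
			break;
		printf("user_data %llu: res %d\n",
		       (unsigned long long) cqe->user_data, cqe->res);
		io_uring_cqe_seen(&ring, cqe);
	}

	io_uring_queue_exit(&ring);
	return 0;
}

On a kernel without this patch the IOSQE_IO_HARDLINK bit falls outside SQE_VALID_FLAGS, so io_submit_sqe() rejects the request and its CQE comes back with -EINVAL; checking the feature bits (or that CQE result) is one way an application might fall back to plain IOSQE_IO_LINK, though the feature bit is only an indirect hint that the flag is supported.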