Diffstat (limited to 'io_uring')
49 files changed, 1927 insertions, 929 deletions
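Much of the churn in the per-file hunks below is mechanical: the deprecated IOU_OK return code is removed from io_uring.h and every opcode handler now returns IOU_COMPLETE instead (both are defined as 0, as the io_uring.h hunk further down shows). As a hedged orientation sketch only, not code from this series, a handler following the new convention looks roughly like this; io_frobnicate() and do_frobnicate() are made-up names:

	int io_frobnicate(struct io_kiocb *req, unsigned int issue_flags)
	{
		int ret = do_frobnicate(req);	/* hypothetical helper doing the real work */

		if (ret < 0)
			req_set_fail(req);	/* mark the request failed, as the real handlers do */
		io_req_set_res(req, ret, 0);	/* stash the CQE result and flags */
		return IOU_COMPLETE;		/* was IOU_OK before this series; both are 0 */
	}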
diff --git a/io_uring/Makefile b/io_uring/Makefile index 3e28a741ca15..b3f1bd492804 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -7,11 +7,11 @@ GCOV_PROFILE := y endif obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \ - tctx.o filetable.o rw.o net.o poll.o \ + tctx.o filetable.o rw.o poll.o \ eventfd.o uring_cmd.o openclose.o \ sqpoll.o xattr.o nop.o fs.o splice.o \ sync.o msg_ring.o advise.o openclose.o \ - statx.o timeout.o fdinfo.o cancel.o \ + statx.o timeout.o cancel.o \ waitid.o register.o truncate.o \ memmap.o alloc_cache.o obj-$(CONFIG_IO_URING_ZCRX) += zcrx.o @@ -19,3 +19,6 @@ obj-$(CONFIG_IO_WQ) += io-wq.o obj-$(CONFIG_FUTEX) += futex.o obj-$(CONFIG_EPOLL) += epoll.o obj-$(CONFIG_NET_RX_BUSY_POLL) += napi.o +obj-$(CONFIG_NET) += net.o cmd_net.o +obj-$(CONFIG_PROC_FS) += fdinfo.o +obj-$(CONFIG_IO_URING_MOCK_FILE) += mock_file.o diff --git a/io_uring/advise.c b/io_uring/advise.c index cb7b881665e5..0073f74e3658 100644 --- a/io_uring/advise.c +++ b/io_uring/advise.c @@ -58,7 +58,7 @@ int io_madvise(struct io_kiocb *req, unsigned int issue_flags) ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; #else return -EOPNOTSUPP; #endif @@ -104,5 +104,5 @@ int io_fadvise(struct io_kiocb *req, unsigned int issue_flags) if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } diff --git a/io_uring/cancel.c b/io_uring/cancel.c index 0870060bac7c..6d57602304df 100644 --- a/io_uring/cancel.c +++ b/io_uring/cancel.c @@ -229,7 +229,7 @@ done: if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } static int __io_sync_cancel(struct io_uring_task *tctx, diff --git a/io_uring/cmd_net.c b/io_uring/cmd_net.c new file mode 100644 index 000000000000..3866fe6ff541 --- /dev/null +++ b/io_uring/cmd_net.c @@ -0,0 +1,165 @@ +#include <asm/ioctls.h> +#include <linux/io_uring/net.h> +#include <linux/errqueue.h> +#include <net/sock.h> + +#include "uring_cmd.h" + +static inline int io_uring_cmd_getsockopt(struct socket *sock, + struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + const struct io_uring_sqe *sqe = cmd->sqe; + bool compat = !!(issue_flags & IO_URING_F_COMPAT); + int optlen, optname, level, err; + void __user *optval; + + level = READ_ONCE(sqe->level); + if (level != SOL_SOCKET) + return -EOPNOTSUPP; + + optval = u64_to_user_ptr(READ_ONCE(sqe->optval)); + optname = READ_ONCE(sqe->optname); + optlen = READ_ONCE(sqe->optlen); + + err = do_sock_getsockopt(sock, compat, level, optname, + USER_SOCKPTR(optval), + KERNEL_SOCKPTR(&optlen)); + if (err) + return err; + + /* On success, return optlen */ + return optlen; +} + +static inline int io_uring_cmd_setsockopt(struct socket *sock, + struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + const struct io_uring_sqe *sqe = cmd->sqe; + bool compat = !!(issue_flags & IO_URING_F_COMPAT); + int optname, optlen, level; + void __user *optval; + sockptr_t optval_s; + + optval = u64_to_user_ptr(READ_ONCE(sqe->optval)); + optname = READ_ONCE(sqe->optname); + optlen = READ_ONCE(sqe->optlen); + level = READ_ONCE(sqe->level); + optval_s = USER_SOCKPTR(optval); + + return do_sock_setsockopt(sock, compat, level, optname, optval_s, + optlen); +} + +static bool io_process_timestamp_skb(struct io_uring_cmd *cmd, struct sock *sk, + struct sk_buff *skb, unsigned issue_flags) +{ + struct sock_exterr_skb *serr = SKB_EXT_ERR(skb); + struct io_uring_cqe cqe[2]; + struct 
io_timespec *iots; + struct timespec64 ts; + u32 tstype, tskey; + int ret; + + BUILD_BUG_ON(sizeof(struct io_uring_cqe) != sizeof(struct io_timespec)); + + ret = skb_get_tx_timestamp(skb, sk, &ts); + if (ret < 0) + return false; + + tskey = serr->ee.ee_data; + tstype = serr->ee.ee_info; + + cqe->user_data = 0; + cqe->res = tskey; + cqe->flags = IORING_CQE_F_MORE; + cqe->flags |= tstype << IORING_TIMESTAMP_TYPE_SHIFT; + if (ret == SOF_TIMESTAMPING_TX_HARDWARE) + cqe->flags |= IORING_CQE_F_TSTAMP_HW; + + iots = (struct io_timespec *)&cqe[1]; + iots->tv_sec = ts.tv_sec; + iots->tv_nsec = ts.tv_nsec; + return io_uring_cmd_post_mshot_cqe32(cmd, issue_flags, cqe); +} + +static int io_uring_cmd_timestamp(struct socket *sock, + struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + struct sock *sk = sock->sk; + struct sk_buff_head *q = &sk->sk_error_queue; + struct sk_buff *skb, *tmp; + struct sk_buff_head list; + int ret; + + if (!(issue_flags & IO_URING_F_CQE32)) + return -EINVAL; + ret = io_cmd_poll_multishot(cmd, issue_flags, EPOLLERR); + if (unlikely(ret)) + return ret; + + if (skb_queue_empty_lockless(q)) + return -EAGAIN; + __skb_queue_head_init(&list); + + scoped_guard(spinlock_irq, &q->lock) { + skb_queue_walk_safe(q, skb, tmp) { + /* don't support skbs with payload */ + if (!skb_has_tx_timestamp(skb, sk) || skb->len) + continue; + __skb_unlink(skb, q); + __skb_queue_tail(&list, skb); + } + } + + while (1) { + skb = skb_peek(&list); + if (!skb) + break; + if (!io_process_timestamp_skb(cmd, sk, skb, issue_flags)) + break; + __skb_dequeue(&list); + consume_skb(skb); + } + + if (!unlikely(skb_queue_empty(&list))) { + scoped_guard(spinlock_irqsave, &q->lock) + skb_queue_splice(q, &list); + } + return -EAGAIN; +} + +int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ + struct socket *sock = cmd->file->private_data; + struct sock *sk = sock->sk; + struct proto *prot = READ_ONCE(sk->sk_prot); + int ret, arg = 0; + + if (!prot || !prot->ioctl) + return -EOPNOTSUPP; + + switch (cmd->cmd_op) { + case SOCKET_URING_OP_SIOCINQ: + ret = prot->ioctl(sk, SIOCINQ, &arg); + if (ret) + return ret; + return arg; + case SOCKET_URING_OP_SIOCOUTQ: + ret = prot->ioctl(sk, SIOCOUTQ, &arg); + if (ret) + return ret; + return arg; + case SOCKET_URING_OP_GETSOCKOPT: + return io_uring_cmd_getsockopt(sock, cmd, issue_flags); + case SOCKET_URING_OP_SETSOCKOPT: + return io_uring_cmd_setsockopt(sock, cmd, issue_flags); + case SOCKET_URING_OP_TX_TIMESTAMP: + return io_uring_cmd_timestamp(sock, cmd, issue_flags); + default: + return -EOPNOTSUPP; + } +} +EXPORT_SYMBOL_GPL(io_uring_cmd_sock); diff --git a/io_uring/epoll.c b/io_uring/epoll.c index 6d2c48ba1923..8d4610246ba0 100644 --- a/io_uring/epoll.c +++ b/io_uring/epoll.c @@ -61,7 +61,7 @@ int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags) if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } int io_epoll_wait_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -88,5 +88,5 @@ int io_epoll_wait(struct io_kiocb *req, unsigned int issue_flags) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } diff --git a/io_uring/eventfd.c b/io_uring/eventfd.c index 100d5da94cb9..78f8ab7db104 100644 --- a/io_uring/eventfd.c +++ b/io_uring/eventfd.c @@ -47,13 +47,6 @@ static void io_eventfd_do_signal(struct rcu_head *rcu) io_eventfd_put(ev_fd); } -static void io_eventfd_release(struct io_ev_fd *ev_fd, bool put_ref) -{ - if (put_ref) - 
io_eventfd_put(ev_fd); - rcu_read_unlock(); -} - /* * Returns true if the caller should put the ev_fd reference, false if not. */ @@ -72,63 +65,34 @@ static bool __io_eventfd_signal(struct io_ev_fd *ev_fd) /* * Trigger if eventfd_async isn't set, or if it's set and the caller is - * an async worker. If ev_fd isn't valid, obviously return false. + * an async worker. */ static bool io_eventfd_trigger(struct io_ev_fd *ev_fd) { - if (ev_fd) - return !ev_fd->eventfd_async || io_wq_current_is_worker(); - return false; + return !ev_fd->eventfd_async || io_wq_current_is_worker(); } -/* - * On success, returns with an ev_fd reference grabbed and the RCU read - * lock held. - */ -static struct io_ev_fd *io_eventfd_grab(struct io_ring_ctx *ctx) +void io_eventfd_signal(struct io_ring_ctx *ctx, bool cqe_event) { + bool skip = false; struct io_ev_fd *ev_fd; if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED) - return NULL; - - rcu_read_lock(); + return; - /* - * rcu_dereference ctx->io_ev_fd once and use it for both for checking - * and eventfd_signal - */ + guard(rcu)(); ev_fd = rcu_dereference(ctx->io_ev_fd); - /* * Check again if ev_fd exists in case an io_eventfd_unregister call * completed between the NULL check of ctx->io_ev_fd at the start of * the function and rcu_read_lock. */ - if (io_eventfd_trigger(ev_fd) && refcount_inc_not_zero(&ev_fd->refs)) - return ev_fd; - - rcu_read_unlock(); - return NULL; -} - -void io_eventfd_signal(struct io_ring_ctx *ctx) -{ - struct io_ev_fd *ev_fd; - - ev_fd = io_eventfd_grab(ctx); - if (ev_fd) - io_eventfd_release(ev_fd, __io_eventfd_signal(ev_fd)); -} - -void io_eventfd_flush_signal(struct io_ring_ctx *ctx) -{ - struct io_ev_fd *ev_fd; - - ev_fd = io_eventfd_grab(ctx); - if (ev_fd) { - bool skip, put_ref = true; + if (!ev_fd) + return; + if (!io_eventfd_trigger(ev_fd) || !refcount_inc_not_zero(&ev_fd->refs)) + return; + if (cqe_event) { /* * Eventfd should only get triggered when at least one event * has been posted. 
Some applications rely on the eventfd @@ -142,12 +106,10 @@ void io_eventfd_flush_signal(struct io_ring_ctx *ctx) skip = ctx->cached_cq_tail == ev_fd->last_cq_tail; ev_fd->last_cq_tail = ctx->cached_cq_tail; spin_unlock(&ctx->completion_lock); - - if (!skip) - put_ref = __io_eventfd_signal(ev_fd); - - io_eventfd_release(ev_fd, put_ref); } + + if (skip || __io_eventfd_signal(ev_fd)) + io_eventfd_put(ev_fd); } int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, diff --git a/io_uring/eventfd.h b/io_uring/eventfd.h index d394f49c6321..e2f1985c2cf9 100644 --- a/io_uring/eventfd.h +++ b/io_uring/eventfd.h @@ -4,5 +4,4 @@ int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, unsigned int eventfd_async); int io_eventfd_unregister(struct io_ring_ctx *ctx); -void io_eventfd_flush_signal(struct io_ring_ctx *ctx); -void io_eventfd_signal(struct io_ring_ctx *ctx); +void io_eventfd_signal(struct io_ring_ctx *ctx, bool cqe_event); diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c index e0d6a59a89fa..9798d6fb4ec7 100644 --- a/io_uring/fdinfo.c +++ b/io_uring/fdinfo.c @@ -15,37 +15,6 @@ #include "cancel.h" #include "rsrc.h" -#ifdef CONFIG_PROC_FS -static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id, - const struct cred *cred) -{ - struct user_namespace *uns = seq_user_ns(m); - struct group_info *gi; - kernel_cap_t cap; - int g; - - seq_printf(m, "%5d\n", id); - seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid)); - seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid)); - seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid)); - seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid)); - seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid)); - seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid)); - seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid)); - seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid)); - seq_puts(m, "\n\tGroups:\t"); - gi = cred->group_info; - for (g = 0; g < gi->ngroups; g++) { - seq_put_decimal_ull(m, g ? " " : "", - from_kgid_munged(uns, gi->gid[g])); - } - seq_puts(m, "\n\tCapEff:\t"); - cap = cred->cap_effective; - seq_put_hex_ll(m, NULL, cap.val, 16); - seq_putc(m, '\n'); - return 0; -} - #ifdef CONFIG_NET_RX_BUSY_POLL static __cold void common_tracking_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m, @@ -172,18 +141,26 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) if (ctx->flags & IORING_SETUP_SQPOLL) { struct io_sq_data *sq = ctx->sq_data; + struct task_struct *tsk; + rcu_read_lock(); + tsk = rcu_dereference(sq->thread); /* * sq->thread might be NULL if we raced with the sqpoll * thread termination. 
*/ - if (sq->thread) { + if (tsk) { + get_task_struct(tsk); + rcu_read_unlock(); + getrusage(tsk, RUSAGE_SELF, &sq_usage); + put_task_struct(tsk); sq_pid = sq->task_pid; sq_cpu = sq->sq_cpu; - getrusage(sq->thread, RUSAGE_SELF, &sq_usage); sq_total_time = (sq_usage.ru_stime.tv_sec * 1000000 + sq_usage.ru_stime.tv_usec); sq_work_time = sq->work_time; + } else { + rcu_read_unlock(); } } @@ -214,14 +191,6 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) else seq_printf(m, "%5u: <none>\n", i); } - if (!xa_empty(&ctx->personalities)) { - unsigned long index; - const struct cred *cred; - - seq_printf(m, "Personalities:\n"); - xa_for_each(&ctx->personalities, index, cred) - io_uring_show_cred(m, index, cred); - } seq_puts(m, "PollList:\n"); for (i = 0; i < (1U << ctx->cancel_table.hash_bits); i++) { @@ -264,4 +233,3 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) mutex_unlock(&ctx->uring_lock); } } -#endif diff --git a/io_uring/fs.c b/io_uring/fs.c index eccea851dd5a..37079a414eab 100644 --- a/io_uring/fs.c +++ b/io_uring/fs.c @@ -90,7 +90,7 @@ int io_renameat(struct io_kiocb *req, unsigned int issue_flags) req->flags &= ~REQ_F_NEED_CLEANUP; io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } void io_renameat_cleanup(struct io_kiocb *req) @@ -141,7 +141,7 @@ int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags) req->flags &= ~REQ_F_NEED_CLEANUP; io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } void io_unlinkat_cleanup(struct io_kiocb *req) @@ -185,7 +185,7 @@ int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags) req->flags &= ~REQ_F_NEED_CLEANUP; io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } void io_mkdirat_cleanup(struct io_kiocb *req) @@ -235,7 +235,7 @@ int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags) req->flags &= ~REQ_F_NEED_CLEANUP; io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } int io_linkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -281,7 +281,7 @@ int io_linkat(struct io_kiocb *req, unsigned int issue_flags) req->flags &= ~REQ_F_NEED_CLEANUP; io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } void io_link_cleanup(struct io_kiocb *req) diff --git a/io_uring/futex.c b/io_uring/futex.c index 0ea4820cd8ff..692462d50c8c 100644 --- a/io_uring/futex.c +++ b/io_uring/futex.c @@ -14,10 +14,7 @@ struct io_futex { struct file *file; - union { - u32 __user *uaddr; - struct futex_waitv __user *uwaitv; - }; + void __user *uaddr; unsigned long futex_val; unsigned long futex_mask; unsigned long futexv_owned; @@ -148,6 +145,8 @@ int io_futex_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) !futex_validate_input(iof->futex_flags, iof->futex_mask)) return -EINVAL; + /* Mark as inflight, so file exit cancelation will find it */ + io_req_track_inflight(req); return 0; } @@ -186,13 +185,15 @@ int io_futexv_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (!futexv) return -ENOMEM; - ret = futex_parse_waitv(futexv, iof->uwaitv, iof->futex_nr, + ret = futex_parse_waitv(futexv, iof->uaddr, iof->futex_nr, io_futex_wakev_fn, req); if (ret) { kfree(futexv); return ret; } + /* Mark as inflight, so file exit cancelation will find it */ + io_req_track_inflight(req); iof->futexv_owned = 0; iof->futexv_unqueued = 0; req->flags |= REQ_F_ASYNC_DATA; @@ -234,7 +235,7 @@ int io_futexv_wait(struct io_kiocb *req, unsigned int issue_flags) kfree(futexv); req->async_data = NULL; 
req->flags &= ~REQ_F_ASYNC_DATA; - return IOU_OK; + return IOU_COMPLETE; } /* @@ -273,7 +274,6 @@ int io_futex_wait(struct io_kiocb *req, unsigned int issue_flags) struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex); struct io_ring_ctx *ctx = req->ctx; struct io_futex_data *ifd = NULL; - struct futex_hash_bucket *hb; int ret; if (!iof->futex_mask) { @@ -295,12 +295,11 @@ int io_futex_wait(struct io_kiocb *req, unsigned int issue_flags) ifd->req = req; ret = futex_wait_setup(iof->uaddr, iof->futex_val, iof->futex_flags, - &ifd->q, &hb); + &ifd->q, NULL, NULL); if (!ret) { hlist_add_head(&req->hash_node, &ctx->futex_list); io_ring_submit_unlock(ctx, issue_flags); - futex_queue(&ifd->q, hb, NULL); return IOU_ISSUE_SKIP_COMPLETE; } @@ -311,7 +310,7 @@ done: req_set_fail(req); io_req_set_res(req, ret, 0); kfree(ifd); - return IOU_OK; + return IOU_COMPLETE; } int io_futex_wake(struct io_kiocb *req, unsigned int issue_flags) @@ -328,5 +327,5 @@ int io_futex_wake(struct io_kiocb *req, unsigned int issue_flags) if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index 04a75d666195..17dfaa0395c4 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -114,9 +114,6 @@ enum { struct io_wq { unsigned long state; - free_work_fn *free_work; - io_wq_work_fn *do_work; - struct io_wq_hash *hash; atomic_t worker_refs; @@ -153,6 +150,16 @@ static bool io_acct_cancel_pending_work(struct io_wq *wq, static void create_worker_cb(struct callback_head *cb); static void io_wq_cancel_tw_create(struct io_wq *wq); +static inline unsigned int __io_get_work_hash(unsigned int work_flags) +{ + return work_flags >> IO_WQ_HASH_SHIFT; +} + +static inline unsigned int io_get_work_hash(struct io_wq_work *work) +{ + return __io_get_work_hash(atomic_read(&work->flags)); +} + static bool io_worker_get(struct io_worker *worker) { return refcount_inc_not_zero(&worker->ref); @@ -350,6 +357,13 @@ static void create_worker_cb(struct callback_head *cb) worker = container_of(cb, struct io_worker, create_work); wq = worker->wq; acct = worker->acct; + + rcu_read_lock(); + do_create = !io_acct_activate_free_worker(acct); + rcu_read_unlock(); + if (!do_create) + goto no_need_create; + raw_spin_lock(&acct->workers_lock); if (acct->nr_workers < acct->max_workers) { @@ -360,6 +374,7 @@ static void create_worker_cb(struct callback_head *cb) if (do_create) { create_io_worker(wq, acct); } else { +no_need_create: atomic_dec(&acct->nr_running); io_worker_ref_put(wq); } @@ -412,6 +427,30 @@ fail: return false; } +/* Defer if current and next work are both hashed to the same chain */ +static bool io_wq_hash_defer(struct io_wq_work *work, struct io_wq_acct *acct) +{ + unsigned int hash, work_flags; + struct io_wq_work *next; + + lockdep_assert_held(&acct->lock); + + work_flags = atomic_read(&work->flags); + if (!__io_wq_is_hashed(work_flags)) + return false; + + /* should not happen, io_acct_run_queue() said we had work */ + if (wq_list_empty(&acct->work_list)) + return true; + + hash = __io_get_work_hash(work_flags); + next = container_of(acct->work_list.first, struct io_wq_work, list); + work_flags = atomic_read(&next->flags); + if (!__io_wq_is_hashed(work_flags)) + return false; + return hash == __io_get_work_hash(work_flags); +} + static void io_wq_dec_running(struct io_worker *worker) { struct io_wq_acct *acct = io_wq_get_acct(worker); @@ -422,8 +461,14 @@ static void io_wq_dec_running(struct io_worker *worker) if 
(!atomic_dec_and_test(&acct->nr_running)) return; + if (!worker->cur_work) + return; if (!io_acct_run_queue(acct)) return; + if (io_wq_hash_defer(worker->cur_work, acct)) { + raw_spin_unlock(&acct->lock); + return; + } raw_spin_unlock(&acct->lock); atomic_inc(&acct->nr_running); @@ -457,16 +502,6 @@ static void __io_worker_idle(struct io_wq_acct *acct, struct io_worker *worker) } } -static inline unsigned int __io_get_work_hash(unsigned int work_flags) -{ - return work_flags >> IO_WQ_HASH_SHIFT; -} - -static inline unsigned int io_get_work_hash(struct io_wq_work *work) -{ - return __io_get_work_hash(atomic_read(&work->flags)); -} - static bool io_wait_on_hash(struct io_wq *wq, unsigned int hash) { bool ret = false; @@ -612,10 +647,10 @@ static void io_worker_handle_work(struct io_wq_acct *acct, if (do_kill && (work_flags & IO_WQ_WORK_UNBOUND)) atomic_or(IO_WQ_WORK_CANCEL, &work->flags); - wq->do_work(work); + io_wq_submit_work(work); io_assign_current_work(worker, NULL); - linked = wq->free_work(work); + linked = io_wq_free_work(work); work = next_hashed; if (!work && linked && !io_wq_is_hashed(linked)) { work = linked; @@ -934,8 +969,8 @@ static void io_run_cancel(struct io_wq_work *work, struct io_wq *wq) { do { atomic_or(IO_WQ_WORK_CANCEL, &work->flags); - wq->do_work(work); - work = wq->free_work(work); + io_wq_submit_work(work); + work = io_wq_free_work(work); } while (work); } @@ -1195,8 +1230,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) int ret, i; struct io_wq *wq; - if (WARN_ON_ONCE(!data->free_work || !data->do_work)) - return ERR_PTR(-EINVAL); if (WARN_ON_ONCE(!bounded)) return ERR_PTR(-EINVAL); @@ -1206,8 +1239,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) refcount_inc(&data->hash->refs); wq->hash = data->hash; - wq->free_work = data->free_work; - wq->do_work = data->do_work; ret = -ENOMEM; @@ -1236,8 +1267,10 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) atomic_set(&wq->worker_refs, 1); init_completion(&wq->worker_done); ret = cpuhp_state_add_instance_nocalls(io_wq_online, &wq->cpuhp_node); - if (ret) + if (ret) { + put_task_struct(wq->task); goto err; + } return wq; err: diff --git a/io_uring/io-wq.h b/io_uring/io-wq.h index d4fb2940e435..774abab54732 100644 --- a/io_uring/io-wq.h +++ b/io_uring/io-wq.h @@ -21,9 +21,6 @@ enum io_wq_cancel { IO_WQ_CANCEL_NOTFOUND, /* work not found */ }; -typedef struct io_wq_work *(free_work_fn)(struct io_wq_work *); -typedef void (io_wq_work_fn)(struct io_wq_work *); - struct io_wq_hash { refcount_t refs; unsigned long map; @@ -39,8 +36,6 @@ static inline void io_wq_put_hash(struct io_wq_hash *hash) struct io_wq_data { struct io_wq_hash *hash; struct task_struct *task; - io_wq_work_fn *do_work; - free_work_fn *free_work; }; struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data); diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index edda31a15c6e..4ef69dd58734 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -114,11 +114,11 @@ #define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK) #define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \ - REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \ - REQ_F_ASYNC_DATA) + REQ_F_INFLIGHT | REQ_F_CREDS | REQ_F_ASYNC_DATA) #define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | IO_REQ_LINK_FLAGS | \ - REQ_F_REISSUE | IO_REQ_CLEAN_FLAGS) + REQ_F_REISSUE | REQ_F_POLLED | \ + IO_REQ_CLEAN_FLAGS) #define IO_TCTX_REFS_CACHE_NR (1U << 10) @@ -129,7 +129,6 @@ struct io_defer_entry { 
struct list_head list; struct io_kiocb *req; - u32 seq; }; /* requests with any of those set should undergo io_disarm_next() */ @@ -148,7 +147,8 @@ static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, bool cancel_all, bool is_sqpoll_thread); -static void io_queue_sqe(struct io_kiocb *req); +static void io_queue_sqe(struct io_kiocb *req, unsigned int extra_flags); +static void __io_req_caches_free(struct io_ring_ctx *ctx); static __read_mostly DEFINE_STATIC_KEY_FALSE(io_key_has_sqarray); @@ -359,6 +359,7 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->tctx_list); ctx->submit_state.free_list.next = NULL; INIT_HLIST_HEAD(&ctx->waitid_list); + xa_init_flags(&ctx->zcrx_ctxs, XA_FLAGS_ALLOC); #ifdef CONFIG_FUTEX INIT_HLIST_HEAD(&ctx->futex_list); #endif @@ -380,25 +381,6 @@ err: return NULL; } -static void io_account_cq_overflow(struct io_ring_ctx *ctx) -{ - struct io_rings *r = ctx->rings; - - WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1); - ctx->cq_extra--; -} - -static bool req_need_defer(struct io_kiocb *req, u32 seq) -{ - if (unlikely(req->flags & REQ_F_IO_DRAIN)) { - struct io_ring_ctx *ctx = req->ctx; - - return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail; - } - - return false; -} - static void io_clean_op(struct io_kiocb *req) { if (unlikely(req->flags & REQ_F_BUFFER_SELECTED)) @@ -410,11 +392,6 @@ static void io_clean_op(struct io_kiocb *req) if (def->cleanup) def->cleanup(req); } - if ((req->flags & REQ_F_POLLED) && req->apoll) { - kfree(req->apoll->double_poll); - kfree(req->apoll); - req->apoll = NULL; - } if (req->flags & REQ_F_INFLIGHT) atomic_dec(&req->tctx->inflight_tracked); if (req->flags & REQ_F_CREDS) @@ -426,7 +403,12 @@ static void io_clean_op(struct io_kiocb *req) req->flags &= ~IO_REQ_CLEAN_FLAGS; } -static inline void io_req_track_inflight(struct io_kiocb *req) +/* + * Mark the request as inflight, so that file cancelation will find it. + * Can be used if the file is an io_uring instance, or if the request itself + * relies on ->mm being alive for the duration of the request. 
+ */ +inline void io_req_track_inflight(struct io_kiocb *req) { if (!(req->flags & REQ_F_INFLIGHT)) { req->flags |= REQ_F_INFLIGHT; @@ -537,20 +519,37 @@ void io_req_queue_iowq(struct io_kiocb *req) io_req_task_work_add(req); } +static unsigned io_linked_nr(struct io_kiocb *req) +{ + struct io_kiocb *tmp; + unsigned nr = 0; + + io_for_each_link(tmp, req) + nr++; + return nr; +} + static __cold noinline void io_queue_deferred(struct io_ring_ctx *ctx) { - spin_lock(&ctx->completion_lock); + bool drain_seen = false, first = true; + + lockdep_assert_held(&ctx->uring_lock); + __io_req_caches_free(ctx); + while (!list_empty(&ctx->defer_list)) { struct io_defer_entry *de = list_first_entry(&ctx->defer_list, struct io_defer_entry, list); - if (req_need_defer(de->req, de->seq)) - break; + drain_seen |= de->req->flags & REQ_F_IO_DRAIN; + if ((drain_seen || first) && ctx->nr_req_allocated != ctx->nr_drained) + return; + list_del_init(&de->list); + ctx->nr_drained -= io_linked_nr(de->req); io_req_task_queue(de->req); kfree(de); + first = false; } - spin_unlock(&ctx->completion_lock); } void __io_commit_cqring_flush(struct io_ring_ctx *ctx) @@ -559,10 +558,8 @@ void __io_commit_cqring_flush(struct io_ring_ctx *ctx) io_poll_wq_wake(ctx); if (ctx->off_timeout_used) io_flush_timeouts(ctx); - if (ctx->drain_active) - io_queue_deferred(ctx); if (ctx->has_evfd) - io_eventfd_flush_signal(ctx); + io_eventfd_signal(ctx, true); } static inline void __io_cq_lock(struct io_ring_ctx *ctx) @@ -701,27 +698,20 @@ static __cold void io_uring_drop_tctx_refs(struct task_struct *task) } } -static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, - s32 res, u32 cflags, u64 extra1, u64 extra2) +static __cold bool io_cqring_add_overflow(struct io_ring_ctx *ctx, + struct io_overflow_cqe *ocqe) { - struct io_overflow_cqe *ocqe; - size_t ocq_size = sizeof(struct io_overflow_cqe); - bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32); - lockdep_assert_held(&ctx->completion_lock); - if (is_cqe32) - ocq_size += sizeof(struct io_uring_cqe); - - ocqe = kmalloc(ocq_size, GFP_ATOMIC | __GFP_ACCOUNT); - trace_io_uring_cqe_overflow(ctx, user_data, res, cflags, ocqe); if (!ocqe) { + struct io_rings *r = ctx->rings; + /* * If we're in ring overflow flush mode, or in task cancel mode, * or cannot allocate an overflow entry, then we need to drop it * on the floor. 
*/ - io_account_cq_overflow(ctx); + WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1); set_bit(IO_CHECK_CQ_DROPPED_BIT, &ctx->check_cq); return false; } @@ -730,23 +720,35 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, atomic_or(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags); } - ocqe->cqe.user_data = user_data; - ocqe->cqe.res = res; - ocqe->cqe.flags = cflags; - if (is_cqe32) { - ocqe->cqe.big_cqe[0] = extra1; - ocqe->cqe.big_cqe[1] = extra2; - } list_add_tail(&ocqe->list, &ctx->cq_overflow_list); return true; } -static void io_req_cqe_overflow(struct io_kiocb *req) +static struct io_overflow_cqe *io_alloc_ocqe(struct io_ring_ctx *ctx, + struct io_cqe *cqe, + struct io_big_cqe *big_cqe, gfp_t gfp) { - io_cqring_event_overflow(req->ctx, req->cqe.user_data, - req->cqe.res, req->cqe.flags, - req->big_cqe.extra1, req->big_cqe.extra2); - memset(&req->big_cqe, 0, sizeof(req->big_cqe)); + struct io_overflow_cqe *ocqe; + size_t ocq_size = sizeof(struct io_overflow_cqe); + bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32); + + if (is_cqe32) + ocq_size += sizeof(struct io_uring_cqe); + + ocqe = kzalloc(ocq_size, gfp | __GFP_ACCOUNT); + trace_io_uring_cqe_overflow(ctx, cqe->user_data, cqe->res, cqe->flags, ocqe); + if (ocqe) { + ocqe->cqe.user_data = cqe->user_data; + ocqe->cqe.res = cqe->res; + ocqe->cqe.flags = cqe->flags; + if (is_cqe32 && big_cqe) { + ocqe->cqe.big_cqe[0] = big_cqe->extra1; + ocqe->cqe.big_cqe[1] = big_cqe->extra2; + } + } + if (big_cqe) + big_cqe->extra1 = big_cqe->extra2 = 0; + return ocqe; } /* @@ -786,18 +788,26 @@ bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow) return true; } +static bool io_fill_cqe_aux32(struct io_ring_ctx *ctx, + struct io_uring_cqe src_cqe[2]) +{ + struct io_uring_cqe *cqe; + + if (WARN_ON_ONCE(!(ctx->flags & IORING_SETUP_CQE32))) + return false; + if (unlikely(!io_get_cqe(ctx, &cqe))) + return false; + + memcpy(cqe, src_cqe, 2 * sizeof(*cqe)); + trace_io_uring_complete(ctx, NULL, cqe); + return true; +} + static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) { struct io_uring_cqe *cqe; - ctx->cq_extra++; - - /* - * If we can't get a cq entry, userspace overflowed the - * submission (by quite a lot). Increment the overflow count in - * the ring. 
- */ if (likely(io_get_cqe(ctx, &cqe))) { WRITE_ONCE(cqe->user_data, user_data); WRITE_ONCE(cqe->res, res); @@ -814,14 +824,43 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, return false; } +static inline struct io_cqe io_init_cqe(u64 user_data, s32 res, u32 cflags) +{ + return (struct io_cqe) { .user_data = user_data, .res = res, .flags = cflags }; +} + +static __cold void io_cqe_overflow(struct io_ring_ctx *ctx, struct io_cqe *cqe, + struct io_big_cqe *big_cqe) +{ + struct io_overflow_cqe *ocqe; + + ocqe = io_alloc_ocqe(ctx, cqe, big_cqe, GFP_KERNEL); + spin_lock(&ctx->completion_lock); + io_cqring_add_overflow(ctx, ocqe); + spin_unlock(&ctx->completion_lock); +} + +static __cold bool io_cqe_overflow_locked(struct io_ring_ctx *ctx, + struct io_cqe *cqe, + struct io_big_cqe *big_cqe) +{ + struct io_overflow_cqe *ocqe; + + ocqe = io_alloc_ocqe(ctx, cqe, big_cqe, GFP_ATOMIC); + return io_cqring_add_overflow(ctx, ocqe); +} + bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) { bool filled; io_cq_lock(ctx); filled = io_fill_cqe_aux(ctx, user_data, res, cflags); - if (!filled) - filled = io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0); + if (unlikely(!filled)) { + struct io_cqe cqe = io_init_cqe(user_data, res, cflags); + + filled = io_cqe_overflow_locked(ctx, &cqe, NULL); + } io_cq_unlock_post(ctx); return filled; } @@ -832,10 +871,13 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags */ void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) { + lockdep_assert_held(&ctx->uring_lock); + lockdep_assert(ctx->lockless_cq); + if (!io_fill_cqe_aux(ctx, user_data, res, cflags)) { - spin_lock(&ctx->completion_lock); - io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0); - spin_unlock(&ctx->completion_lock); + struct io_cqe cqe = io_init_cqe(user_data, res, cflags); + + io_cqe_overflow(ctx, &cqe, NULL); } ctx->submit_state.cq_flush = true; } @@ -872,6 +914,31 @@ bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags) return posted; } +/* + * A helper for multishot requests posting additional CQEs. + * Should only be used from a task_work including IO_URING_F_MULTISHOT. + */ +bool io_req_post_cqe32(struct io_kiocb *req, struct io_uring_cqe cqe[2]) +{ + struct io_ring_ctx *ctx = req->ctx; + bool posted; + + lockdep_assert(!io_wq_current_is_worker()); + lockdep_assert_held(&ctx->uring_lock); + + cqe[0].user_data = req->cqe.user_data; + if (!ctx->lockless_cq) { + spin_lock(&ctx->completion_lock); + posted = io_fill_cqe_aux32(ctx, cqe); + spin_unlock(&ctx->completion_lock); + } else { + posted = io_fill_cqe_aux32(ctx, cqe); + } + + ctx->submit_state.cq_flush = true; + return posted; +} + static void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) { struct io_ring_ctx *ctx = req->ctx; @@ -925,22 +992,6 @@ void io_req_defer_failed(struct io_kiocb *req, s32 res) } /* - * Don't initialise the fields below on every allocation, but do that in - * advance and keep them valid across allocations. 
- */ -static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx) -{ - req->ctx = ctx; - req->buf_node = NULL; - req->file_node = NULL; - req->link = NULL; - req->async_data = NULL; - /* not necessary, but safer to zero */ - memset(&req->cqe, 0, sizeof(req->cqe)); - memset(&req->big_cqe, 0, sizeof(req->big_cqe)); -} - -/* * A request might get retired back into the request caches even before opcode * handlers and io_issue_sqe() are done with it, e.g. inline completion path. * Because of that, io_alloc_req() should be called only under ->uring_lock @@ -949,7 +1000,7 @@ static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx) __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx) __must_hold(&ctx->uring_lock) { - gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; + gfp_t gfp = GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO; void *reqs[IO_REQ_ALLOC_BATCH]; int ret; @@ -967,10 +1018,11 @@ __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx) } percpu_ref_get_many(&ctx->refs, ret); + ctx->nr_req_allocated += ret; + while (ret--) { struct io_kiocb *req = reqs[ret]; - io_preinit_req(req, ctx); io_req_add_to_cache(req, ctx); } return true; @@ -1192,7 +1244,7 @@ static void io_req_local_work_add(struct io_kiocb *req, unsigned flags) if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); if (ctx->has_evfd) - io_eventfd_signal(ctx); + io_eventfd_signal(ctx, false); } nr_wait = atomic_read(&ctx->cq_wait_nr); @@ -1360,7 +1412,7 @@ void io_req_task_submit(struct io_kiocb *req, io_tw_token_t tw) else if (req->flags & REQ_F_FORCE_ASYNC) io_queue_iowq(req); else - io_queue_sqe(req); + io_queue_sqe(req, 0); } void io_req_task_queue_fail(struct io_kiocb *req, int ret) @@ -1384,6 +1436,16 @@ void io_queue_next(struct io_kiocb *req) io_req_task_queue(nxt); } +static inline void io_req_put_rsrc_nodes(struct io_kiocb *req) +{ + if (req->file_node) { + io_put_rsrc_node(req->ctx, req->file_node); + req->file_node = NULL; + } + if (req->flags & REQ_F_BUF_NODE) + io_put_rsrc_node(req->ctx, req->buf_node); +} + static void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node) __must_hold(&ctx->uring_lock) @@ -1444,13 +1506,10 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx) */ if (!(req->flags & (REQ_F_CQE_SKIP | REQ_F_REISSUE)) && unlikely(!io_fill_cqe_req(ctx, req))) { - if (ctx->lockless_cq) { - spin_lock(&ctx->completion_lock); - io_req_cqe_overflow(req); - spin_unlock(&ctx->completion_lock); - } else { - io_req_cqe_overflow(req); - } + if (ctx->lockless_cq) + io_cqe_overflow(ctx, &req->cqe, &req->big_cqe); + else + io_cqe_overflow_locked(ctx, &req->cqe, &req->big_cqe); } } __io_cq_unlock_post(ctx); @@ -1459,6 +1518,10 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx) io_free_batch_list(ctx, state->compl_reqs.first); INIT_WQ_LIST(&state->compl_reqs); } + + if (unlikely(ctx->drain_active)) + io_queue_deferred(ctx); + ctx->submit_state.cq_flush = false; } @@ -1495,6 +1558,9 @@ static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx) } } mutex_unlock(&ctx->uring_lock); + + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) + io_move_task_work_from_local(ctx); } static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned int min_events) @@ -1646,56 +1712,28 @@ io_req_flags_t io_file_get_flags(struct file *file) return res; } -static u32 io_get_sequence(struct io_kiocb *req) -{ - u32 seq = req->ctx->cached_sq_head; - struct io_kiocb *cur; - - /* need original cached_sq_head, but it was increased for 
each req */ - io_for_each_link(cur, req) - seq--; - return seq; -} - static __cold void io_drain_req(struct io_kiocb *req) __must_hold(&ctx->uring_lock) { struct io_ring_ctx *ctx = req->ctx; + bool drain = req->flags & IOSQE_IO_DRAIN; struct io_defer_entry *de; - int ret; - u32 seq = io_get_sequence(req); - - /* Still need defer if there is pending req in defer list. */ - spin_lock(&ctx->completion_lock); - if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) { - spin_unlock(&ctx->completion_lock); -queue: - ctx->drain_active = false; - io_req_task_queue(req); - return; - } - spin_unlock(&ctx->completion_lock); - io_prep_async_link(req); - de = kmalloc(sizeof(*de), GFP_KERNEL); + de = kmalloc(sizeof(*de), GFP_KERNEL_ACCOUNT); if (!de) { - ret = -ENOMEM; - io_req_defer_failed(req, ret); + io_req_defer_failed(req, -ENOMEM); return; } - spin_lock(&ctx->completion_lock); - if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) { - spin_unlock(&ctx->completion_lock); - kfree(de); - goto queue; - } - + io_prep_async_link(req); trace_io_uring_defer(req); de->req = req; - de->seq = seq; + + ctx->nr_drained += io_linked_nr(req); list_add_tail(&de->list, &ctx->defer_list); - spin_unlock(&ctx->completion_lock); + io_queue_deferred(ctx); + if (!drain && list_empty(&ctx->defer_list)) + ctx->drain_active = false; } static bool io_assign_file(struct io_kiocb *req, const struct io_issue_def *def, @@ -1757,7 +1795,7 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) ret = __io_issue_sqe(req, issue_flags, def); - if (ret == IOU_OK) { + if (ret == IOU_COMPLETE) { if (issue_flags & IO_URING_F_COMPLETE_DEFER) io_req_complete_defer(req); else @@ -1816,7 +1854,7 @@ void io_wq_submit_work(struct io_wq_work *work) bool needs_poll = false; int ret = 0, err = -ECANCELED; - /* one will be dropped by ->io_wq_free_work() after returning to io-wq */ + /* one will be dropped by io_wq_free_work() after returning to io-wq */ if (!(req->flags & REQ_F_REFCOUNT)) __io_req_set_refcount(req, 2); else @@ -1914,7 +1952,8 @@ inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd, io_ring_submit_lock(ctx, issue_flags); node = io_rsrc_node_lookup(&ctx->file_table.data, fd); if (node) { - io_req_assign_rsrc_node(&req->file_node, node); + node->refs++; + req->file_node = node; req->flags |= io_slot_flags(node); file = io_slot_file(node); } @@ -1934,14 +1973,34 @@ struct file *io_file_get_normal(struct io_kiocb *req, int fd) return file; } -static void io_queue_async(struct io_kiocb *req, int ret) +static int io_req_sqe_copy(struct io_kiocb *req, unsigned int issue_flags) +{ + const struct io_cold_def *def = &io_cold_defs[req->opcode]; + + if (req->flags & REQ_F_SQE_COPIED) + return 0; + req->flags |= REQ_F_SQE_COPIED; + if (!def->sqe_copy) + return 0; + if (WARN_ON_ONCE(!(issue_flags & IO_URING_F_INLINE))) + return -EFAULT; + def->sqe_copy(req); + return 0; +} + +static void io_queue_async(struct io_kiocb *req, unsigned int issue_flags, int ret) __must_hold(&req->ctx->uring_lock) { if (ret != -EAGAIN || (req->flags & REQ_F_NOWAIT)) { +fail: io_req_defer_failed(req, ret); return; } + ret = io_req_sqe_copy(req, issue_flags); + if (unlikely(ret)) + goto fail; + switch (io_arm_poll_handler(req, 0)) { case IO_APOLL_READY: io_kbuf_recycle(req, 0); @@ -1956,19 +2015,21 @@ static void io_queue_async(struct io_kiocb *req, int ret) } } -static inline void io_queue_sqe(struct io_kiocb *req) +static inline void io_queue_sqe(struct io_kiocb *req, unsigned int extra_flags) 
__must_hold(&req->ctx->uring_lock) { + unsigned int issue_flags = IO_URING_F_NONBLOCK | + IO_URING_F_COMPLETE_DEFER | extra_flags; int ret; - ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER); + ret = io_issue_sqe(req, issue_flags); /* * We async punt it if the file wasn't marked NOWAIT, or if the file * doesn't support non-blocking read/write attempts */ if (unlikely(ret)) - io_queue_async(req, ret); + io_queue_async(req, issue_flags, ret); } static void io_queue_sqe_fallback(struct io_kiocb *req) @@ -1983,6 +2044,8 @@ static void io_queue_sqe_fallback(struct io_kiocb *req) req->flags |= REQ_F_LINK; io_req_defer_failed(req, req->cqe.res); } else { + /* can't fail with IO_URING_F_INLINE */ + io_req_sqe_copy(req, IO_URING_F_INLINE); if (unlikely(req->ctx->drain_active)) io_drain_req(req); else @@ -2047,7 +2110,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, int personality; u8 opcode; - /* req is partially pre-initialised, see io_preinit_req() */ + req->ctx = ctx; req->opcode = opcode = READ_ONCE(sqe->opcode); /* same numerical values with corresponding REQ_F_*, safe to copy */ sqe_flags = READ_ONCE(sqe->flags); @@ -2194,6 +2257,7 @@ static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, */ if (unlikely(link->head)) { trace_io_uring_link(req, link->last); + io_req_sqe_copy(req, IO_URING_F_INLINE); link->last->link = req; link->last = req; @@ -2217,7 +2281,7 @@ fallback: return 0; } - io_queue_sqe(req); + io_queue_sqe(req, IO_URING_F_INLINE); return 0; } @@ -2278,10 +2342,6 @@ static bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe) (!(ctx->flags & IORING_SETUP_NO_SQARRAY))) { head = READ_ONCE(ctx->sq_array[head]); if (unlikely(head >= ctx->sq_entries)) { - /* drop invalid entries */ - spin_lock(&ctx->completion_lock); - ctx->cq_extra--; - spin_unlock(&ctx->completion_lock); WRITE_ONCE(ctx->rings->sq_dropped, READ_ONCE(ctx->rings->sq_dropped) + 1); return false; @@ -2699,21 +2759,26 @@ unsigned long rings_size(unsigned int flags, unsigned int sq_entries, return off; } -static void io_req_caches_free(struct io_ring_ctx *ctx) +static __cold void __io_req_caches_free(struct io_ring_ctx *ctx) { struct io_kiocb *req; int nr = 0; - mutex_lock(&ctx->uring_lock); - while (!io_req_cache_empty(ctx)) { req = io_extract_req(ctx); kmem_cache_free(req_cachep, req); nr++; } - if (nr) + if (nr) { + ctx->nr_req_allocated -= nr; percpu_ref_put_many(&ctx->refs, nr); - mutex_unlock(&ctx->uring_lock); + } +} + +static __cold void io_req_caches_free(struct io_ring_ctx *ctx) +{ + guard(mutex)(&ctx->uring_lock); + __io_req_caches_free(ctx); } static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) @@ -2749,6 +2814,9 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) percpu_ref_exit(&ctx->refs); free_uid(ctx->user); io_req_caches_free(ctx); + + WARN_ON_ONCE(ctx->nr_req_allocated); + if (ctx->hash_map) io_wq_put_hash(ctx->hash_map); io_napi_free(ctx); @@ -2883,7 +2951,7 @@ static __cold void io_ring_exit_work(struct work_struct *work) io_cqring_overflow_kill(ctx); mutex_unlock(&ctx->uring_lock); } - if (ctx->ifq) { + if (!xa_empty(&ctx->zcrx_ctxs)) { mutex_lock(&ctx->uring_lock); io_shutdown_zcrx_ifqs(ctx); mutex_unlock(&ctx->uring_lock); @@ -2901,7 +2969,7 @@ static __cold void io_ring_exit_work(struct work_struct *work) struct task_struct *tsk; io_sq_thread_park(sqd); - tsk = sqd->thread; + tsk = sqpoll_task_locked(sqd); if (tsk && tsk->io_uring && tsk->io_uring->io_wq) 
io_wq_cancel_cb(tsk->io_uring->io_wq, io_cancel_ctx_cb, ctx, true); @@ -3015,20 +3083,19 @@ static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx, struct io_defer_entry *de; LIST_HEAD(list); - spin_lock(&ctx->completion_lock); list_for_each_entry_reverse(de, &ctx->defer_list, list) { if (io_match_task_safe(de->req, tctx, cancel_all)) { list_cut_position(&list, &ctx->defer_list, &de->list); break; } } - spin_unlock(&ctx->completion_lock); if (list_empty(&list)) return false; while (!list_empty(&list)) { de = list_first_entry(&list, struct io_defer_entry, list); list_del_init(&de->list); + ctx->nr_drained -= io_linked_nr(de->req); io_req_task_queue_fail(de->req, -ECANCELED); kfree(de); } @@ -3103,8 +3170,8 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) && io_allowed_defer_tw_run(ctx)) ret |= io_run_local_work(ctx, INT_MAX, INT_MAX) > 0; - ret |= io_cancel_defer_files(ctx, tctx, cancel_all); mutex_lock(&ctx->uring_lock); + ret |= io_cancel_defer_files(ctx, tctx, cancel_all); ret |= io_poll_remove_all(ctx, tctx, cancel_all); ret |= io_waitid_remove_all(ctx, tctx, cancel_all); ret |= io_futex_remove_all(ctx, tctx, cancel_all); @@ -3138,7 +3205,7 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) s64 inflight; DEFINE_WAIT(wait); - WARN_ON_ONCE(sqd && sqd->thread != current); + WARN_ON_ONCE(sqd && sqpoll_task_locked(sqd) != current); if (!current->io_uring) return; @@ -3914,6 +3981,8 @@ static int __init io_uring_init(void) BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in); BUILD_BUG_SQE_ELEM(44, __u32, file_index); BUILD_BUG_SQE_ELEM(44, __u16, addr_len); + BUILD_BUG_SQE_ELEM(44, __u8, write_stream); + BUILD_BUG_SQE_ELEM(45, __u8, __pad4[0]); BUILD_BUG_SQE_ELEM(46, __u16, __pad3[0]); BUILD_BUG_SQE_ELEM(48, __u64, addr3); BUILD_BUG_SQE_ELEM_SIZE(48, 0, cmd); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index e4050b2d0821..abc6de227f74 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -19,7 +19,6 @@ #endif enum { - IOU_OK = 0, /* deprecated, use IOU_COMPLETE */ IOU_COMPLETE = 0, IOU_ISSUE_SKIP_COMPLETE = -EIOCBQUEUED, @@ -82,8 +81,10 @@ void io_req_defer_failed(struct io_kiocb *req, s32 res); bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags); bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags); +bool io_req_post_cqe32(struct io_kiocb *req, struct io_uring_cqe src_cqe[2]); void __io_commit_cqring_flush(struct io_ring_ctx *ctx); +void io_req_track_inflight(struct io_kiocb *req); struct file *io_file_get_normal(struct io_kiocb *req, int fd); struct file *io_file_get_fixed(struct io_kiocb *req, int fd, unsigned issue_flags); @@ -98,8 +99,6 @@ struct llist_node *io_handle_tw_list(struct llist_node *node, unsigned int *coun struct llist_node *tctx_task_work_run(struct io_uring_task *tctx, unsigned int max_entries, unsigned int *count); void tctx_task_work(struct callback_head *cb); __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); -int io_uring_alloc_task_context(struct task_struct *task, - struct io_ring_ctx *ctx); int io_ring_add_registered_file(struct io_uring_task *tctx, struct file *file, int start, int end); @@ -196,7 +195,6 @@ static inline bool io_defer_get_uncommited_cqe(struct io_ring_ctx *ctx, { io_lockdep_assert_cq_locked(ctx); - ctx->cq_extra++; ctx->submit_state.cq_flush = true; return io_get_cqe(ctx, 
cqe_ret); } @@ -296,11 +294,22 @@ static inline void io_commit_cqring(struct io_ring_ctx *ctx) smp_store_release(&ctx->rings->cq.tail, ctx->cached_cq_tail); } +static inline void __io_wq_wake(struct wait_queue_head *wq) +{ + /* + * + * Pass in EPOLLIN|EPOLL_URING_WAKE as the poll wakeup key. The latter + * set in the mask so that if we recurse back into our own poll + * waitqueue handlers, we know we have a dependency between eventfd or + * epoll and should terminate multishot poll at that point. + */ + if (wq_has_sleeper(wq)) + __wake_up(wq, TASK_NORMAL, 0, poll_to_key(EPOLL_URING_WAKE | EPOLLIN)); +} + static inline void io_poll_wq_wake(struct io_ring_ctx *ctx) { - if (wq_has_sleeper(&ctx->poll_wq)) - __wake_up(&ctx->poll_wq, TASK_NORMAL, 0, - poll_to_key(EPOLL_URING_WAKE | EPOLLIN)); + __io_wq_wake(&ctx->poll_wq); } static inline void io_cqring_wake(struct io_ring_ctx *ctx) @@ -309,15 +318,9 @@ static inline void io_cqring_wake(struct io_ring_ctx *ctx) * Trigger waitqueue handler on all waiters on our waitqueue. This * won't necessarily wake up all the tasks, io_should_wake() will make * that decision. - * - * Pass in EPOLLIN|EPOLL_URING_WAKE as the poll wakeup key. The latter - * set in the mask so that if we recurse back into our own poll - * waitqueue handlers, we know we have a dependency between eventfd or - * epoll and should terminate multishot poll at that point. */ - if (wq_has_sleeper(&ctx->cq_wait)) - __wake_up(&ctx->cq_wait, TASK_NORMAL, 0, - poll_to_key(EPOLL_URING_WAKE | EPOLLIN)); + + __io_wq_wake(&ctx->cq_wait); } static inline bool io_sqring_full(struct io_ring_ctx *ctx) @@ -414,7 +417,7 @@ static inline void io_req_complete_defer(struct io_kiocb *req) static inline void io_commit_cqring_flush(struct io_ring_ctx *ctx) { - if (unlikely(ctx->off_timeout_used || ctx->drain_active || + if (unlikely(ctx->off_timeout_used || ctx->has_evfd || ctx->poll_activated)) __io_commit_cqring_flush(ctx); } diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 953d5e742569..f2d2cc319faa 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -92,7 +92,6 @@ void io_kbuf_drop_legacy(struct io_kiocb *req) { if (WARN_ON_ONCE(!(req->flags & REQ_F_BUFFER_SELECTED))) return; - req->buf_index = req->kbuf->bgid; req->flags &= ~REQ_F_BUFFER_SELECTED; kfree(req->kbuf); req->kbuf = NULL; @@ -109,8 +108,8 @@ bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags) buf = req->kbuf; bl = io_buffer_get_list(ctx, buf->bgid); list_add(&buf->list, &bl->buf_list); + bl->nbufs++; req->flags &= ~REQ_F_BUFFER_SELECTED; - req->buf_index = buf->bgid; io_ring_submit_unlock(ctx, issue_flags); return true; @@ -124,6 +123,7 @@ static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len, kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list); list_del(&kbuf->list); + bl->nbufs--; if (*len == 0 || *len > kbuf->len) *len = kbuf->len; if (list_empty(&bl->buf_list)) @@ -193,7 +193,7 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, } void __user *io_buffer_select(struct io_kiocb *req, size_t *len, - unsigned int issue_flags) + unsigned buf_group, unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; struct io_buffer_list *bl; @@ -201,7 +201,7 @@ void __user *io_buffer_select(struct io_kiocb *req, size_t *len, io_ring_submit_lock(req->ctx, issue_flags); - bl = io_buffer_get_list(ctx, req->buf_index); + bl = io_buffer_get_list(ctx, buf_group); if (likely(bl)) { if (bl->flags & IOBL_BUF_RING) ret = io_ring_buffer_select(req, len, bl, 
issue_flags); @@ -270,8 +270,12 @@ static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg, /* truncate end piece, if needed, for non partial buffers */ if (len > arg->max_len) { len = arg->max_len; - if (!(bl->flags & IOBL_INC)) + if (!(bl->flags & IOBL_INC)) { + arg->partial_map = 1; + if (iov != arg->iovs) + break; buf->len = len; + } } iov->iov_base = u64_to_user_ptr(buf->addr); @@ -302,7 +306,7 @@ int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg, int ret = -ENOENT; io_ring_submit_lock(ctx, issue_flags); - bl = io_buffer_get_list(ctx, req->buf_index); + bl = io_buffer_get_list(ctx, arg->buf_group); if (unlikely(!bl)) goto out_unlock; @@ -335,7 +339,7 @@ int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg) lockdep_assert_held(&ctx->uring_lock); - bl = io_buffer_get_list(ctx, req->buf_index); + bl = io_buffer_get_list(ctx, arg->buf_group); if (unlikely(!bl)) return -ENOENT; @@ -355,10 +359,9 @@ static inline bool __io_put_kbuf_ring(struct io_kiocb *req, int len, int nr) struct io_buffer_list *bl = req->buf_list; bool ret = true; - if (bl) { + if (bl) ret = io_kbuf_commit(req, bl, len, nr); - req->buf_index = bl->bgid; - } + req->flags &= ~REQ_F_BUFFER_RING; return ret; } @@ -379,45 +382,34 @@ unsigned int __io_put_kbufs(struct io_kiocb *req, int len, int nbufs) return ret; } -static int __io_remove_buffers(struct io_ring_ctx *ctx, - struct io_buffer_list *bl, unsigned nbufs) +static int io_remove_buffers_legacy(struct io_ring_ctx *ctx, + struct io_buffer_list *bl, + unsigned long nbufs) { - unsigned i = 0; - - /* shouldn't happen */ - if (!nbufs) - return 0; - - if (bl->flags & IOBL_BUF_RING) { - i = bl->buf_ring->tail - bl->head; - io_free_region(ctx, &bl->region); - /* make sure it's seen as empty */ - INIT_LIST_HEAD(&bl->buf_list); - bl->flags &= ~IOBL_BUF_RING; - return i; - } + unsigned long i = 0; + struct io_buffer *nxt; /* protects io_buffers_cache */ lockdep_assert_held(&ctx->uring_lock); + WARN_ON_ONCE(bl->flags & IOBL_BUF_RING); - while (!list_empty(&bl->buf_list)) { - struct io_buffer *nxt; - + for (i = 0; i < nbufs && !list_empty(&bl->buf_list); i++) { nxt = list_first_entry(&bl->buf_list, struct io_buffer, list); list_del(&nxt->list); + bl->nbufs--; kfree(nxt); - - if (++i == nbufs) - return i; cond_resched(); } - return i; } static void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl) { - __io_remove_buffers(ctx, bl, -1U); + if (bl->flags & IOBL_BUF_RING) + io_free_region(ctx, &bl->region); + else + io_remove_buffers_legacy(ctx, bl, -1U); + kfree(bl); } @@ -465,30 +457,6 @@ int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } -int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf); - struct io_ring_ctx *ctx = req->ctx; - struct io_buffer_list *bl; - int ret = 0; - - io_ring_submit_lock(ctx, issue_flags); - - ret = -ENOENT; - bl = io_buffer_get_list(ctx, p->bgid); - if (bl) { - ret = -EINVAL; - /* can't use provide/remove buffers command on mapped buffers */ - if (!(bl->flags & IOBL_BUF_RING)) - ret = __io_remove_buffers(ctx, bl, p->nbufs); - } - io_ring_submit_unlock(ctx, issue_flags); - if (ret < 0) - req_set_fail(req); - io_req_set_res(req, ret, 0); - return IOU_OK; -} - int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { unsigned long size, tmp_check; @@ -512,8 +480,6 @@ int io_provide_buffers_prep(struct io_kiocb *req, const struct 
io_uring_sqe *sqe return -EOVERFLOW; if (check_add_overflow((unsigned long)p->addr, size, &tmp_check)) return -EOVERFLOW; - - size = (unsigned long)p->len * p->nbufs; if (!access_ok(u64_to_user_ptr(p->addr), size)) return -EFAULT; @@ -532,14 +498,24 @@ static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf, { struct io_buffer *buf; u64 addr = pbuf->addr; - int i, bid = pbuf->bid; + int ret = -ENOMEM, i, bid = pbuf->bid; for (i = 0; i < pbuf->nbufs; i++) { + /* + * Nonsensical to have more than sizeof(bid) buffers in a + * buffer list, as the application then has no way of knowing + * which duplicate bid refers to what buffer. + */ + if (bl->nbufs == USHRT_MAX) { + ret = -EOVERFLOW; + break; + } buf = kmalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT); if (!buf) break; list_add_tail(&buf->list, &bl->buf_list); + bl->nbufs++; buf->addr = addr; buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT); buf->bid = bid; @@ -549,52 +525,59 @@ static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf, cond_resched(); } - return i ? 0 : -ENOMEM; + return i ? 0 : ret; } -int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) +static int __io_manage_buffers_legacy(struct io_kiocb *req, + struct io_buffer_list *bl) { struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf); - struct io_ring_ctx *ctx = req->ctx; - struct io_buffer_list *bl; - int ret = 0; - - io_ring_submit_lock(ctx, issue_flags); + int ret; - bl = io_buffer_get_list(ctx, p->bgid); - if (unlikely(!bl)) { + if (!bl) { + if (req->opcode != IORING_OP_PROVIDE_BUFFERS) + return -ENOENT; bl = kzalloc(sizeof(*bl), GFP_KERNEL_ACCOUNT); - if (!bl) { - ret = -ENOMEM; - goto err; - } + if (!bl) + return -ENOMEM; + INIT_LIST_HEAD(&bl->buf_list); - ret = io_buffer_add_list(ctx, bl, p->bgid); + ret = io_buffer_add_list(req->ctx, bl, p->bgid); if (ret) { kfree(bl); - goto err; + return ret; } } - /* can't add buffers via this command for a mapped buffer ring */ - if (bl->flags & IOBL_BUF_RING) { - ret = -EINVAL; - goto err; - } + /* can't use provide/remove buffers command on mapped buffers */ + if (bl->flags & IOBL_BUF_RING) + return -EINVAL; + if (req->opcode == IORING_OP_PROVIDE_BUFFERS) + return io_add_buffers(req->ctx, p, bl); + return io_remove_buffers_legacy(req->ctx, bl, p->nbufs); +} + +int io_manage_buffers_legacy(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf); + struct io_ring_ctx *ctx = req->ctx; + struct io_buffer_list *bl; + int ret; - ret = io_add_buffers(ctx, p, bl); -err: + io_ring_submit_lock(ctx, issue_flags); + bl = io_buffer_get_list(ctx, p->bgid); + ret = __io_manage_buffers_legacy(req, bl); io_ring_submit_unlock(ctx, issue_flags); if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) { struct io_uring_buf_reg reg; - struct io_buffer_list *bl, *free_bl = NULL; + struct io_buffer_list *bl; struct io_uring_region_desc rd; struct io_uring_buf_ring *br; unsigned long mmap_offset; @@ -605,8 +588,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) if (copy_from_user(®, arg, sizeof(reg))) return -EFAULT; - - if (reg.resv[0] || reg.resv[1] || reg.resv[2]) + if (!mem_is_zero(reg.resv, sizeof(reg.resv))) return -EINVAL; if (reg.flags & ~(IOU_PBUF_RING_MMAP | IOU_PBUF_RING_INC)) return -EINVAL; @@ -624,7 +606,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void 
__user *arg) io_destroy_bl(ctx, bl); } - free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL); + bl = kzalloc(sizeof(*bl), GFP_KERNEL_ACCOUNT); if (!bl) return -ENOMEM; @@ -669,7 +651,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) return 0; fail: io_free_region(ctx, &bl->region); - kfree(free_bl); + kfree(bl); return ret; } @@ -682,9 +664,7 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) if (copy_from_user(&reg, arg, sizeof(reg))) return -EFAULT; - if (reg.resv[0] || reg.resv[1] || reg.resv[2]) - return -EINVAL; - if (reg.flags) + if (!mem_is_zero(reg.resv, sizeof(reg.resv)) || reg.flags) return -EINVAL; bl = io_buffer_get_list(ctx, reg.bgid); @@ -704,14 +684,11 @@ int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg) { struct io_uring_buf_status buf_status; struct io_buffer_list *bl; - int i; if (copy_from_user(&buf_status, arg, sizeof(buf_status))) return -EFAULT; - - for (i = 0; i < ARRAY_SIZE(buf_status.resv); i++) - if (buf_status.resv[i]) - return -EINVAL; + if (!mem_is_zero(buf_status.resv, sizeof(buf_status.resv))) + return -EINVAL; bl = io_buffer_get_list(ctx, buf_status.buf_group); if (!bl) diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index 2ec0b983ce24..723d0361898e 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -21,6 +21,9 @@ struct io_buffer_list { struct list_head buf_list; struct io_uring_buf_ring *buf_ring; }; + /* count of classic/legacy buffers in buffer list */ + int nbufs; + __u16 bgid; /* below is for ring provided buffers */ @@ -55,20 +58,20 @@ struct buf_sel_arg { size_t max_len; unsigned short nr_iovs; unsigned short mode; + unsigned short buf_group; + unsigned short partial_map; }; void __user *io_buffer_select(struct io_kiocb *req, size_t *len, - unsigned int issue_flags); + unsigned buf_group, unsigned int issue_flags); int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg, unsigned int issue_flags); int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg); void io_destroy_buffers(struct io_ring_ctx *ctx); int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); -int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags); - int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); -int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags); +int io_manage_buffers_legacy(struct io_kiocb *req, unsigned int issue_flags); int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); @@ -94,7 +97,6 @@ static inline bool io_kbuf_recycle_ring(struct io_kiocb *req) * to monopolize the buffer.
*/ if (req->buf_list) { - req->buf_index = req->buf_list->bgid; req->flags &= ~(REQ_F_BUFFER_RING|REQ_F_BUFFERS_COMMIT); return true; } diff --git a/io_uring/memmap.c b/io_uring/memmap.c index 07f8a5cbd37e..2e99dffddfc5 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -13,6 +13,7 @@ #include "memmap.h" #include "kbuf.h" #include "rsrc.h" +#include "zcrx.h" static void *io_mem_alloc_compound(struct page **pages, int nr_pages, size_t size, gfp_t gfp) @@ -155,7 +156,7 @@ static int io_region_allocate_pages(struct io_ring_ctx *ctx, unsigned long mmap_offset) { gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN; - unsigned long size = mr->nr_pages << PAGE_SHIFT; + size_t size = (size_t) mr->nr_pages << PAGE_SHIFT; unsigned long nr_allocated; struct page **pages; void *p; @@ -258,7 +259,8 @@ static struct io_mapped_region *io_mmap_get_region(struct io_ring_ctx *ctx, loff_t pgoff) { loff_t offset = pgoff << PAGE_SHIFT; - unsigned int bgid; + unsigned int id; + switch (offset & IORING_OFF_MMAP_MASK) { case IORING_OFF_SQ_RING: @@ -267,12 +269,13 @@ static struct io_mapped_region *io_mmap_get_region(struct io_ring_ctx *ctx, case IORING_OFF_SQES: return &ctx->sq_region; case IORING_OFF_PBUF_RING: - bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; - return io_pbuf_get_region(ctx, bgid); + id = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; + return io_pbuf_get_region(ctx, id); case IORING_MAP_OFF_PARAM_REGION: return &ctx->param_region; case IORING_MAP_OFF_ZCRX_REGION: - return &ctx->zcrx_region; + id = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_ZCRX_SHIFT; + return io_zcrx_get_region(ctx, id); } return NULL; } diff --git a/io_uring/memmap.h b/io_uring/memmap.h index dad0aa5b1b45..08419684e4bc 100644 --- a/io_uring/memmap.h +++ b/io_uring/memmap.h @@ -4,7 +4,9 @@ #define IORING_MAP_OFF_PARAM_REGION 0x20000000ULL #define IORING_MAP_OFF_ZCRX_REGION 0x30000000ULL -struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); +#define IORING_OFF_ZCRX_SHIFT 16 + +struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages); #ifndef CONFIG_MMU unsigned int io_uring_nommu_mmap_capabilities(struct file *file); diff --git a/io_uring/mock_file.c b/io_uring/mock_file.c new file mode 100644 index 000000000000..45d3735b2708 --- /dev/null +++ b/io_uring/mock_file.c @@ -0,0 +1,363 @@ +#include <linux/device.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/miscdevice.h> +#include <linux/module.h> +#include <linux/anon_inodes.h> +#include <linux/ktime.h> +#include <linux/hrtimer.h> +#include <linux/poll.h> + +#include <linux/io_uring/cmd.h> +#include <linux/io_uring_types.h> +#include <uapi/linux/io_uring/mock_file.h> + +struct io_mock_iocb { + struct kiocb *iocb; + struct hrtimer timer; + int res; +}; + +struct io_mock_file { + size_t size; + u64 rw_delay_ns; + bool pollable; + struct wait_queue_head poll_wq; +}; + +#define IO_VALID_COPY_CMD_FLAGS IORING_MOCK_COPY_FROM + +static int io_copy_regbuf(struct iov_iter *reg_iter, void __user *ubuf) +{ + size_t ret, copied = 0; + size_t buflen = PAGE_SIZE; + void *tmp_buf; + + tmp_buf = kzalloc(buflen, GFP_KERNEL); + if (!tmp_buf) + return -ENOMEM; + + while (iov_iter_count(reg_iter)) { + size_t len = min(iov_iter_count(reg_iter), buflen); + + if (iov_iter_rw(reg_iter) == ITER_SOURCE) { + ret = copy_from_iter(tmp_buf, len, reg_iter); + if (ret <= 0) + break; + if (copy_to_user(ubuf, tmp_buf, ret)) + break; + } else { + if (copy_from_user(tmp_buf, ubuf, len)) + 
break; + ret = copy_to_iter(tmp_buf, len, reg_iter); + if (ret <= 0) + break; + } + ubuf += ret; + copied += ret; + } + + kfree(tmp_buf); + return copied; +} + +static int io_cmd_copy_regbuf(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ + const struct io_uring_sqe *sqe = cmd->sqe; + const struct iovec __user *iovec; + unsigned flags, iovec_len; + struct iov_iter iter; + void __user *ubuf; + int dir, ret; + + ubuf = u64_to_user_ptr(READ_ONCE(sqe->addr3)); + iovec = u64_to_user_ptr(READ_ONCE(sqe->addr)); + iovec_len = READ_ONCE(sqe->len); + flags = READ_ONCE(sqe->file_index); + + if (unlikely(sqe->ioprio || sqe->__pad1)) + return -EINVAL; + if (flags & ~IO_VALID_COPY_CMD_FLAGS) + return -EINVAL; + + dir = (flags & IORING_MOCK_COPY_FROM) ? ITER_SOURCE : ITER_DEST; + ret = io_uring_cmd_import_fixed_vec(cmd, iovec, iovec_len, dir, &iter, + issue_flags); + if (ret) + return ret; + ret = io_copy_regbuf(&iter, ubuf); + return ret ? ret : -EFAULT; +} + +static int io_mock_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ + switch (cmd->cmd_op) { + case IORING_MOCK_CMD_COPY_REGBUF: + return io_cmd_copy_regbuf(cmd, issue_flags); + } + return -ENOTSUPP; +} + +static enum hrtimer_restart io_mock_rw_timer_expired(struct hrtimer *timer) +{ + struct io_mock_iocb *mio = container_of(timer, struct io_mock_iocb, timer); + struct kiocb *iocb = mio->iocb; + + WRITE_ONCE(iocb->private, NULL); + iocb->ki_complete(iocb, mio->res); + kfree(mio); + return HRTIMER_NORESTART; +} + +static ssize_t io_mock_delay_rw(struct kiocb *iocb, size_t len) +{ + struct io_mock_file *mf = iocb->ki_filp->private_data; + struct io_mock_iocb *mio; + + mio = kzalloc(sizeof(*mio), GFP_KERNEL); + if (!mio) + return -ENOMEM; + + mio->iocb = iocb; + mio->res = len; + hrtimer_setup(&mio->timer, io_mock_rw_timer_expired, + CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer_start(&mio->timer, ns_to_ktime(mf->rw_delay_ns), + HRTIMER_MODE_REL); + return -EIOCBQUEUED; +} + +static ssize_t io_mock_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct io_mock_file *mf = iocb->ki_filp->private_data; + size_t len = iov_iter_count(to); + size_t nr_zeroed; + + if (iocb->ki_pos + len > mf->size) + return -EINVAL; + nr_zeroed = iov_iter_zero(len, to); + if (!mf->rw_delay_ns || nr_zeroed != len) + return nr_zeroed; + + return io_mock_delay_rw(iocb, len); +} + +static ssize_t io_mock_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct io_mock_file *mf = iocb->ki_filp->private_data; + size_t len = iov_iter_count(from); + + if (iocb->ki_pos + len > mf->size) + return -EINVAL; + if (!mf->rw_delay_ns) { + iov_iter_advance(from, len); + return len; + } + + return io_mock_delay_rw(iocb, len); +} + +static loff_t io_mock_llseek(struct file *file, loff_t offset, int whence) +{ + struct io_mock_file *mf = file->private_data; + + return fixed_size_llseek(file, offset, whence, mf->size); +} + +static __poll_t io_mock_poll(struct file *file, struct poll_table_struct *pt) +{ + struct io_mock_file *mf = file->private_data; + __poll_t mask = 0; + + poll_wait(file, &mf->poll_wq, pt); + + mask |= EPOLLOUT | EPOLLWRNORM; + mask |= EPOLLIN | EPOLLRDNORM; + return mask; +} + +static int io_mock_release(struct inode *inode, struct file *file) +{ + struct io_mock_file *mf = file->private_data; + + kfree(mf); + return 0; +} + +static const struct file_operations io_mock_fops = { + .owner = THIS_MODULE, + .release = io_mock_release, + .uring_cmd = io_mock_cmd, + .read_iter = io_mock_read_iter, + .write_iter = io_mock_write_iter, + .llseek = 
io_mock_llseek, +}; + +static const struct file_operations io_mock_poll_fops = { + .owner = THIS_MODULE, + .release = io_mock_release, + .uring_cmd = io_mock_cmd, + .read_iter = io_mock_read_iter, + .write_iter = io_mock_write_iter, + .llseek = io_mock_llseek, + .poll = io_mock_poll, +}; + +#define IO_VALID_CREATE_FLAGS (IORING_MOCK_CREATE_F_SUPPORT_NOWAIT | \ + IORING_MOCK_CREATE_F_POLL) + +static int io_create_mock_file(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ + const struct file_operations *fops = &io_mock_fops; + const struct io_uring_sqe *sqe = cmd->sqe; + struct io_uring_mock_create mc, __user *uarg; + struct io_mock_file *mf = NULL; + struct file *file = NULL; + size_t uarg_size; + int fd = -1, ret; + + /* + * It's a testing only driver that allows exercising edge cases + * that wouldn't be possible to hit otherwise. + */ + add_taint(TAINT_TEST, LOCKDEP_STILL_OK); + + uarg = u64_to_user_ptr(READ_ONCE(sqe->addr)); + uarg_size = READ_ONCE(sqe->len); + + if (sqe->ioprio || sqe->__pad1 || sqe->addr3 || sqe->file_index) + return -EINVAL; + if (uarg_size != sizeof(mc)) + return -EINVAL; + + memset(&mc, 0, sizeof(mc)); + if (copy_from_user(&mc, uarg, uarg_size)) + return -EFAULT; + if (!mem_is_zero(mc.__resv, sizeof(mc.__resv))) + return -EINVAL; + if (mc.flags & ~IO_VALID_CREATE_FLAGS) + return -EINVAL; + if (mc.file_size > SZ_1G) + return -EINVAL; + if (mc.rw_delay_ns > NSEC_PER_SEC) + return -EINVAL; + + mf = kzalloc(sizeof(*mf), GFP_KERNEL_ACCOUNT); + if (!mf) + return -ENOMEM; + + ret = fd = get_unused_fd_flags(O_RDWR | O_CLOEXEC); + if (fd < 0) + goto fail; + + init_waitqueue_head(&mf->poll_wq); + mf->size = mc.file_size; + mf->rw_delay_ns = mc.rw_delay_ns; + if (mc.flags & IORING_MOCK_CREATE_F_POLL) { + fops = &io_mock_poll_fops; + mf->pollable = true; + } + + file = anon_inode_create_getfile("[io_uring_mock]", fops, + mf, O_RDWR | O_CLOEXEC, NULL); + if (IS_ERR(file)) { + ret = PTR_ERR(file); + goto fail; + } + + file->f_mode |= FMODE_READ | FMODE_CAN_READ | + FMODE_WRITE | FMODE_CAN_WRITE | + FMODE_LSEEK; + if (mc.flags & IORING_MOCK_CREATE_F_SUPPORT_NOWAIT) + file->f_mode |= FMODE_NOWAIT; + + mc.out_fd = fd; + if (copy_to_user(uarg, &mc, uarg_size)) { + fput(file); + ret = -EFAULT; + goto fail; + } + + fd_install(fd, file); + return 0; +fail: + if (fd >= 0) + put_unused_fd(fd); + kfree(mf); + return ret; +} + +static int io_probe_mock(struct io_uring_cmd *cmd) +{ + const struct io_uring_sqe *sqe = cmd->sqe; + struct io_uring_mock_probe mp, __user *uarg; + size_t uarg_size; + + uarg = u64_to_user_ptr(READ_ONCE(sqe->addr)); + uarg_size = READ_ONCE(sqe->len); + + if (sqe->ioprio || sqe->__pad1 || sqe->addr3 || sqe->file_index || + uarg_size != sizeof(mp)) + return -EINVAL; + + memset(&mp, 0, sizeof(mp)); + if (copy_from_user(&mp, uarg, uarg_size)) + return -EFAULT; + if (!mem_is_zero(&mp, sizeof(mp))) + return -EINVAL; + + mp.features = IORING_MOCK_FEAT_END; + + if (copy_to_user(uarg, &mp, uarg_size)) + return -EFAULT; + return 0; +} + +static int iou_mock_mgr_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + switch (cmd->cmd_op) { + case IORING_MOCK_MGR_CMD_PROBE: + return io_probe_mock(cmd); + case IORING_MOCK_MGR_CMD_CREATE: + return io_create_mock_file(cmd, issue_flags); + } + return -EOPNOTSUPP; +} + +static const struct file_operations iou_mock_dev_fops = { + .owner = THIS_MODULE, + .uring_cmd = iou_mock_mgr_cmd, +}; + +static struct miscdevice iou_mock_miscdev = { + .minor = MISC_DYNAMIC_MINOR, + .name 
= "io_uring_mock", + .fops = &iou_mock_dev_fops, +}; + +static int __init io_mock_init(void) +{ + int ret; + + ret = misc_register(&iou_mock_miscdev); + if (ret < 0) { + pr_err("Could not initialize io_uring mock device\n"); + return ret; + } + return 0; +} + +static void __exit io_mock_exit(void) +{ + misc_deregister(&iou_mock_miscdev); +} + +module_init(io_mock_init) +module_exit(io_mock_exit) + +MODULE_AUTHOR("Pavel Begunkov <asml.silence@gmail.com>"); +MODULE_DESCRIPTION("io_uring mock file"); +MODULE_LICENSE("GPL"); diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index 50a958e9c921..4c2578f2efcb 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -82,7 +82,7 @@ static void io_msg_tw_complete(struct io_kiocb *req, io_tw_token_t tw) spin_unlock(&ctx->msg_lock); } if (req) - kmem_cache_free(req_cachep, req); + kfree_rcu(req, rcu_head); percpu_ref_put(&ctx->refs); } @@ -90,7 +90,7 @@ static int io_msg_remote_post(struct io_ring_ctx *ctx, struct io_kiocb *req, int res, u32 cflags, u64 user_data) { if (!READ_ONCE(ctx->submitter_task)) { - kmem_cache_free(req_cachep, req); + kfree_rcu(req, rcu_head); return -EOWNERDEAD; } req->opcode = IORING_OP_NOP; @@ -328,7 +328,7 @@ done: req_set_fail(req); } io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } int io_uring_sync_msg_ring(struct io_uring_sqe *sqe) diff --git a/io_uring/net.c b/io_uring/net.c index 27f37fa2ef79..d69f2afa4f7a 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -18,7 +18,6 @@ #include "rsrc.h" #include "zcrx.h" -#if defined(CONFIG_NET) struct io_shutdown { struct file *file; int how; @@ -76,13 +75,32 @@ struct io_sr_msg { u16 flags; /* initialised and used only by !msg send variants */ u16 buf_group; - bool retry; + /* per-invocation mshot limit */ + unsigned mshot_len; + /* overall mshot byte limit */ + unsigned mshot_total_len; void __user *msg_control; /* used only for send zerocopy */ struct io_kiocb *notif; }; /* + * The UAPI flags are the lower 8 bits, as that's all sqe->ioprio will hold + * anyway. Use the upper 8 bits for internal uses. + */ +enum sr_retry_flags { + IORING_RECV_RETRY = (1U << 15), + IORING_RECV_PARTIAL_MAP = (1U << 14), + IORING_RECV_MSHOT_CAP = (1U << 13), + IORING_RECV_MSHOT_LIM = (1U << 12), + IORING_RECV_MSHOT_DONE = (1U << 11), + + IORING_RECV_RETRY_CLEAR = IORING_RECV_RETRY | IORING_RECV_PARTIAL_MAP, + IORING_RECV_NO_RETRY = IORING_RECV_RETRY | IORING_RECV_PARTIAL_MAP | + IORING_RECV_MSHOT_CAP | IORING_RECV_MSHOT_DONE, +}; + +/* * Number of times we'll try and do receives if there's more data. If we * exceed this limit, then add us to the back of the queue and retry from * there. This helps fairness between flooding clients. 
@@ -129,7 +147,7 @@ int io_shutdown(struct io_kiocb *req, unsigned int issue_flags) ret = __sys_shutdown_sock(sock, shutdown->how); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } static bool io_net_retry(struct socket *sock, int flags) @@ -188,9 +206,8 @@ static inline void io_mshot_prep_retry(struct io_kiocb *req, req->flags &= ~REQ_F_BL_EMPTY; sr->done_io = 0; - sr->retry = false; - sr->len = 0; /* get from the provided buffer */ - req->buf_index = sr->buf_group; + sr->flags &= ~IORING_RECV_RETRY_CLEAR; + sr->len = sr->mshot_len; } static int io_net_import_vec(struct io_kiocb *req, struct io_async_msghdr *iomsg, @@ -359,15 +376,17 @@ static int io_send_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe) kmsg->msg.msg_name = &kmsg->addr; kmsg->msg.msg_namelen = addr_len; } - if (sr->flags & IORING_RECVSEND_FIXED_BUF) + if (sr->flags & IORING_RECVSEND_FIXED_BUF) { + req->flags |= REQ_F_IMPORT_BUFFER; return 0; - if (!io_do_buffer_select(req)) { - ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len, - &kmsg->msg.msg_iter); - if (unlikely(ret < 0)) - return ret; } - return 0; + if (req->flags & REQ_F_BUFFER_SELECT) + return 0; + + if (sr->flags & IORING_SEND_VECTORIZED) + return io_net_import_vec(req, kmsg, sr->buf, sr->len, ITER_SOURCE); + + return import_ubuf(ITER_SOURCE, sr->buf, sr->len, &kmsg->msg.msg_iter); } static int io_sendmsg_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -394,14 +413,13 @@ static int io_sendmsg_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe return io_net_import_vec(req, kmsg, msg.msg_iov, msg.msg_iovlen, ITER_SOURCE); } -#define SENDMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_BUNDLE) +#define SENDMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_BUNDLE | IORING_SEND_VECTORIZED) int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); sr->done_io = 0; - sr->retry = false; sr->len = READ_ONCE(sqe->len); sr->flags = READ_ONCE(sqe->ioprio); if (sr->flags & ~SENDMSG_FLAGS) @@ -409,13 +427,12 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; if (sr->msg_flags & MSG_DONTWAIT) req->flags |= REQ_F_NOWAIT; + if (req->flags & REQ_F_BUFFER_SELECT) + sr->buf_group = req->buf_index; if (sr->flags & IORING_RECVSEND_BUNDLE) { if (req->opcode == IORING_OP_SENDMSG) return -EINVAL; - if (!(req->flags & REQ_F_BUFFER_SELECT)) - return -EINVAL; sr->msg_flags |= MSG_WAITALL; - sr->buf_group = req->buf_index; req->buf_list = NULL; req->flags |= REQ_F_MULTISHOT; } @@ -477,6 +494,15 @@ static int io_bundle_nbufs(struct io_async_msghdr *kmsg, int ret) return nbufs; } +static int io_net_kbuf_recyle(struct io_kiocb *req, + struct io_async_msghdr *kmsg, int len) +{ + req->flags |= REQ_F_BL_NO_RECYCLE; + if (req->flags & REQ_F_BUFFERS_COMMIT) + io_kbuf_commit(req, req->buf_list, len, io_bundle_nbufs(kmsg, len)); + return IOU_RETRY; +} + static inline bool io_send_finish(struct io_kiocb *req, int *ret, struct io_async_msghdr *kmsg, unsigned issue_flags) @@ -507,7 +533,7 @@ static inline bool io_send_finish(struct io_kiocb *req, int *ret, /* Otherwise stop bundle and use the current result. 
*/ finish: io_req_set_res(req, *ret, cflags); - *ret = IOU_OK; + *ret = IOU_COMPLETE; return true; } @@ -545,8 +571,7 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) kmsg->msg.msg_controllen = 0; kmsg->msg.msg_control = NULL; sr->done_io += ret; - req->flags |= REQ_F_BL_NO_RECYCLE; - return -EAGAIN; + return io_net_kbuf_recyle(req, kmsg, ret); } if (ret == -ERESTARTSYS) ret = -EINTR; @@ -558,7 +583,7 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) else if (sr->done_io) ret = sr->done_io; io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } static int io_send_select_buffer(struct io_kiocb *req, unsigned int issue_flags, @@ -571,6 +596,7 @@ static int io_send_select_buffer(struct io_kiocb *req, unsigned int issue_flags, .iovs = &kmsg->fast_iov, .max_len = min_not_zero(sr->len, INT_MAX), .nr_iovs = 1, + .buf_group = sr->buf_group, }; if (kmsg->vec.iovec) { @@ -656,8 +682,7 @@ retry_bundle: sr->len -= ret; sr->buf += ret; sr->done_io += ret; - req->flags |= REQ_F_BL_NO_RECYCLE; - return -EAGAIN; + return io_net_kbuf_recyle(req, kmsg, ret); } if (ret == -ERESTARTSYS) ret = -EINTR; @@ -723,7 +748,6 @@ static int io_recvmsg_prep_setup(struct io_kiocb *req) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); struct io_async_msghdr *kmsg; - int ret; kmsg = io_msg_alloc_async(req); if (unlikely(!kmsg)) @@ -739,13 +763,10 @@ static int io_recvmsg_prep_setup(struct io_kiocb *req) kmsg->msg.msg_iocb = NULL; kmsg->msg.msg_ubuf = NULL; - if (!io_do_buffer_select(req)) { - ret = import_ubuf(ITER_DEST, sr->buf, sr->len, - &kmsg->msg.msg_iter); - if (unlikely(ret)) - return ret; - } - return 0; + if (req->flags & REQ_F_BUFFER_SELECT) + return 0; + return import_ubuf(ITER_DEST, sr->buf, sr->len, + &kmsg->msg.msg_iter); } return io_recvmsg_copy_hdr(req, kmsg); @@ -759,9 +780,8 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); sr->done_io = 0; - sr->retry = false; - if (unlikely(sqe->file_index || sqe->addr2)) + if (unlikely(sqe->addr2)) return -EINVAL; sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); @@ -786,15 +806,25 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) sr->buf_group = req->buf_index; req->buf_list = NULL; } + sr->mshot_total_len = sr->mshot_len = 0; if (sr->flags & IORING_RECV_MULTISHOT) { if (!(req->flags & REQ_F_BUFFER_SELECT)) return -EINVAL; if (sr->msg_flags & MSG_WAITALL) return -EINVAL; - if (req->opcode == IORING_OP_RECV && sr->len) + if (req->opcode == IORING_OP_RECV) { + sr->mshot_len = sr->len; + sr->mshot_total_len = READ_ONCE(sqe->optlen); + if (sr->mshot_total_len) + sr->flags |= IORING_RECV_MSHOT_LIM; + } else if (sqe->optlen) { return -EINVAL; + } req->flags |= REQ_F_APOLL_MULTISHOT; + } else if (sqe->optlen) { + return -EINVAL; } + if (sr->flags & IORING_RECVSEND_BUNDLE) { if (req->opcode == IORING_OP_RECVMSG) return -EINVAL; @@ -826,13 +856,28 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, if (kmsg->msg.msg_inq > 0) cflags |= IORING_CQE_F_SOCK_NONEMPTY; + if (*ret > 0 && sr->flags & IORING_RECV_MSHOT_LIM) { + /* + * If sr->len hits zero, the limit has been reached. Mark + * mshot as finished, and flag MSHOT_DONE as well to prevent + * a potential bundle from being retried. 
+ */ + sr->mshot_total_len -= min_t(int, *ret, sr->mshot_total_len); + if (!sr->mshot_total_len) { + sr->flags |= IORING_RECV_MSHOT_DONE; + mshot_finished = true; + } + } + if (sr->flags & IORING_RECVSEND_BUNDLE) { size_t this_ret = *ret - sr->done_io; - cflags |= io_put_kbufs(req, *ret, io_bundle_nbufs(kmsg, this_ret), + cflags |= io_put_kbufs(req, this_ret, io_bundle_nbufs(kmsg, this_ret), issue_flags); - if (sr->retry) + if (sr->flags & IORING_RECV_RETRY) cflags = req->cqe.flags | (cflags & CQE_F_MASK); + if (sr->mshot_len && *ret >= sr->mshot_len) + sr->flags |= IORING_RECV_MSHOT_CAP; /* bundle with no more immediate buffers, we're done */ if (req->flags & REQ_F_BL_EMPTY) goto finish; @@ -840,12 +885,13 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, * If more is available AND it was a full transfer, retry and * append to this one */ - if (!sr->retry && kmsg->msg.msg_inq > 0 && this_ret > 0 && + if (!(sr->flags & IORING_RECV_NO_RETRY) && + kmsg->msg.msg_inq > 1 && this_ret > 0 && !iov_iter_count(&kmsg->msg.msg_iter)) { req->cqe.flags = cflags & ~CQE_F_MASK; sr->len = kmsg->msg.msg_inq; sr->done_io += this_ret; - sr->retry = true; + sr->flags |= IORING_RECV_RETRY; return false; } } else { @@ -862,10 +908,13 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, io_mshot_prep_retry(req, kmsg); /* Known not-empty or unknown state, retry */ if (cflags & IORING_CQE_F_SOCK_NONEMPTY || kmsg->msg.msg_inq < 0) { - if (sr->nr_multishot_loops++ < MULTISHOT_MAX_RETRY) + if (sr->nr_multishot_loops++ < MULTISHOT_MAX_RETRY && + !(sr->flags & IORING_RECV_MSHOT_CAP)) { return false; + } /* mshot retries exceeded, force a requeue */ sr->nr_multishot_loops = 0; + sr->flags &= ~IORING_RECV_MSHOT_CAP; if (issue_flags & IO_URING_F_MULTISHOT) *ret = IOU_REQUEUE; } @@ -991,7 +1040,7 @@ retry_multishot: void __user *buf; size_t len = sr->len; - buf = io_buffer_select(req, &len, issue_flags); + buf = io_buffer_select(req, &len, sr->buf_group, issue_flags); if (!buf) return -ENOBUFS; @@ -1029,8 +1078,7 @@ retry_multishot: } if (ret > 0 && io_net_retry(sock, flags)) { sr->done_io += ret; - req->flags |= REQ_F_BL_NO_RECYCLE; - return IOU_RETRY; + return io_net_kbuf_recyle(req, kmsg, ret); } if (ret == -ERESTARTSYS) ret = -EINTR; @@ -1069,6 +1117,7 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg .iovs = &kmsg->fast_iov, .nr_iovs = 1, .mode = KBUF_MODE_EXPAND, + .buf_group = sr->buf_group, }; if (kmsg->vec.iovec) { @@ -1077,13 +1126,26 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg arg.mode |= KBUF_MODE_FREE; } - if (kmsg->msg.msg_inq > 0) - arg.max_len = min_not_zero(sr->len, kmsg->msg.msg_inq); + if (*len) + arg.max_len = *len; + else if (kmsg->msg.msg_inq > 1) + arg.max_len = min_not_zero(*len, (size_t) kmsg->msg.msg_inq); + /* if mshot limited, ensure we don't go over */ + if (sr->flags & IORING_RECV_MSHOT_LIM) + arg.max_len = min_not_zero(arg.max_len, sr->mshot_total_len); ret = io_buffers_peek(req, &arg); if (unlikely(ret < 0)) return ret; + if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->vec.iovec) { + kmsg->vec.nr = ret; + kmsg->vec.iovec = arg.iovs; + req->flags |= REQ_F_NEED_CLEANUP; + } + if (arg.partial_map) + sr->flags |= IORING_RECV_PARTIAL_MAP; + /* special case 1 vec, can be a fast path */ if (ret == 1) { sr->buf = arg.iovs[0].iov_base; @@ -1092,16 +1154,11 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg } iov_iter_init(&kmsg->msg.msg_iter, ITER_DEST, 
arg.iovs, ret, arg.out_len); - if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->vec.iovec) { - kmsg->vec.nr = ret; - kmsg->vec.iovec = arg.iovs; - req->flags |= REQ_F_NEED_CLEANUP; - } } else { void __user *buf; *len = sr->len; - buf = io_buffer_select(req, len, issue_flags); + buf = io_buffer_select(req, len, sr->buf_group, issue_flags); if (!buf) return -ENOBUFS; sr->buf = buf; @@ -1167,8 +1224,7 @@ retry_multishot: sr->len -= ret; sr->buf += ret; sr->done_io += ret; - req->flags |= REQ_F_BL_NO_RECYCLE; - return -EAGAIN; + return io_net_kbuf_recyle(req, kmsg, ret); } if (ret == -ERESTARTSYS) ret = -EINTR; @@ -1197,16 +1253,14 @@ int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc); unsigned ifq_idx; - if (unlikely(sqe->file_index || sqe->addr2 || sqe->addr || - sqe->addr3)) + if (unlikely(sqe->addr2 || sqe->addr || sqe->addr3)) return -EINVAL; ifq_idx = READ_ONCE(sqe->zcrx_ifq_idx); - if (ifq_idx != 0) - return -EINVAL; - zc->ifq = req->ctx->ifq; + zc->ifq = xa_load(&req->ctx->zcrx_ctxs, ifq_idx); if (!zc->ifq) return -EINVAL; + zc->len = READ_ONCE(sqe->len); zc->flags = READ_ONCE(sqe->ioprio); zc->msg_flags = READ_ONCE(sqe->msg_flags); @@ -1273,7 +1327,8 @@ void io_send_zc_cleanup(struct io_kiocb *req) } #define IO_ZC_FLAGS_COMMON (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_FIXED_BUF) -#define IO_ZC_FLAGS_VALID (IO_ZC_FLAGS_COMMON | IORING_SEND_ZC_REPORT_USAGE) +#define IO_ZC_FLAGS_VALID (IO_ZC_FLAGS_COMMON | IORING_SEND_ZC_REPORT_USAGE | \ + IORING_SEND_VECTORIZED) int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -1284,7 +1339,6 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) int ret; zc->done_io = 0; - zc->retry = false; if (unlikely(READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3))) return -EINVAL; @@ -1327,8 +1381,6 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -ENOMEM; if (req->opcode == IORING_OP_SEND_ZC) { - if (zc->flags & IORING_RECVSEND_FIXED_BUF) - req->flags |= REQ_F_IMPORT_BUFFER; ret = io_send_setup(req, sqe); } else { if (unlikely(sqe->addr2 || sqe->file_index)) @@ -1453,8 +1505,7 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags) zc->len -= ret; zc->buf += ret; zc->done_io += ret; - req->flags |= REQ_F_BL_NO_RECYCLE; - return -EAGAIN; + return io_net_kbuf_recyle(req, kmsg, ret); } if (ret == -ERESTARTSYS) ret = -EINTR; @@ -1476,7 +1527,7 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags) io_req_msg_cleanup(req, 0); } io_req_set_res(req, ret, IORING_CQE_F_MORE); - return IOU_OK; + return IOU_COMPLETE; } int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags) @@ -1524,8 +1575,7 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags) if (ret > 0 && io_net_retry(sock, flags)) { sr->done_io += ret; - req->flags |= REQ_F_BL_NO_RECYCLE; - return -EAGAIN; + return io_net_kbuf_recyle(req, kmsg, ret); } if (ret == -ERESTARTSYS) ret = -EINTR; @@ -1547,7 +1597,7 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags) io_req_msg_cleanup(req, 0); } io_req_set_res(req, ret, IORING_CQE_F_MORE); - return IOU_OK; + return IOU_COMPLETE; } void io_sendrecv_fail(struct io_kiocb *req) @@ -1711,7 +1761,7 @@ int io_socket(struct io_kiocb *req, unsigned int issue_flags) sock->file_slot); } io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -1741,9 
+1791,11 @@ int io_connect(struct io_kiocb *req, unsigned int issue_flags) int ret; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; - if (unlikely(req->flags & REQ_F_FAIL)) { - ret = -ECONNRESET; - goto out; + if (connect->in_progress) { + struct poll_table_struct pt = { ._key = EPOLLERR }; + + if (vfs_poll(req->file, &pt) & EPOLLERR) + goto get_sock_err; } file_flags = force_nonblock ? O_NONBLOCK : 0; @@ -1768,8 +1820,10 @@ int io_connect(struct io_kiocb *req, unsigned int issue_flags) * which means the previous result is good. For both of these, * grab the sock_error() and use that for the completion. */ - if (ret == -EBADFD || ret == -EISCONN) + if (ret == -EBADFD || ret == -EISCONN) { +get_sock_err: ret = sock_error(sock_from_file(req->file)->sk); + } } if (ret == -ERESTARTSYS) ret = -EINTR; @@ -1778,7 +1832,7 @@ out: req_set_fail(req); io_req_msg_cleanup(req, issue_flags); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } int io_bind_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -1852,4 +1906,3 @@ void io_netmsg_cache_free(const void *entry) io_vec_free(&kmsg->vec); kfree(kmsg); } -#endif diff --git a/io_uring/nop.c b/io_uring/nop.c index 28f06285fdc2..20ed0f85b1c2 100644 --- a/io_uring/nop.c +++ b/io_uring/nop.c @@ -20,7 +20,8 @@ struct io_nop { }; #define NOP_FLAGS (IORING_NOP_INJECT_RESULT | IORING_NOP_FIXED_FILE | \ - IORING_NOP_FIXED_BUFFER | IORING_NOP_FILE) + IORING_NOP_FIXED_BUFFER | IORING_NOP_FILE | \ + IORING_NOP_TW) int io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -68,5 +69,10 @@ done: if (ret < 0) req_set_fail(req); io_req_set_res(req, nop->result, 0); - return IOU_OK; + if (nop->flags & IORING_NOP_TW) { + req->io_task_work.func = io_req_task_complete; + io_req_task_work_add(req); + return IOU_ISSUE_SKIP_COMPLETE; + } + return IOU_COMPLETE; } diff --git a/io_uring/notif.c b/io_uring/notif.c index 7bd92538dccb..9a6f6e92d742 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -112,6 +112,7 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx) if (unlikely(!io_alloc_req(ctx, &notif))) return NULL; + notif->ctx = ctx; notif->opcode = IORING_OP_NOP; notif->flags = 0; notif->file = NULL; diff --git a/io_uring/opdef.c b/io_uring/opdef.c index 489384c0438b..9568785810d9 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -216,6 +216,7 @@ const struct io_issue_def io_issue_defs[] = { }, [IORING_OP_FALLOCATE] = { .needs_file = 1, + .hash_reg_file = 1, .prep = io_fallocate_prep, .issue = io_fallocate, }, @@ -333,13 +334,13 @@ const struct io_issue_def io_issue_defs[] = { .audit_skip = 1, .iopoll = 1, .prep = io_provide_buffers_prep, - .issue = io_provide_buffers, + .issue = io_manage_buffers_legacy, }, [IORING_OP_REMOVE_BUFFERS] = { .audit_skip = 1, .iopoll = 1, .prep = io_remove_buffers_prep, - .issue = io_remove_buffers, + .issue = io_manage_buffers_legacy, }, [IORING_OP_TEE] = { .needs_file = 1, @@ -569,6 +570,10 @@ const struct io_issue_def io_issue_defs[] = { .prep = io_prep_writev_fixed, .issue = io_write, }, + [IORING_OP_PIPE] = { + .prep = io_pipe_prep, + .issue = io_pipe, + }, }; const struct io_cold_def io_cold_defs[] = { @@ -755,6 +760,7 @@ const struct io_cold_def io_cold_defs[] = { }, [IORING_OP_URING_CMD] = { .name = "URING_CMD", + .sqe_copy = io_uring_cmd_sqe_copy, .cleanup = io_uring_cmd_cleanup, }, [IORING_OP_SEND_ZC] = { @@ -815,6 +821,9 @@ const struct io_cold_def io_cold_defs[] = { .cleanup = io_readv_writev_cleanup, .fail = io_rw_fail, }, + [IORING_OP_PIPE] = { + .name = "PIPE", +
}, }; const char *io_uring_get_opcode(u8 opcode) diff --git a/io_uring/opdef.h b/io_uring/opdef.h index 719a52104abe..c2f0907ed78c 100644 --- a/io_uring/opdef.h +++ b/io_uring/opdef.h @@ -38,6 +38,7 @@ struct io_issue_def { struct io_cold_def { const char *name; + void (*sqe_copy)(struct io_kiocb *); void (*cleanup)(struct io_kiocb *); void (*fail)(struct io_kiocb *); }; diff --git a/io_uring/openclose.c b/io_uring/openclose.c index e3357dfa14ca..d70700e5cef8 100644 --- a/io_uring/openclose.c +++ b/io_uring/openclose.c @@ -6,6 +6,8 @@ #include <linux/fdtable.h> #include <linux/fsnotify.h> #include <linux/namei.h> +#include <linux/pipe_fs_i.h> +#include <linux/watch_queue.h> #include <linux/io_uring.h> #include <uapi/linux/io_uring.h> @@ -169,7 +171,7 @@ err: if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } int io_openat(struct io_kiocb *req, unsigned int issue_flags) @@ -257,7 +259,7 @@ err: if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } int io_install_fixed_fd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -300,5 +302,134 @@ int io_install_fixed_fd(struct io_kiocb *req, unsigned int issue_flags) if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; +} + +struct io_pipe { + struct file *file; + int __user *fds; + int flags; + int file_slot; + unsigned long nofile; +}; + +int io_pipe_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_pipe *p = io_kiocb_to_cmd(req, struct io_pipe); + + if (sqe->fd || sqe->off || sqe->addr3) + return -EINVAL; + + p->fds = u64_to_user_ptr(READ_ONCE(sqe->addr)); + p->flags = READ_ONCE(sqe->pipe_flags); + if (p->flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT | O_NOTIFICATION_PIPE)) + return -EINVAL; + + p->file_slot = READ_ONCE(sqe->file_index); + p->nofile = rlimit(RLIMIT_NOFILE); + return 0; +} + +static int io_pipe_fixed(struct io_kiocb *req, struct file **files, + unsigned int issue_flags) +{ + struct io_pipe *p = io_kiocb_to_cmd(req, struct io_pipe); + struct io_ring_ctx *ctx = req->ctx; + int ret, fds[2] = { -1, -1 }; + int slot = p->file_slot; + + if (p->flags & O_CLOEXEC) + return -EINVAL; + + io_ring_submit_lock(ctx, issue_flags); + + ret = __io_fixed_fd_install(ctx, files[0], slot); + if (ret < 0) + goto err; + fds[0] = ret; + files[0] = NULL; + + /* + * If a specific slot is given, next one will be used for + * the write side. 
+ */ + if (slot != IORING_FILE_INDEX_ALLOC) + slot++; + + ret = __io_fixed_fd_install(ctx, files[1], slot); + if (ret < 0) + goto err; + fds[1] = ret; + files[1] = NULL; + + io_ring_submit_unlock(ctx, issue_flags); + + if (!copy_to_user(p->fds, fds, sizeof(fds))) + return 0; + + ret = -EFAULT; + io_ring_submit_lock(ctx, issue_flags); +err: + if (fds[0] != -1) + io_fixed_fd_remove(ctx, fds[0]); + if (fds[1] != -1) + io_fixed_fd_remove(ctx, fds[1]); + io_ring_submit_unlock(ctx, issue_flags); + return ret; +} + +static int io_pipe_fd(struct io_kiocb *req, struct file **files) +{ + struct io_pipe *p = io_kiocb_to_cmd(req, struct io_pipe); + int ret, fds[2] = { -1, -1 }; + + ret = __get_unused_fd_flags(p->flags, p->nofile); + if (ret < 0) + goto err; + fds[0] = ret; + + ret = __get_unused_fd_flags(p->flags, p->nofile); + if (ret < 0) + goto err; + fds[1] = ret; + + if (!copy_to_user(p->fds, fds, sizeof(fds))) { + fd_install(fds[0], files[0]); + fd_install(fds[1], files[1]); + return 0; + } + ret = -EFAULT; +err: + if (fds[0] != -1) + put_unused_fd(fds[0]); + if (fds[1] != -1) + put_unused_fd(fds[1]); + return ret; +} + +int io_pipe(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_pipe *p = io_kiocb_to_cmd(req, struct io_pipe); + struct file *files[2]; + int ret; + + ret = create_pipe_files(files, p->flags); + if (ret) + return ret; + + if (!!p->file_slot) + ret = io_pipe_fixed(req, files, issue_flags); + else + ret = io_pipe_fd(req, files); + + io_req_set_res(req, ret, 0); + if (!ret) + return IOU_COMPLETE; + + req_set_fail(req); + if (files[0]) + fput(files[0]); + if (files[1]) + fput(files[1]); + return ret; } diff --git a/io_uring/openclose.h b/io_uring/openclose.h index 8a93c98ad0ad..4ca2a9935abc 100644 --- a/io_uring/openclose.h +++ b/io_uring/openclose.h @@ -13,5 +13,8 @@ int io_openat2(struct io_kiocb *req, unsigned int issue_flags); int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_close(struct io_kiocb *req, unsigned int issue_flags); +int io_pipe_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_pipe(struct io_kiocb *req, unsigned int issue_flags); + int io_install_fixed_fd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_install_fixed_fd(struct io_kiocb *req, unsigned int issue_flags); diff --git a/io_uring/poll.c b/io_uring/poll.c index 8eb744eb9f4c..c786e587563b 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -273,8 +273,6 @@ static int io_poll_check_events(struct io_kiocb *req, io_tw_token_t tw) return IOU_POLL_REISSUE; } } - if (unlikely(req->cqe.res & EPOLLERR)) - req_set_fail(req); if (req->apoll_events & EPOLLONESHOT) return IOU_POLL_DONE; @@ -669,33 +667,18 @@ static struct async_poll *io_req_alloc_apoll(struct io_kiocb *req, return apoll; } -int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) +int io_arm_apoll(struct io_kiocb *req, unsigned issue_flags, __poll_t mask) { - const struct io_issue_def *def = &io_issue_defs[req->opcode]; struct async_poll *apoll; struct io_poll_table ipt; - __poll_t mask = POLLPRI | POLLERR | EPOLLET; int ret; - if (!def->pollin && !def->pollout) - return IO_APOLL_ABORTED; + mask |= EPOLLET; if (!io_file_can_poll(req)) return IO_APOLL_ABORTED; if (!(req->flags & REQ_F_APOLL_MULTISHOT)) mask |= EPOLLONESHOT; - if (def->pollin) { - mask |= EPOLLIN | EPOLLRDNORM; - - /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */ - if (req->flags & REQ_F_CLEAR_POLLIN) - mask &= ~EPOLLIN; - } else { - mask |= EPOLLOUT | EPOLLWRNORM; - } - if 
(def->poll_exclusive) - mask |= EPOLLEXCLUSIVE; - apoll = io_req_alloc_apoll(req, issue_flags); if (!apoll) return IO_APOLL_ABORTED; @@ -712,6 +695,31 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) return IO_APOLL_OK; } +int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) +{ + const struct io_issue_def *def = &io_issue_defs[req->opcode]; + __poll_t mask = POLLPRI | POLLERR; + + if (!def->pollin && !def->pollout) + return IO_APOLL_ABORTED; + if (!io_file_can_poll(req)) + return IO_APOLL_ABORTED; + + if (def->pollin) { + mask |= EPOLLIN | EPOLLRDNORM; + + /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */ + if (req->flags & REQ_F_CLEAR_POLLIN) + mask &= ~EPOLLIN; + } else { + mask |= EPOLLOUT | EPOLLWRNORM; + } + if (def->poll_exclusive) + mask |= EPOLLEXCLUSIVE; + + return io_arm_apoll(req, issue_flags, mask); +} + /* * Returns true if we found and killed one or more poll requests */ @@ -893,7 +901,7 @@ int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) ret = __io_arm_poll_handler(req, poll, &ipt, poll->events, issue_flags); if (ret > 0) { io_req_set_res(req, ipt.result_mask, 0); - return IOU_OK; + return IOU_COMPLETE; } return ret ?: IOU_ISSUE_SKIP_COMPLETE; } @@ -948,5 +956,5 @@ out: } /* complete update request, we're done with it */ io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } diff --git a/io_uring/poll.h b/io_uring/poll.h index 27e2db2ed4ae..c8438286dfa0 100644 --- a/io_uring/poll.h +++ b/io_uring/poll.h @@ -41,6 +41,7 @@ int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags); struct io_cancel_data; int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, unsigned issue_flags); +int io_arm_apoll(struct io_kiocb *req, unsigned issue_flags, __poll_t mask); int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags); bool io_poll_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx, bool cancel_all); diff --git a/io_uring/register.c b/io_uring/register.c index cc23a4c205cd..a59589249fce 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -273,6 +273,8 @@ static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx, if (ctx->flags & IORING_SETUP_SQPOLL) { sqd = ctx->sq_data; if (sqd) { + struct task_struct *tsk; + /* * Observe the correct sqd->lock -> ctx->uring_lock * ordering. 
Fine to drop uring_lock here, we hold @@ -282,8 +284,9 @@ static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx, mutex_unlock(&ctx->uring_lock); mutex_lock(&sqd->lock); mutex_lock(&ctx->uring_lock); - if (sqd->thread) - tctx = sqd->thread->io_uring; + tsk = sqpoll_task_locked(sqd); + if (tsk) + tctx = tsk->io_uring; } } else { tctx = current->io_uring; diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index f80a77c4973f..f75f5e43fa4a 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -55,7 +55,7 @@ int __io_account_mem(struct user_struct *user, unsigned long nr_pages) return 0; } -static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) +void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) { if (ctx->user) __io_unaccount_mem(ctx->user, nr_pages); @@ -64,7 +64,7 @@ static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) atomic64_sub(nr_pages, &ctx->mm_account->pinned_vm); } -static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) +int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) { int ret; @@ -80,10 +80,21 @@ static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) return 0; } -int io_buffer_validate(struct iovec *iov) +int io_validate_user_buf_range(u64 uaddr, u64 ulen) { - unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1); + unsigned long tmp, base = (unsigned long)uaddr; + unsigned long acct_len = (unsigned long)PAGE_ALIGN(ulen); + /* arbitrary limit, but we need something */ + if (ulen > SZ_1G || !ulen) + return -EFAULT; + if (check_add_overflow(base, acct_len, &tmp)) + return -EOVERFLOW; + return 0; +} + +static int io_buffer_validate(struct iovec *iov) +{ /* * Don't impose further limits on the size and buffer * constraints here, we'll -EINVAL later when IO is @@ -91,17 +102,9 @@ int io_buffer_validate(struct iovec *iov) */ if (!iov->iov_base) return iov->iov_len ? 
-EFAULT : 0; - if (!iov->iov_len) - return -EFAULT; - /* arbitrary limit, but we need something */ - if (iov->iov_len > SZ_1G) - return -EFAULT; - - if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp)) - return -EOVERFLOW; - - return 0; + return io_validate_user_buf_range((unsigned long)iov->iov_base, + iov->iov_len); } static void io_release_ubuf(void *priv) @@ -109,8 +112,11 @@ static void io_release_ubuf(void *priv) struct io_mapped_ubuf *imu = priv; unsigned int i; - for (i = 0; i < imu->nr_bvecs; i++) - unpin_user_page(imu->bvec[i].bv_page); + for (i = 0; i < imu->nr_bvecs; i++) { + struct folio *folio = page_folio(imu->bvec[i].bv_page); + + unpin_user_folio(folio, 1); + } } static struct io_mapped_ubuf *io_alloc_imu(struct io_ring_ctx *ctx, @@ -132,8 +138,10 @@ static void io_free_imu(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu) static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu) { - if (!refcount_dec_and_test(&imu->refs)) - return; + if (unlikely(refcount_read(&imu->refs) > 1)) { + if (!refcount_dec_and_test(&imu->refs)) + return; + } if (imu->acct_pages) io_unaccount_mem(ctx, imu->acct_pages); @@ -497,7 +505,7 @@ int io_files_update(struct io_kiocb *req, unsigned int issue_flags) if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node) @@ -685,38 +693,34 @@ static bool io_coalesce_buffer(struct page ***pages, int *nr_pages, struct io_imu_folio_data *data) { struct page **page_array = *pages, **new_array = NULL; - int nr_pages_left = *nr_pages, i, j; - int nr_folios = data->nr_folios; + unsigned nr_pages_left = *nr_pages; + unsigned nr_folios = data->nr_folios; + unsigned i, j; /* Store head pages only*/ - new_array = kvmalloc_array(nr_folios, sizeof(struct page *), - GFP_KERNEL); + new_array = kvmalloc_array(nr_folios, sizeof(struct page *), GFP_KERNEL); if (!new_array) return false; - new_array[0] = compound_head(page_array[0]); - /* - * The pages are bound to the folio, it doesn't - * actually unpin them but drops all but one reference, - * which is usually put down by io_buffer_unmap(). - * Note, needs a better helper. - */ - if (data->nr_pages_head > 1) - unpin_user_pages(&page_array[1], data->nr_pages_head - 1); - - j = data->nr_pages_head; - nr_pages_left -= data->nr_pages_head; - for (i = 1; i < nr_folios; i++) { - unsigned int nr_unpin; - - new_array[i] = page_array[j]; - nr_unpin = min_t(unsigned int, nr_pages_left - 1, - data->nr_pages_mid - 1); - if (nr_unpin) - unpin_user_pages(&page_array[j+1], nr_unpin); - j += data->nr_pages_mid; - nr_pages_left -= data->nr_pages_mid; + for (i = 0, j = 0; i < nr_folios; i++) { + struct page *p = compound_head(page_array[j]); + struct folio *folio = page_folio(p); + unsigned int nr; + + WARN_ON_ONCE(i > 0 && p != page_array[j]); + + nr = i ? data->nr_pages_mid : data->nr_pages_head; + nr = min(nr, nr_pages_left); + /* Drop all but one ref, the entire folio will remain pinned. 
*/ + if (nr > 1) + unpin_user_folio(folio, nr - 1); + j += nr; + nr_pages_left -= nr; + new_array[i] = p; } + + WARN_ON_ONCE(j != *nr_pages); + kvfree(page_array); *pages = new_array; *nr_pages = nr_folios; @@ -732,6 +736,7 @@ bool io_check_coalesce_buffer(struct page **page_array, int nr_pages, data->nr_pages_mid = folio_nr_pages(folio); data->folio_shift = folio_shift(folio); + data->first_folio_page_idx = folio_page_idx(folio, page_array[0]); /* * Check if pages are contiguous inside a folio, and all folios have @@ -810,10 +815,8 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, imu->nr_bvecs = nr_pages; ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage); - if (ret) { - unpin_user_pages(pages, nr_pages); + if (ret) goto done; - } size = iov->iov_len; /* store original address for later verification */ @@ -827,7 +830,11 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, if (coalesced) imu->folio_shift = data.folio_shift; refcount_set(&imu->refs, 1); - off = (unsigned long) iov->iov_base & ((1UL << imu->folio_shift) - 1); + + off = (unsigned long)iov->iov_base & ~PAGE_MASK; + if (coalesced) + off += data.first_folio_page_idx << PAGE_SHIFT; + node->buf = imu; ret = 0; @@ -843,6 +850,10 @@ done: if (ret) { if (imu) io_free_imu(ctx, imu); + if (pages) { + for (i = 0; i < nr_pages; i++) + unpin_user_folio(page_folio(pages[i]), 1); + } io_cache_free(&ctx->node_cache, node); node = ERR_PTR(ret); } @@ -1062,8 +1073,6 @@ static int io_import_fixed(int ddir, struct iov_iter *iter, size_t offset; int ret; - if (WARN_ON_ONCE(!imu)) - return -EFAULT; ret = validate_fixed_range(buf_addr, len, imu); if (unlikely(ret)) return ret; @@ -1110,13 +1119,19 @@ inline struct io_rsrc_node *io_find_buf_node(struct io_kiocb *req, if (req->flags & REQ_F_BUF_NODE) return req->buf_node; + req->flags |= REQ_F_BUF_NODE; io_ring_submit_lock(ctx, issue_flags); node = io_rsrc_node_lookup(&ctx->buf_table, req->buf_index); - if (node) - io_req_assign_buf_node(req, node); + if (node) { + node->refs++; + req->buf_node = node; + io_ring_submit_unlock(ctx, issue_flags); + return node; + } + req->flags &= ~REQ_F_BUF_NODE; io_ring_submit_unlock(ctx, issue_flags); - return node; + return NULL; } int io_import_reg_buf(struct io_kiocb *req, struct iov_iter *iter, @@ -1174,6 +1189,8 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx return -EINVAL; if (check_add_overflow(arg->nr, arg->dst_off, &nbufs)) return -EOVERFLOW; + if (nbufs > IORING_MAX_REG_BUFFERS) + return -EINVAL; ret = io_rsrc_data_alloc(&data, max(nbufs, ctx->buf_table.nr)); if (ret) @@ -1324,7 +1341,6 @@ static int io_vec_fill_bvec(int ddir, struct iov_iter *iter, { unsigned long folio_size = 1 << imu->folio_shift; unsigned long folio_mask = folio_size - 1; - u64 folio_addr = imu->ubuf & ~folio_mask; struct bio_vec *res_bvec = vec->bvec; size_t total_len = 0; unsigned bvec_idx = 0; @@ -1346,8 +1362,13 @@ static int io_vec_fill_bvec(int ddir, struct iov_iter *iter, if (unlikely(check_add_overflow(total_len, iov_len, &total_len))) return -EOVERFLOW; - /* by using folio address it also accounts for bvec offset */ - offset = buf_addr - folio_addr; + offset = buf_addr - imu->ubuf; + /* + * Only the first bvec can have non zero bv_offset, account it + * here and work with full folios below. 
+ */ + offset += imu->bvec[0].bv_offset; + src_bvec = imu->bvec + (offset >> imu->folio_shift); offset &= folio_mask; diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index b52242852ff3..a3ca6ba66596 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -49,6 +49,7 @@ struct io_imu_folio_data { unsigned int nr_pages_mid; unsigned int folio_shift; unsigned int nr_folios; + unsigned long first_folio_page_idx; }; bool io_rsrc_cache_init(struct io_ring_ctx *ctx); @@ -83,7 +84,7 @@ int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, unsigned size, unsigned type); int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, unsigned int size, unsigned int type); -int io_buffer_validate(struct iovec *iov); +int io_validate_user_buf_range(u64 uaddr, u64 ulen); bool io_check_coalesce_buffer(struct page **page_array, int nr_pages, struct io_imu_folio_data *data); @@ -115,36 +116,12 @@ static inline bool io_reset_rsrc_node(struct io_ring_ctx *ctx, return true; } -static inline void io_req_put_rsrc_nodes(struct io_kiocb *req) -{ - if (req->file_node) { - io_put_rsrc_node(req->ctx, req->file_node); - req->file_node = NULL; - } - if (req->flags & REQ_F_BUF_NODE) { - io_put_rsrc_node(req->ctx, req->buf_node); - req->buf_node = NULL; - } -} - -static inline void io_req_assign_rsrc_node(struct io_rsrc_node **dst_node, - struct io_rsrc_node *node) -{ - node->refs++; - *dst_node = node; -} - -static inline void io_req_assign_buf_node(struct io_kiocb *req, - struct io_rsrc_node *node) -{ - io_req_assign_rsrc_node(&req->buf_node, node); - req->flags |= REQ_F_BUF_NODE; -} - int io_files_update(struct io_kiocb *req, unsigned int issue_flags); int io_files_update_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int __io_account_mem(struct user_struct *user, unsigned long nr_pages); +int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages); +void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages); static inline void __io_unaccount_mem(struct user_struct *user, unsigned long nr_pages) diff --git a/io_uring/rw.c b/io_uring/rw.c index 039e063f7091..52a5b950b2e5 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -119,7 +119,7 @@ static int __io_import_rw_buffer(int ddir, struct io_kiocb *req, return io_import_vec(ddir, req, io, buf, sqe_len); if (io_do_buffer_select(req)) { - buf = io_buffer_select(req, &sqe_len, issue_flags); + buf = io_buffer_select(req, &sqe_len, io->buf_group, issue_flags); if (!buf) return -ENOBUFS; rw->addr = (unsigned long) buf; @@ -253,16 +253,19 @@ static int __io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, int ddir) { struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); + struct io_async_rw *io; unsigned ioprio; u64 attr_type_mask; int ret; if (io_rw_alloc_async(req)) return -ENOMEM; + io = req->async_data; rw->kiocb.ki_pos = READ_ONCE(sqe->off); /* used for fixed read/write too - just read unconditionally */ req->buf_index = READ_ONCE(sqe->buf_index); + io->buf_group = req->buf_index; ioprio = READ_ONCE(sqe->ioprio); if (ioprio) { @@ -276,6 +279,7 @@ static int __io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, } rw->kiocb.dio_complete = NULL; rw->kiocb.ki_flags = 0; + rw->kiocb.ki_write_stream = READ_ONCE(sqe->write_stream); if (req->ctx->flags & IORING_SETUP_IOPOLL) rw->kiocb.ki_complete = io_complete_rw_iopoll; @@ -284,7 +288,7 @@ static int __io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, rw->addr = READ_ONCE(sqe->addr); rw->len = READ_ONCE(sqe->len); - rw->flags = 
READ_ONCE(sqe->rw_flags); + rw->flags = (__force rwf_t) READ_ONCE(sqe->rw_flags); attr_type_mask = READ_ONCE(sqe->attr_type_mask); if (attr_type_mask) { @@ -657,7 +661,7 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret, io_req_io_end(req); io_req_set_res(req, final_ret, io_put_kbuf(req, ret, issue_flags)); io_req_rw_cleanup(req, issue_flags); - return IOU_OK; + return IOU_COMPLETE; } else { io_rw_done(req, ret); } diff --git a/io_uring/rw.h b/io_uring/rw.h index 81d6d9a8cf69..129a53fe5482 100644 --- a/io_uring/rw.h +++ b/io_uring/rw.h @@ -16,6 +16,8 @@ struct io_async_rw { struct iov_iter iter; struct iov_iter_state iter_state; struct iovec fast_iov; + unsigned buf_group; + /* * wpq is for buffered io, while meta fields are used with * direct io diff --git a/io_uring/splice.c b/io_uring/splice.c index 7b89bd84d486..35ce4e60b495 100644 --- a/io_uring/splice.c +++ b/io_uring/splice.c @@ -103,7 +103,7 @@ done: if (ret != sp->len) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -144,5 +144,5 @@ done: if (ret != sp->len) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c index 03c699493b5a..a3f11349ce06 100644 --- a/io_uring/sqpoll.c +++ b/io_uring/sqpoll.c @@ -16,6 +16,7 @@ #include <uapi/linux/io_uring.h> #include "io_uring.h" +#include "tctx.h" #include "napi.h" #include "sqpoll.h" @@ -30,7 +31,7 @@ enum { void io_sq_thread_unpark(struct io_sq_data *sqd) __releases(&sqd->lock) { - WARN_ON_ONCE(sqd->thread == current); + WARN_ON_ONCE(sqpoll_task_locked(sqd) == current); /* * Do the dance but not conditional clear_bit() because it'd race with @@ -46,24 +47,32 @@ void io_sq_thread_unpark(struct io_sq_data *sqd) void io_sq_thread_park(struct io_sq_data *sqd) __acquires(&sqd->lock) { - WARN_ON_ONCE(data_race(sqd->thread) == current); + struct task_struct *tsk; atomic_inc(&sqd->park_pending); set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); mutex_lock(&sqd->lock); - if (sqd->thread) - wake_up_process(sqd->thread); + + tsk = sqpoll_task_locked(sqd); + if (tsk) { + WARN_ON_ONCE(tsk == current); + wake_up_process(tsk); + } } void io_sq_thread_stop(struct io_sq_data *sqd) { - WARN_ON_ONCE(sqd->thread == current); + struct task_struct *tsk; + WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state)); set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); mutex_lock(&sqd->lock); - if (sqd->thread) - wake_up_process(sqd->thread); + tsk = sqpoll_task_locked(sqd); + if (tsk) { + WARN_ON_ONCE(tsk == current); + wake_up_process(tsk); + } mutex_unlock(&sqd->lock); wait_for_completion(&sqd->exited); } @@ -270,7 +279,8 @@ static int io_sq_thread(void *data) /* offload context creation failed, just exit */ if (!current->io_uring) { mutex_lock(&sqd->lock); - sqd->thread = NULL; + rcu_assign_pointer(sqd->thread, NULL); + put_task_struct(current); mutex_unlock(&sqd->lock); goto err_out; } @@ -379,7 +389,8 @@ static int io_sq_thread(void *data) io_sq_tw(&retry_list, UINT_MAX); io_uring_cancel_generic(true, sqd); - sqd->thread = NULL; + rcu_assign_pointer(sqd->thread, NULL); + put_task_struct(current); list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) atomic_or(IORING_SQ_NEED_WAKEUP, &ctx->rings->sq_flags); io_run_task_work(); @@ -409,7 +420,6 @@ void io_sqpoll_wait_sq(struct io_ring_ctx *ctx) __cold int io_sq_offload_create(struct io_ring_ctx *ctx, struct io_uring_params *p) { - struct task_struct 
*task_to_put = NULL; int ret; /* Retain compatibility with failing for an invalid attach attempt */ @@ -484,8 +494,11 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx, goto err_sqpoll; } - sqd->thread = tsk; - task_to_put = get_task_struct(tsk); + mutex_lock(&sqd->lock); + rcu_assign_pointer(sqd->thread, tsk); + mutex_unlock(&sqd->lock); + + get_task_struct(tsk); ret = io_uring_alloc_task_context(tsk, ctx); wake_up_new_task(tsk); if (ret) @@ -495,16 +508,11 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx, ret = -EINVAL; goto err; } - - if (task_to_put) - put_task_struct(task_to_put); return 0; err_sqpoll: complete(&ctx->sq_data->exited); err: io_sq_thread_finish(ctx); - if (task_to_put) - put_task_struct(task_to_put); return ret; } @@ -515,10 +523,13 @@ __cold int io_sqpoll_wq_cpu_affinity(struct io_ring_ctx *ctx, int ret = -EINVAL; if (sqd) { + struct task_struct *tsk; + io_sq_thread_park(sqd); /* Don't set affinity for a dying thread */ - if (sqd->thread) - ret = io_wq_cpu_affinity(sqd->thread->io_uring, mask); + tsk = sqpoll_task_locked(sqd); + if (tsk) + ret = io_wq_cpu_affinity(tsk->io_uring, mask); io_sq_thread_unpark(sqd); } diff --git a/io_uring/sqpoll.h b/io_uring/sqpoll.h index 4171666b1cf4..b83dcdec9765 100644 --- a/io_uring/sqpoll.h +++ b/io_uring/sqpoll.h @@ -8,7 +8,7 @@ struct io_sq_data { /* ctx's that are using this sqd */ struct list_head ctx_list; - struct task_struct *thread; + struct task_struct __rcu *thread; struct wait_queue_head wait; unsigned sq_thread_idle; @@ -29,3 +29,9 @@ void io_sq_thread_unpark(struct io_sq_data *sqd); void io_put_sq_data(struct io_sq_data *sqd); void io_sqpoll_wait_sq(struct io_ring_ctx *ctx); int io_sqpoll_wq_cpu_affinity(struct io_ring_ctx *ctx, cpumask_var_t mask); + +static inline struct task_struct *sqpoll_task_locked(struct io_sq_data *sqd) +{ + return rcu_dereference_protected(sqd->thread, + lockdep_is_held(&sqd->lock)); +} diff --git a/io_uring/statx.c b/io_uring/statx.c index 6bc4651700a2..5111e9befbfe 100644 --- a/io_uring/statx.c +++ b/io_uring/statx.c @@ -59,7 +59,7 @@ int io_statx(struct io_kiocb *req, unsigned int issue_flags) ret = do_statx(sx->dfd, sx->filename, sx->flags, sx->mask, sx->buffer); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } void io_statx_cleanup(struct io_kiocb *req) diff --git a/io_uring/sync.c b/io_uring/sync.c index 255f68c37e55..cea2d381ffd2 100644 --- a/io_uring/sync.c +++ b/io_uring/sync.c @@ -47,7 +47,7 @@ int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags) ret = sync_file_range(req->file, sync->off, sync->len, sync->flags); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -79,7 +79,7 @@ int io_fsync(struct io_kiocb *req, unsigned int issue_flags) ret = vfs_fsync_range(req->file, sync->off, end > 0 ? 
end : LLONG_MAX, sync->flags & IORING_FSYNC_DATASYNC); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } int io_fallocate_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -108,5 +108,5 @@ int io_fallocate(struct io_kiocb *req, unsigned int issue_flags) if (ret >= 0) fsnotify_modify(req->file); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } diff --git a/io_uring/tctx.c b/io_uring/tctx.c index adc6e42c14df..5b66755579c0 100644 --- a/io_uring/tctx.c +++ b/io_uring/tctx.c @@ -35,8 +35,6 @@ static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx, data.hash = hash; data.task = task; - data.free_work = io_wq_free_work; - data.do_work = io_wq_submit_work; /* Do QD, or 4 * CPUS, whatever is smallest */ concurrency = min(ctx->sq_entries, 4 * num_online_cpus()); diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 2a107665230b..7f13bfa9f2b6 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -35,6 +35,9 @@ struct io_timeout_rem { bool ltimeout; }; +static struct io_kiocb *__io_disarm_linked_timeout(struct io_kiocb *req, + struct io_kiocb *link); + static inline bool io_is_timeout_noseq(struct io_kiocb *req) { struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); @@ -218,7 +221,9 @@ void io_disarm_next(struct io_kiocb *req) struct io_ring_ctx *ctx = req->ctx; raw_spin_lock_irq(&ctx->timeout_lock); - link = io_disarm_linked_timeout(req); + if (req->link && req->link->opcode == IORING_OP_LINK_TIMEOUT) + link = __io_disarm_linked_timeout(req, req->link); + raw_spin_unlock_irq(&ctx->timeout_lock); if (link) io_req_queue_tw_complete(link, -ECANCELED); @@ -228,8 +233,8 @@ void io_disarm_next(struct io_kiocb *req) io_fail_links(req); } -struct io_kiocb *__io_disarm_linked_timeout(struct io_kiocb *req, - struct io_kiocb *link) +static struct io_kiocb *__io_disarm_linked_timeout(struct io_kiocb *req, + struct io_kiocb *link) __must_hold(&req->ctx->completion_lock) __must_hold(&req->ctx->timeout_lock) { @@ -500,7 +505,7 @@ int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags) if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } static int __io_timeout_prep(struct io_kiocb *req, diff --git a/io_uring/timeout.h b/io_uring/timeout.h index e91b32448dcf..2b7c9ad72992 100644 --- a/io_uring/timeout.h +++ b/io_uring/timeout.h @@ -8,19 +8,6 @@ struct io_timeout_data { u32 flags; }; -struct io_kiocb *__io_disarm_linked_timeout(struct io_kiocb *req, - struct io_kiocb *link); - -static inline struct io_kiocb *io_disarm_linked_timeout(struct io_kiocb *req) -{ - struct io_kiocb *link = req->link; - - if (link && link->opcode == IORING_OP_LINK_TIMEOUT) - return __io_disarm_linked_timeout(req, link); - - return NULL; -} - __cold void io_flush_timeouts(struct io_ring_ctx *ctx); struct io_cancel_data; int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd); diff --git a/io_uring/truncate.c b/io_uring/truncate.c index 62ee73d34d72..487baf23b44e 100644 --- a/io_uring/truncate.c +++ b/io_uring/truncate.c @@ -44,5 +44,5 @@ int io_ftruncate(struct io_kiocb *req, unsigned int issue_flags) ret = do_ftruncate(req->file, ft->len, 1); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 430ed620ddfe..053bac89b6c0 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -3,18 +3,16 @@ #include <linux/errno.h> #include <linux/file.h> #include <linux/io_uring/cmd.h> -#include 
<linux/io_uring/net.h> #include <linux/security.h> #include <linux/nospec.h> -#include <net/sock.h> #include <uapi/linux/io_uring.h> -#include <asm/ioctls.h> #include "io_uring.h" #include "alloc_cache.h" #include "rsrc.h" #include "uring_cmd.h" +#include "poll.h" void io_cmd_cache_free(const void *entry) { @@ -28,12 +26,6 @@ static void io_req_uring_cleanup(struct io_kiocb *req, unsigned int issue_flags) { struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); struct io_async_cmd *ac = req->async_data; - struct io_uring_cmd_data *cache = &ac->data; - - if (cache->op_data) { - kfree(cache->op_data); - cache->op_data = NULL; - } if (issue_flags & IO_URING_F_UNLOCKED) return; @@ -42,7 +34,7 @@ static void io_req_uring_cleanup(struct io_kiocb *req, unsigned int issue_flags) if (ac->vec.nr > IO_VEC_CACHE_SOFT_CAP) io_vec_free(&ac->vec); - if (io_alloc_cache_put(&req->ctx->cmd_cache, cache)) { + if (io_alloc_cache_put(&req->ctx->cmd_cache, ac)) { ioucmd->sqe = NULL; req->async_data = NULL; req->flags &= ~(REQ_F_ASYNC_DATA|REQ_F_NEED_CLEANUP); @@ -139,6 +131,9 @@ void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, { struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); + if (WARN_ON_ONCE(req->flags & REQ_F_APOLL_MULTISHOT)) + return; + ioucmd->task_work_cb = task_work_cb; req->io_task_work.func = io_uring_cmd_work; __io_req_task_work_add(req, flags); @@ -161,6 +156,9 @@ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, u64 res2, { struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); + if (WARN_ON_ONCE(req->flags & REQ_F_APOLL_MULTISHOT)) + return; + io_uring_cmd_del_cancelable(ioucmd, issue_flags); if (ret < 0) @@ -184,35 +182,10 @@ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, u64 res2, } EXPORT_SYMBOL_GPL(io_uring_cmd_done); -static int io_uring_cmd_prep_setup(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); - struct io_async_cmd *ac; - - /* see io_uring_cmd_get_async_data() */ - BUILD_BUG_ON(offsetof(struct io_async_cmd, data) != 0); - - ac = io_uring_alloc_async_data(&req->ctx->cmd_cache, req); - if (!ac) - return -ENOMEM; - ac->data.op_data = NULL; - - /* - * Unconditionally cache the SQE for now - this is only needed for - * requests that go async, but prep handlers must ensure that any - * sqe data is stable beyond prep. Since uring_cmd is special in - * that it doesn't read in per-op data, play it safe and ensure that - * any SQE data is stable beyond prep. This can later get relaxed. 
- */ - memcpy(ac->sqes, sqe, uring_sqe_size(req->ctx)); - ioucmd->sqe = ac->sqes; - return 0; -} - int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); + struct io_async_cmd *ac; if (sqe->__pad1) return -EINVAL; @@ -226,7 +199,23 @@ int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) ioucmd->cmd_op = READ_ONCE(sqe->cmd_op); - return io_uring_cmd_prep_setup(req, sqe); + ac = io_uring_alloc_async_data(&req->ctx->cmd_cache, req); + if (!ac) + return -ENOMEM; + ioucmd->sqe = sqe; + return 0; +} + +void io_uring_cmd_sqe_copy(struct io_kiocb *req) +{ + struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); + struct io_async_cmd *ac = req->async_data; + + /* Should not happen, as REQ_F_SQE_COPIED covers this */ + if (WARN_ON_ONCE(ioucmd->sqe == ac->sqes)) + return; + memcpy(ac->sqes, ioucmd->sqe, uring_sqe_size(req->ctx)); + ioucmd->sqe = ac->sqes; } int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) @@ -262,13 +251,17 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) } ret = file->f_op->uring_cmd(ioucmd, issue_flags); - if (ret == -EAGAIN || ret == -EIOCBQUEUED) + if (ret == -EAGAIN) { + ioucmd->flags |= IORING_URING_CMD_REISSUE; + return ret; + } + if (ret == -EIOCBQUEUED) return ret; if (ret < 0) req_set_fail(req); io_req_uring_cleanup(req, issue_flags); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, @@ -278,6 +271,9 @@ int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, { struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); + if (WARN_ON_ONCE(!(ioucmd->flags & IORING_URING_CMD_FIXED))) + return -EINVAL; + return io_import_reg_buf(req, iter, ubuf, len, rw, issue_flags); } EXPORT_SYMBOL_GPL(io_uring_cmd_import_fixed); @@ -292,6 +288,9 @@ int io_uring_cmd_import_fixed_vec(struct io_uring_cmd *ioucmd, struct io_async_cmd *ac = req->async_data; int ret; + if (WARN_ON_ONCE(!(ioucmd->flags & IORING_URING_CMD_FIXED))) + return -EINVAL; + ret = io_prep_reg_iovec(req, &ac->vec, uvec, uvec_segs); if (ret) return ret; @@ -308,82 +307,29 @@ void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd) io_req_queue_iowq(req); } -static inline int io_uring_cmd_getsockopt(struct socket *sock, - struct io_uring_cmd *cmd, - unsigned int issue_flags) +int io_cmd_poll_multishot(struct io_uring_cmd *cmd, + unsigned int issue_flags, __poll_t mask) { - const struct io_uring_sqe *sqe = cmd->sqe; - bool compat = !!(issue_flags & IO_URING_F_COMPAT); - int optlen, optname, level, err; - void __user *optval; - - level = READ_ONCE(sqe->level); - if (level != SOL_SOCKET) - return -EOPNOTSUPP; - - optval = u64_to_user_ptr(READ_ONCE(sqe->optval)); - optname = READ_ONCE(sqe->optname); - optlen = READ_ONCE(sqe->optlen); + struct io_kiocb *req = cmd_to_io_kiocb(cmd); + int ret; - err = do_sock_getsockopt(sock, compat, level, optname, - USER_SOCKPTR(optval), - KERNEL_SOCKPTR(&optlen)); - if (err) - return err; + if (likely(req->flags & REQ_F_APOLL_MULTISHOT)) + return 0; - /* On success, return optlen */ - return optlen; -} + req->flags |= REQ_F_APOLL_MULTISHOT; + mask &= ~EPOLLONESHOT; -static inline int io_uring_cmd_setsockopt(struct socket *sock, - struct io_uring_cmd *cmd, - unsigned int issue_flags) -{ - const struct io_uring_sqe *sqe = cmd->sqe; - bool compat = !!(issue_flags & IO_URING_F_COMPAT); - int optname, optlen, level; - void __user *optval; - 
sockptr_t optval_s; - - optval = u64_to_user_ptr(READ_ONCE(sqe->optval)); - optname = READ_ONCE(sqe->optname); - optlen = READ_ONCE(sqe->optlen); - level = READ_ONCE(sqe->level); - optval_s = USER_SOCKPTR(optval); - - return do_sock_setsockopt(sock, compat, level, optname, optval_s, - optlen); + ret = io_arm_apoll(req, issue_flags, mask); + return ret == IO_APOLL_OK ? -EIOCBQUEUED : -ECANCELED; } -#if defined(CONFIG_NET) -int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags) +bool io_uring_cmd_post_mshot_cqe32(struct io_uring_cmd *cmd, + unsigned int issue_flags, + struct io_uring_cqe cqe[2]) { - struct socket *sock = cmd->file->private_data; - struct sock *sk = sock->sk; - struct proto *prot = READ_ONCE(sk->sk_prot); - int ret, arg = 0; - - if (!prot || !prot->ioctl) - return -EOPNOTSUPP; + struct io_kiocb *req = cmd_to_io_kiocb(cmd); - switch (cmd->cmd_op) { - case SOCKET_URING_OP_SIOCINQ: - ret = prot->ioctl(sk, SIOCINQ, &arg); - if (ret) - return ret; - return arg; - case SOCKET_URING_OP_SIOCOUTQ: - ret = prot->ioctl(sk, SIOCOUTQ, &arg); - if (ret) - return ret; - return arg; - case SOCKET_URING_OP_GETSOCKOPT: - return io_uring_cmd_getsockopt(sock, cmd, issue_flags); - case SOCKET_URING_OP_SETSOCKOPT: - return io_uring_cmd_setsockopt(sock, cmd, issue_flags); - default: - return -EOPNOTSUPP; - } + if (WARN_ON_ONCE(!(issue_flags & IO_URING_F_MULTISHOT))) + return false; + return io_req_post_cqe32(req, cqe); } -EXPORT_SYMBOL_GPL(io_uring_cmd_sock); -#endif diff --git a/io_uring/uring_cmd.h b/io_uring/uring_cmd.h index e6a5142c890e..041aef8a8aa3 100644 --- a/io_uring/uring_cmd.h +++ b/io_uring/uring_cmd.h @@ -4,16 +4,23 @@ #include <linux/io_uring_types.h> struct io_async_cmd { - struct io_uring_cmd_data data; struct iou_vec vec; struct io_uring_sqe sqes[2]; }; int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags); int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +void io_uring_cmd_sqe_copy(struct io_kiocb *req); void io_uring_cmd_cleanup(struct io_kiocb *req); bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx, struct io_uring_task *tctx, bool cancel_all); +bool io_uring_cmd_post_mshot_cqe32(struct io_uring_cmd *cmd, + unsigned int issue_flags, + struct io_uring_cqe cqe[2]); + void io_cmd_cache_free(const void *entry); + +int io_cmd_poll_multishot(struct io_uring_cmd *cmd, + unsigned int issue_flags, __poll_t mask); diff --git a/io_uring/waitid.c b/io_uring/waitid.c index 54e69984cd8a..e07a94694397 100644 --- a/io_uring/waitid.c +++ b/io_uring/waitid.c @@ -323,5 +323,5 @@ done: if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } diff --git a/io_uring/xattr.c b/io_uring/xattr.c index de5064fcae8a..322b94ff9e4b 100644 --- a/io_uring/xattr.c +++ b/io_uring/xattr.c @@ -109,7 +109,7 @@ int io_fgetxattr(struct io_kiocb *req, unsigned int issue_flags) ret = file_getxattr(req->file, &ix->ctx); io_xattr_finish(req, ret); - return IOU_OK; + return IOU_COMPLETE; } int io_getxattr(struct io_kiocb *req, unsigned int issue_flags) @@ -122,7 +122,7 @@ int io_getxattr(struct io_kiocb *req, unsigned int issue_flags) ret = filename_getxattr(AT_FDCWD, ix->filename, LOOKUP_FOLLOW, &ix->ctx); ix->filename = NULL; io_xattr_finish(req, ret); - return IOU_OK; + return IOU_COMPLETE; } static int __io_setxattr_prep(struct io_kiocb *req, @@ -190,7 +190,7 @@ int io_fsetxattr(struct io_kiocb *req, unsigned int issue_flags) ret = file_setxattr(req->file, &ix->ctx); io_xattr_finish(req, ret); - return 
IOU_OK; + return IOU_COMPLETE; } int io_setxattr(struct io_kiocb *req, unsigned int issue_flags) @@ -203,5 +203,5 @@ int io_setxattr(struct io_kiocb *req, unsigned int issue_flags) ret = filename_setxattr(AT_FDCWD, ix->filename, LOOKUP_FOLLOW, &ix->ctx); ix->filename = NULL; io_xattr_finish(req, ret); - return IOU_OK; + return IOU_COMPLETE; } diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c index fe86606b9f30..e5ff49f3425e 100644 --- a/io_uring/zcrx.c +++ b/io_uring/zcrx.c @@ -26,68 +26,270 @@ #include "zcrx.h" #include "rsrc.h" +#define IO_DMA_ATTR (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING) + static inline struct io_zcrx_ifq *io_pp_to_ifq(struct page_pool *pp) { return pp->mp_priv; } -#define IO_DMA_ATTR (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING) +static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *niov) +{ + struct net_iov_area *owner = net_iov_owner(niov); -static void __io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, - struct io_zcrx_area *area, int nr_mapped) + return container_of(owner, struct io_zcrx_area, nia); +} + +static inline struct page *io_zcrx_iov_page(const struct net_iov *niov) { - int i; + struct io_zcrx_area *area = io_zcrx_iov_to_area(niov); - for (i = 0; i < nr_mapped; i++) { - struct net_iov *niov = &area->nia.niovs[i]; - dma_addr_t dma; + lockdep_assert(!area->mem.is_dmabuf); - dma = page_pool_get_dma_addr_netmem(net_iov_to_netmem(niov)); - dma_unmap_page_attrs(ifq->dev, dma, PAGE_SIZE, - DMA_FROM_DEVICE, IO_DMA_ATTR); - net_mp_niov_set_dma_addr(niov, 0); + return area->mem.pages[net_iov_idx(niov)]; +} + +static int io_populate_area_dma(struct io_zcrx_ifq *ifq, + struct io_zcrx_area *area, + struct sg_table *sgt, unsigned long off) +{ + struct scatterlist *sg; + unsigned i, niov_idx = 0; + + for_each_sgtable_dma_sg(sgt, sg, i) { + dma_addr_t dma = sg_dma_address(sg); + unsigned long sg_len = sg_dma_len(sg); + unsigned long sg_off = min(sg_len, off); + + off -= sg_off; + sg_len -= sg_off; + dma += sg_off; + + while (sg_len && niov_idx < area->nia.num_niovs) { + struct net_iov *niov = &area->nia.niovs[niov_idx]; + + if (net_mp_niov_set_dma_addr(niov, dma)) + return -EFAULT; + sg_len -= PAGE_SIZE; + dma += PAGE_SIZE; + niov_idx++; + } } + return 0; } -static void io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area) +static void io_release_dmabuf(struct io_zcrx_mem *mem) { - guard(mutex)(&ifq->dma_lock); + if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER)) + return; - if (area->is_mapped) - __io_zcrx_unmap_area(ifq, area, area->nia.num_niovs); - area->is_mapped = false; + if (mem->sgt) + dma_buf_unmap_attachment_unlocked(mem->attach, mem->sgt, + DMA_FROM_DEVICE); + if (mem->attach) + dma_buf_detach(mem->dmabuf, mem->attach); + if (mem->dmabuf) + dma_buf_put(mem->dmabuf); + + mem->sgt = NULL; + mem->attach = NULL; + mem->dmabuf = NULL; } -static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area) +static int io_import_dmabuf(struct io_zcrx_ifq *ifq, + struct io_zcrx_mem *mem, + struct io_uring_zcrx_area_reg *area_reg) { + unsigned long off = (unsigned long)area_reg->addr; + unsigned long len = (unsigned long)area_reg->len; + unsigned long total_size = 0; + struct scatterlist *sg; + int dmabuf_fd = area_reg->dmabuf_fd; + int i, ret; + + if (off) + return -EINVAL; + if (WARN_ON_ONCE(!ifq->dev)) + return -EFAULT; + if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER)) + return -EINVAL; + + mem->is_dmabuf = true; + mem->dmabuf = dma_buf_get(dmabuf_fd); + if (IS_ERR(mem->dmabuf)) { + ret = PTR_ERR(mem->dmabuf); + 
mem->dmabuf = NULL; + goto err; + } + + mem->attach = dma_buf_attach(mem->dmabuf, ifq->dev); + if (IS_ERR(mem->attach)) { + ret = PTR_ERR(mem->attach); + mem->attach = NULL; + goto err; + } + + mem->sgt = dma_buf_map_attachment_unlocked(mem->attach, DMA_FROM_DEVICE); + if (IS_ERR(mem->sgt)) { + ret = PTR_ERR(mem->sgt); + mem->sgt = NULL; + goto err; + } + + for_each_sgtable_dma_sg(mem->sgt, sg, i) + total_size += sg_dma_len(sg); + + if (total_size != len) { + ret = -EINVAL; + goto err; + } + + mem->dmabuf_offset = off; + mem->size = len; + return 0; +err: + io_release_dmabuf(mem); + return ret; +} + +static int io_zcrx_map_area_dmabuf(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area) +{ + if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER)) + return -EINVAL; + return io_populate_area_dma(ifq, area, area->mem.sgt, + area->mem.dmabuf_offset); +} + +static unsigned long io_count_account_pages(struct page **pages, unsigned nr_pages) +{ + struct folio *last_folio = NULL; + unsigned long res = 0; int i; - guard(mutex)(&ifq->dma_lock); - if (area->is_mapped) - return 0; + for (i = 0; i < nr_pages; i++) { + struct folio *folio = page_folio(pages[i]); - for (i = 0; i < area->nia.num_niovs; i++) { - struct net_iov *niov = &area->nia.niovs[i]; - dma_addr_t dma; + if (folio == last_folio) + continue; + last_folio = folio; + res += 1UL << folio_order(folio); + } + return res; +} - dma = dma_map_page_attrs(ifq->dev, area->pages[i], 0, PAGE_SIZE, - DMA_FROM_DEVICE, IO_DMA_ATTR); - if (dma_mapping_error(ifq->dev, dma)) - break; - if (net_mp_niov_set_dma_addr(niov, dma)) { - dma_unmap_page_attrs(ifq->dev, dma, PAGE_SIZE, - DMA_FROM_DEVICE, IO_DMA_ATTR); - break; - } +static int io_import_umem(struct io_zcrx_ifq *ifq, + struct io_zcrx_mem *mem, + struct io_uring_zcrx_area_reg *area_reg) +{ + struct page **pages; + int nr_pages, ret; + + if (area_reg->dmabuf_fd) + return -EINVAL; + if (!area_reg->addr) + return -EFAULT; + pages = io_pin_pages((unsigned long)area_reg->addr, area_reg->len, + &nr_pages); + if (IS_ERR(pages)) + return PTR_ERR(pages); + + ret = sg_alloc_table_from_pages(&mem->page_sg_table, pages, nr_pages, + 0, nr_pages << PAGE_SHIFT, + GFP_KERNEL_ACCOUNT); + if (ret) { + unpin_user_pages(pages, nr_pages); + return ret; } - if (i != area->nia.num_niovs) { - __io_zcrx_unmap_area(ifq, area, i); + mem->account_pages = io_count_account_pages(pages, nr_pages); + ret = io_account_mem(ifq->ctx, mem->account_pages); + if (ret < 0) + mem->account_pages = 0; + + mem->pages = pages; + mem->nr_folios = nr_pages; + mem->size = area_reg->len; + return ret; +} + +static void io_release_area_mem(struct io_zcrx_mem *mem) +{ + if (mem->is_dmabuf) { + io_release_dmabuf(mem); + return; + } + if (mem->pages) { + unpin_user_pages(mem->pages, mem->nr_folios); + sg_free_table(&mem->page_sg_table); + kvfree(mem->pages); + } +} + +static int io_import_area(struct io_zcrx_ifq *ifq, + struct io_zcrx_mem *mem, + struct io_uring_zcrx_area_reg *area_reg) +{ + int ret; + + ret = io_validate_user_buf_range(area_reg->addr, area_reg->len); + if (ret) + return ret; + if (area_reg->addr & ~PAGE_MASK || area_reg->len & ~PAGE_MASK) return -EINVAL; + + if (area_reg->flags & IORING_ZCRX_AREA_DMABUF) + return io_import_dmabuf(ifq, mem, area_reg); + return io_import_umem(ifq, mem, area_reg); +} + +static void io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, + struct io_zcrx_area *area) +{ + int i; + + guard(mutex)(&ifq->dma_lock); + if (!area->is_mapped) + return; + area->is_mapped = false; + + for (i = 0; i < area->nia.num_niovs; i++) + 
net_mp_niov_set_dma_addr(&area->nia.niovs[i], 0); + + if (area->mem.is_dmabuf) { + io_release_dmabuf(&area->mem); + } else { + dma_unmap_sgtable(ifq->dev, &area->mem.page_sg_table, + DMA_FROM_DEVICE, IO_DMA_ATTR); } +} - area->is_mapped = true; - return 0; +static unsigned io_zcrx_map_area_umem(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area) +{ + int ret; + + ret = dma_map_sgtable(ifq->dev, &area->mem.page_sg_table, + DMA_FROM_DEVICE, IO_DMA_ATTR); + if (ret < 0) + return ret; + return io_populate_area_dma(ifq, area, &area->mem.page_sg_table, 0); +} + +static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area) +{ + int ret; + + guard(mutex)(&ifq->dma_lock); + if (area->is_mapped) + return 0; + + if (area->mem.is_dmabuf) + ret = io_zcrx_map_area_dmabuf(ifq, area); + else + ret = io_zcrx_map_area_umem(ifq, area); + + if (ret == 0) + area->is_mapped = true; + return ret; } static void io_zcrx_sync_for_device(const struct page_pool *pool, @@ -118,13 +320,6 @@ struct io_zcrx_args { static const struct memory_provider_ops io_uring_pp_zc_ops; -static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *niov) -{ - struct net_iov_area *owner = net_iov_owner(niov); - - return container_of(owner, struct io_zcrx_area, nia); -} - static inline atomic_t *io_get_user_counter(struct net_iov *niov) { struct io_zcrx_area *area = io_zcrx_iov_to_area(niov); @@ -147,17 +342,12 @@ static void io_zcrx_get_niov_uref(struct net_iov *niov) atomic_inc(io_get_user_counter(niov)); } -static inline struct page *io_zcrx_iov_page(const struct net_iov *niov) -{ - struct io_zcrx_area *area = io_zcrx_iov_to_area(niov); - - return area->pages[net_iov_idx(niov)]; -} - static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq, struct io_uring_zcrx_ifq_reg *reg, - struct io_uring_region_desc *rd) + struct io_uring_region_desc *rd, + u32 id) { + u64 mmap_offset; size_t off, size; void *ptr; int ret; @@ -167,12 +357,14 @@ static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq, if (size > rd->size) return -EINVAL; - ret = io_create_region_mmap_safe(ifq->ctx, &ifq->ctx->zcrx_region, rd, - IORING_MAP_OFF_ZCRX_REGION); + mmap_offset = IORING_MAP_OFF_ZCRX_REGION; + mmap_offset += id << IORING_OFF_PBUF_SHIFT; + + ret = io_create_region(ifq->ctx, &ifq->region, rd, mmap_offset); if (ret < 0) return ret; - ptr = io_region_get_ptr(&ifq->ctx->zcrx_region); + ptr = io_region_get_ptr(&ifq->region); ifq->rq_ring = (struct io_uring *)ptr; ifq->rqes = (struct io_uring_zcrx_rqe *)(ptr + off); return 0; @@ -180,7 +372,7 @@ static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq, static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq) { - io_free_region(ifq->ctx, &ifq->ctx->zcrx_region); + io_free_region(ifq->ctx, &ifq->region); ifq->rq_ring = NULL; ifq->rqes = NULL; } @@ -188,53 +380,48 @@ static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq) static void io_zcrx_free_area(struct io_zcrx_area *area) { io_zcrx_unmap_area(area->ifq, area); + io_release_area_mem(&area->mem); + + if (area->mem.account_pages) + io_unaccount_mem(area->ifq->ctx, area->mem.account_pages); kvfree(area->freelist); kvfree(area->nia.niovs); kvfree(area->user_refs); - if (area->pages) { - unpin_user_pages(area->pages, area->nr_folios); - kvfree(area->pages); - } kfree(area); } +#define IO_ZCRX_AREA_SUPPORTED_FLAGS (IORING_ZCRX_AREA_DMABUF) + static int io_zcrx_create_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area **res, struct io_uring_zcrx_area_reg *area_reg) { struct io_zcrx_area *area; - int i, ret, nr_pages, nr_iovs; - struct 
iovec iov; + unsigned nr_iovs; + int i, ret; - if (area_reg->flags || area_reg->rq_area_token) + if (area_reg->flags & ~IO_ZCRX_AREA_SUPPORTED_FLAGS) return -EINVAL; - if (area_reg->__resv1 || area_reg->__resv2[0] || area_reg->__resv2[1]) + if (area_reg->rq_area_token) return -EINVAL; - if (area_reg->addr & ~PAGE_MASK || area_reg->len & ~PAGE_MASK) + if (area_reg->__resv2[0] || area_reg->__resv2[1]) return -EINVAL; - iov.iov_base = u64_to_user_ptr(area_reg->addr); - iov.iov_len = area_reg->len; - ret = io_buffer_validate(&iov); - if (ret) - return ret; - ret = -ENOMEM; area = kzalloc(sizeof(*area), GFP_KERNEL); if (!area) goto err; + area->ifq = ifq; - area->pages = io_pin_pages((unsigned long)area_reg->addr, area_reg->len, - &nr_pages); - if (IS_ERR(area->pages)) { - ret = PTR_ERR(area->pages); - area->pages = NULL; + ret = io_import_area(ifq, &area->mem, area_reg); + if (ret) goto err; - } - area->nr_folios = nr_iovs = nr_pages; + + nr_iovs = area->mem.size >> PAGE_SHIFT; area->nia.num_niovs = nr_iovs; + ret = -ENOMEM; area->nia.niovs = kvmalloc_array(nr_iovs, sizeof(area->nia.niovs[0]), GFP_KERNEL | __GFP_ZERO); if (!area->nia.niovs) @@ -245,9 +432,6 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq, if (!area->freelist) goto err; - for (i = 0; i < nr_iovs; i++) - area->freelist[i] = i; - area->user_refs = kvmalloc_array(nr_iovs, sizeof(area->user_refs[0]), GFP_KERNEL | __GFP_ZERO); if (!area->user_refs) @@ -259,10 +443,10 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq, niov->owner = &area->nia; area->freelist[i] = i; atomic_set(&area->user_refs[i], 0); + niov->type = NET_IOV_IOURING; } area->free_count = nr_iovs; - area->ifq = ifq; /* we're only supporting one area per ifq for now */ area->area_id = 0; area_reg->rq_area_token = (u64)area->area_id << IORING_ZCRX_AREA_SHIFT; @@ -341,6 +525,16 @@ static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq) kfree(ifq); } +struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx, + unsigned int id) +{ + struct io_zcrx_ifq *ifq = xa_load(&ctx->zcrx_ctxs, id); + + lockdep_assert_held(&ctx->mmap_lock); + + return ifq ? &ifq->region : NULL; +} + int io_register_zcrx_ifq(struct io_ring_ctx *ctx, struct io_uring_zcrx_ifq_reg __user *arg) { @@ -350,6 +544,7 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, struct io_uring_region_desc rd; struct io_zcrx_ifq *ifq; int ret; + u32 id; /* * 1. Interface queue allocation. 
@@ -362,8 +557,6 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN && ctx->flags & IORING_SETUP_CQE32)) return -EINVAL; - if (ctx->ifq) - return -EBUSY; if (copy_from_user(®, arg, sizeof(reg))) return -EFAULT; if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd))) @@ -386,29 +579,37 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, ifq = io_zcrx_ifq_alloc(ctx); if (!ifq) return -ENOMEM; + ifq->rq_entries = reg.rq_entries; - ret = io_allocate_rbuf_ring(ifq, ®, &rd); - if (ret) - goto err; + scoped_guard(mutex, &ctx->mmap_lock) { + /* preallocate id */ + ret = xa_alloc(&ctx->zcrx_ctxs, &id, NULL, xa_limit_31b, GFP_KERNEL); + if (ret) + goto ifq_free; + } - ret = io_zcrx_create_area(ifq, &ifq->area, &area); + ret = io_allocate_rbuf_ring(ifq, ®, &rd, id); if (ret) goto err; - ifq->rq_entries = reg.rq_entries; - - ret = -ENODEV; ifq->netdev = netdev_get_by_index(current->nsproxy->net_ns, reg.if_idx, &ifq->netdev_tracker, GFP_KERNEL); - if (!ifq->netdev) + if (!ifq->netdev) { + ret = -ENODEV; goto err; + } ifq->dev = ifq->netdev->dev.parent; - ret = -EOPNOTSUPP; - if (!ifq->dev) + if (!ifq->dev) { + ret = -EOPNOTSUPP; goto err; + } get_device(ifq->dev); + ret = io_zcrx_create_area(ifq, &ifq->area, &area); + if (ret) + goto err; + mp_param.mp_ops = &io_uring_pp_zc_ops; mp_param.mp_priv = ifq; ret = net_mp_open_rxq(ifq->netdev, reg.if_rxq, &mp_param); @@ -419,6 +620,14 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, reg.offsets.rqes = sizeof(struct io_uring); reg.offsets.head = offsetof(struct io_uring, head); reg.offsets.tail = offsetof(struct io_uring, tail); + reg.zcrx_id = id; + + scoped_guard(mutex, &ctx->mmap_lock) { + /* publish ifq */ + ret = -ENOMEM; + if (xa_store(&ctx->zcrx_ctxs, id, ifq, GFP_KERNEL)) + goto err; + } if (copy_to_user(arg, ®, sizeof(reg)) || copy_to_user(u64_to_user_ptr(reg.region_ptr), &rd, sizeof(rd)) || @@ -426,24 +635,35 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx, ret = -EFAULT; goto err; } - ctx->ifq = ifq; return 0; err: + scoped_guard(mutex, &ctx->mmap_lock) + xa_erase(&ctx->zcrx_ctxs, id); +ifq_free: io_zcrx_ifq_free(ifq); return ret; } void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx) { - struct io_zcrx_ifq *ifq = ctx->ifq; + struct io_zcrx_ifq *ifq; lockdep_assert_held(&ctx->uring_lock); - if (!ifq) - return; + while (1) { + scoped_guard(mutex, &ctx->mmap_lock) { + unsigned long id = 0; - ctx->ifq = NULL; - io_zcrx_ifq_free(ifq); + ifq = xa_find(&ctx->zcrx_ctxs, &id, ULONG_MAX, XA_PRESENT); + if (ifq) + xa_erase(&ctx->zcrx_ctxs, id); + } + if (!ifq) + break; + io_zcrx_ifq_free(ifq); + } + + xa_destroy(&ctx->zcrx_ctxs); } static struct net_iov *__io_zcrx_get_free_niov(struct io_zcrx_area *area) @@ -500,12 +720,15 @@ static void io_zcrx_scrub(struct io_zcrx_ifq *ifq) void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx) { + struct io_zcrx_ifq *ifq; + unsigned long index; + lockdep_assert_held(&ctx->uring_lock); - if (!ctx->ifq) - return; - io_zcrx_scrub(ctx->ifq); - io_close_queue(ctx->ifq); + xa_for_each(&ctx->zcrx_ctxs, index, ifq) { + io_zcrx_scrub(ifq); + io_close_queue(ifq); + } } static inline u32 io_zcrx_rqring_entries(struct io_zcrx_ifq *ifq) @@ -653,10 +876,7 @@ static int io_pp_zc_init(struct page_pool *pp) static void io_pp_zc_destroy(struct page_pool *pp) { struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp); - struct io_zcrx_area *area = ifq->area; - if (WARN_ON_ONCE(area->free_count != area->nia.num_niovs)) - return; percpu_ref_put(&ifq->ctx->refs); } @@ -734,20 +954,66 @@ 
static struct net_iov *io_zcrx_alloc_fallback(struct io_zcrx_area *area) return niov; } +struct io_copy_cache { + struct page *page; + unsigned long offset; + size_t size; +}; + +static ssize_t io_copy_page(struct io_copy_cache *cc, struct page *src_page, + unsigned int src_offset, size_t len) +{ + size_t copied = 0; + + len = min(len, cc->size); + + while (len) { + void *src_addr, *dst_addr; + struct page *dst_page = cc->page; + unsigned dst_offset = cc->offset; + size_t n = len; + + if (folio_test_partial_kmap(page_folio(dst_page)) || + folio_test_partial_kmap(page_folio(src_page))) { + dst_page = nth_page(dst_page, dst_offset / PAGE_SIZE); + dst_offset = offset_in_page(dst_offset); + src_page = nth_page(src_page, src_offset / PAGE_SIZE); + src_offset = offset_in_page(src_offset); + n = min(PAGE_SIZE - src_offset, PAGE_SIZE - dst_offset); + n = min(n, len); + } + + dst_addr = kmap_local_page(dst_page) + dst_offset; + src_addr = kmap_local_page(src_page) + src_offset; + + memcpy(dst_addr, src_addr, n); + + kunmap_local(src_addr); + kunmap_local(dst_addr); + + cc->size -= n; + cc->offset += n; + len -= n; + copied += n; + } + return copied; +} + static ssize_t io_zcrx_copy_chunk(struct io_kiocb *req, struct io_zcrx_ifq *ifq, - void *src_base, struct page *src_page, - unsigned int src_offset, size_t len) + struct page *src_page, unsigned int src_offset, + size_t len) { struct io_zcrx_area *area = ifq->area; size_t copied = 0; int ret = 0; + if (area->mem.is_dmabuf) + return -EFAULT; + while (len) { - size_t copy_size = min_t(size_t, PAGE_SIZE, len); - const int dst_off = 0; + struct io_copy_cache cc; struct net_iov *niov; - struct page *dst_page; - void *dst_addr; + size_t n; niov = io_zcrx_alloc_fallback(area); if (!niov) { @@ -755,27 +1021,22 @@ static ssize_t io_zcrx_copy_chunk(struct io_kiocb *req, struct io_zcrx_ifq *ifq, break; } - dst_page = io_zcrx_iov_page(niov); - dst_addr = kmap_local_page(dst_page); - if (src_page) - src_base = kmap_local_page(src_page); + cc.page = io_zcrx_iov_page(niov); + cc.offset = 0; + cc.size = PAGE_SIZE; - memcpy(dst_addr, src_base + src_offset, copy_size); + n = io_copy_page(&cc, src_page, src_offset, len); - if (src_page) - kunmap_local(src_base); - kunmap_local(dst_addr); - - if (!io_zcrx_queue_cqe(req, niov, ifq, dst_off, copy_size)) { + if (!io_zcrx_queue_cqe(req, niov, ifq, 0, n)) { io_zcrx_return_niov(niov); ret = -ENOSPC; break; } io_zcrx_get_niov_uref(niov); - src_offset += copy_size; - len -= copy_size; - copied += copy_size; + src_offset += n; + len -= n; + copied += n; } return copied ? copied : ret; @@ -785,19 +1046,8 @@ static int io_zcrx_copy_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq, const skb_frag_t *frag, int off, int len) { struct page *page = skb_frag_page(frag); - u32 p_off, p_len, t, copied = 0; - int ret = 0; - off += skb_frag_off(frag); - - skb_frag_foreach_page(frag, off, len, - page, p_off, p_len, t) { - ret = io_zcrx_copy_chunk(req, ifq, NULL, page, p_off, p_len); - if (ret < 0) - return copied ? 
copied : ret; - copied += ret; - } - return copied; + return io_zcrx_copy_chunk(req, ifq, page, off + skb_frag_off(frag), len); } static int io_zcrx_recv_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq, @@ -809,7 +1059,7 @@ static int io_zcrx_recv_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq, return io_zcrx_copy_frag(req, ifq, frag, off, len); niov = netmem_to_net_iov(frag->netmem); - if (niov->pp->mp_ops != &io_uring_pp_zc_ops || + if (!niov->pp || niov->pp->mp_ops != &io_uring_pp_zc_ops || io_pp_to_ifq(niov->pp) != ifq) return -EFAULT; @@ -854,8 +1104,9 @@ io_zcrx_recv_skb(read_descriptor_t *desc, struct sk_buff *skb, size_t to_copy; to_copy = min_t(size_t, skb_headlen(skb) - offset, len); - copied = io_zcrx_copy_chunk(req, ifq, skb->data, NULL, - offset, to_copy); + copied = io_zcrx_copy_chunk(req, ifq, virt_to_page(skb->data), + offset_in_page(skb->data) + offset, + to_copy); if (copied < 0) { ret = copied; goto out; diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h index f2bc811f022c..109c4ca36434 100644 --- a/io_uring/zcrx.h +++ b/io_uring/zcrx.h @@ -3,10 +3,26 @@ #define IOU_ZC_RX_H #include <linux/io_uring_types.h> +#include <linux/dma-buf.h> #include <linux/socket.h> #include <net/page_pool/types.h> #include <net/net_trackers.h> +struct io_zcrx_mem { + unsigned long size; + bool is_dmabuf; + + struct page **pages; + unsigned long nr_folios; + struct sg_table page_sg_table; + unsigned long account_pages; + + struct dma_buf_attachment *attach; + struct dma_buf *dmabuf; + struct sg_table *sgt; + unsigned long dmabuf_offset; +}; + struct io_zcrx_area { struct net_iov_area nia; struct io_zcrx_ifq *ifq; @@ -14,13 +30,13 @@ struct io_zcrx_area { bool is_mapped; u16 area_id; - struct page **pages; - unsigned long nr_folios; /* freelist */ spinlock_t freelist_lock ____cacheline_aligned_in_smp; u32 free_count; u32 *freelist; + + struct io_zcrx_mem mem; }; struct io_zcrx_ifq { @@ -39,6 +55,7 @@ struct io_zcrx_ifq { netdevice_tracker netdev_tracker; spinlock_t lock; struct mutex dma_lock; + struct io_mapped_region region; }; #if defined(CONFIG_IO_URING_ZCRX) @@ -49,6 +66,8 @@ void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx); int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq, struct socket *sock, unsigned int flags, unsigned issue_flags, unsigned int *len); +struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx, + unsigned int id); #else static inline int io_register_zcrx_ifq(struct io_ring_ctx *ctx, struct io_uring_zcrx_ifq_reg __user *arg) @@ -67,6 +86,11 @@ static inline int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq, { return -EOPNOTSUPP; } +static inline struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx, + unsigned int id) +{ + return NULL; +} #endif int io_recvzc(struct io_kiocb *req, unsigned int issue_flags);
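
As a brief, hedged illustration of the two multishot uring_cmd helpers declared above (io_cmd_poll_multishot() and io_uring_cmd_post_mshot_cqe32()), the sketch below shows how an io_uring-internal ->uring_cmd() handler could combine them. It is not part of the patch: the handler name, the EPOLLIN mask, the CQE payload, and the final -EIOCBQUEUED return are assumptions made only for illustration.

/* Hypothetical sketch, assuming the declarations added to uring_cmd.h above. */
#include <linux/poll.h>
#include <linux/io_uring/cmd.h>

#include "uring_cmd.h"

static int example_mshot_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
{
	struct io_uring_cqe cqe[2] = { };
	int ret;

	/*
	 * First issue: mark the request multishot and arm async poll.
	 * Per the definition above, this returns -EIOCBQUEUED once poll
	 * is armed, -ECANCELED if arming failed, and 0 on later issues
	 * when the request already carries REQ_F_APOLL_MULTISHOT.
	 */
	ret = io_cmd_poll_multishot(cmd, issue_flags, EPOLLIN);
	if (ret)
		return ret;

	/*
	 * Poll-driven reissue: post one 32-byte CQE per event without
	 * completing the request.  The helper warns unless
	 * IO_URING_F_MULTISHOT is set in issue_flags and returns false
	 * if the CQE could not be posted, in which case the multishot
	 * is ended here.
	 */
	cqe[0].res = 0;		/* payload layout is made up for this sketch */
	if (!io_uring_cmd_post_mshot_cqe32(cmd, issue_flags, cqe))
		return -ECANCELED;

	/*
	 * Assumption: returning -EIOCBQUEUED leaves the armed request
	 * alive without a final completion, so further events keep
	 * producing CQEs until the request is cancelled.
	 */
	return -EIOCBQUEUED;
}

A real handler would be issued against a pollable file and has to tolerate -ECANCELED from either step; both error paths simply end the multishot and let the normal io_uring_cmd() return handling fail and complete the request.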