Diffstat (limited to 'io_uring')
63 files changed, 6345 insertions, 3708 deletions
diff --git a/io_uring/Kconfig b/io_uring/Kconfig new file mode 100644 index 000000000000..4b949c42c0bf --- /dev/null +++ b/io_uring/Kconfig @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# io_uring configuration +# + +config IO_URING_ZCRX + def_bool y + depends on IO_URING + depends on PAGE_POOL + depends on INET + depends on NET_RX_BUSY_POLL diff --git a/io_uring/Makefile b/io_uring/Makefile index 53167bef37d7..d97c6b51d584 100644 --- a/io_uring/Makefile +++ b/io_uring/Makefile @@ -7,13 +7,17 @@ GCOV_PROFILE := y endif obj-$(CONFIG_IO_URING) += io_uring.o opdef.o kbuf.o rsrc.o notif.o \ - tctx.o filetable.o rw.o net.o poll.o \ + tctx.o filetable.o rw.o poll.o \ eventfd.o uring_cmd.o openclose.o \ sqpoll.o xattr.o nop.o fs.o splice.o \ sync.o msg_ring.o advise.o openclose.o \ - epoll.o statx.o timeout.o fdinfo.o \ - cancel.o waitid.o register.o \ - truncate.o memmap.o + statx.o timeout.o cancel.o \ + waitid.o register.o truncate.o \ + memmap.o alloc_cache.o +obj-$(CONFIG_IO_URING_ZCRX) += zcrx.o obj-$(CONFIG_IO_WQ) += io-wq.o obj-$(CONFIG_FUTEX) += futex.o -obj-$(CONFIG_NET_RX_BUSY_POLL) += napi.o +obj-$(CONFIG_EPOLL) += epoll.o +obj-$(CONFIG_NET_RX_BUSY_POLL) += napi.o +obj-$(CONFIG_NET) += net.o cmd_net.o +obj-$(CONFIG_PROC_FS) += fdinfo.o diff --git a/io_uring/advise.c b/io_uring/advise.c index cb7b881665e5..0073f74e3658 100644 --- a/io_uring/advise.c +++ b/io_uring/advise.c @@ -58,7 +58,7 @@ int io_madvise(struct io_kiocb *req, unsigned int issue_flags) ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; #else return -EOPNOTSUPP; #endif @@ -104,5 +104,5 @@ int io_fadvise(struct io_kiocb *req, unsigned int issue_flags) if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } diff --git a/io_uring/alloc_cache.c b/io_uring/alloc_cache.c new file mode 100644 index 000000000000..58423888b736 --- /dev/null +++ b/io_uring/alloc_cache.c @@ -0,0 +1,44 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "alloc_cache.h" + +void io_alloc_cache_free(struct io_alloc_cache *cache, + void (*free)(const void *)) +{ + void *entry; + + if (!cache->entries) + return; + + while ((entry = io_alloc_cache_get(cache)) != NULL) + free(entry); + + kvfree(cache->entries); + cache->entries = NULL; +} + +/* returns false if the cache was initialized properly */ +bool io_alloc_cache_init(struct io_alloc_cache *cache, + unsigned max_nr, unsigned int size, + unsigned int init_bytes) +{ + cache->entries = kvmalloc_array(max_nr, sizeof(void *), GFP_KERNEL); + if (!cache->entries) + return true; + + cache->nr_cached = 0; + cache->max_cached = max_nr; + cache->elem_size = size; + cache->init_clear = init_bytes; + return false; +} + +void *io_cache_alloc_new(struct io_alloc_cache *cache, gfp_t gfp) +{ + void *obj; + + obj = kmalloc(cache->elem_size, gfp); + if (obj && cache->init_clear) + memset(obj, 0, cache->init_clear); + return obj; +} diff --git a/io_uring/alloc_cache.h b/io_uring/alloc_cache.h index b7a38a2069cf..d33ce159ef33 100644 --- a/io_uring/alloc_cache.h +++ b/io_uring/alloc_cache.h @@ -1,11 +1,21 @@ #ifndef IOU_ALLOC_CACHE_H #define IOU_ALLOC_CACHE_H +#include <linux/io_uring_types.h> + /* * Don't allow the cache to grow beyond this size. 
*/ #define IO_ALLOC_CACHE_MAX 128 +void io_alloc_cache_free(struct io_alloc_cache *cache, + void (*free)(const void *)); +bool io_alloc_cache_init(struct io_alloc_cache *cache, + unsigned max_nr, unsigned int size, + unsigned int init_bytes); + +void *io_cache_alloc_new(struct io_alloc_cache *cache, gfp_t gfp); + static inline bool io_alloc_cache_put(struct io_alloc_cache *cache, void *entry) { @@ -23,39 +33,36 @@ static inline void *io_alloc_cache_get(struct io_alloc_cache *cache) if (cache->nr_cached) { void *entry = cache->entries[--cache->nr_cached]; + /* + * If KASAN is enabled, always clear the initial bytes that + * must be zeroed post alloc, in case any of them overlap + * with KASAN storage. + */ +#if defined(CONFIG_KASAN) kasan_mempool_unpoison_object(entry, cache->elem_size); + if (cache->init_clear) + memset(entry, 0, cache->init_clear); +#endif return entry; } return NULL; } -/* returns false if the cache was initialized properly */ -static inline bool io_alloc_cache_init(struct io_alloc_cache *cache, - unsigned max_nr, size_t size) +static inline void *io_cache_alloc(struct io_alloc_cache *cache, gfp_t gfp) { - cache->entries = kvmalloc_array(max_nr, sizeof(void *), GFP_KERNEL); - if (cache->entries) { - cache->nr_cached = 0; - cache->max_cached = max_nr; - cache->elem_size = size; - return false; - } - return true; + void *obj; + + obj = io_alloc_cache_get(cache); + if (obj) + return obj; + return io_cache_alloc_new(cache, gfp); } -static inline void io_alloc_cache_free(struct io_alloc_cache *cache, - void (*free)(const void *)) +static inline void io_cache_free(struct io_alloc_cache *cache, void *obj) { - void *entry; - - if (!cache->entries) - return; - - while ((entry = io_alloc_cache_get(cache)) != NULL) - free(entry); - - kvfree(cache->entries); - cache->entries = NULL; + if (!io_alloc_cache_put(cache, obj)) + kfree(obj); } + #endif diff --git a/io_uring/cancel.c b/io_uring/cancel.c index a6e58a20efdd..6d57602304df 100644 --- a/io_uring/cancel.c +++ b/io_uring/cancel.c @@ -205,7 +205,7 @@ int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) .opcode = cancel->opcode, .seq = atomic_inc_return(&req->ctx->cancel_seq), }; - struct io_uring_task *tctx = req->task->io_uring; + struct io_uring_task *tctx = req->tctx; int ret; if (cd.flags & IORING_ASYNC_CANCEL_FD) { @@ -229,17 +229,7 @@ done: if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; -} - -void init_hash_table(struct io_hash_table *table, unsigned size) -{ - unsigned int i; - - for (i = 0; i < size; i++) { - spin_lock_init(&table->hbs[i].lock); - INIT_HLIST_HEAD(&table->hbs[i].list); - } + return IOU_COMPLETE; } static int __io_sync_cancel(struct io_uring_task *tctx, @@ -250,10 +240,12 @@ static int __io_sync_cancel(struct io_uring_task *tctx, /* fixed must be grabbed every time since we drop the uring_lock */ if ((cd->flags & IORING_ASYNC_CANCEL_FD) && (cd->flags & IORING_ASYNC_CANCEL_FD_FIXED)) { - if (unlikely(fd >= ctx->nr_user_files)) + struct io_rsrc_node *node; + + node = io_rsrc_node_lookup(&ctx->file_table.data, fd); + if (unlikely(!node)) return -EBADF; - fd = array_index_nospec(fd, ctx->nr_user_files); - cd->file = io_file_from_index(&ctx->file_table, fd); + cd->file = io_slot_file(node); if (!cd->file) return -EBADF; } @@ -349,3 +341,45 @@ out: fput(file); return ret; } + +bool io_cancel_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx, + struct hlist_head *list, bool cancel_all, + bool (*cancel)(struct io_kiocb *)) +{ + struct hlist_node *tmp; + 
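The reworked alloc cache above moves allocation out of line and adds an init_clear prefix: a freshly allocated object has its first init_clear bytes zeroed, and under KASAN the same prefix is re-cleared when an object is pulled back out of the cache, since the poisoned region may overlap those bytes. A minimal userspace model of the pattern follows; the names (obj_cache, obj_cache_*) are illustrative only and the kernel's kvmalloc/KASAN handling is left out.

#include <stdlib.h>
#include <string.h>
#include <stdbool.h>

/* Simplified model of the io_uring alloc cache; not kernel code. */
struct obj_cache {
	void **entries;
	unsigned int nr_cached, max_cached;
	unsigned int elem_size;
	unsigned int init_clear;	/* prefix zeroed on a fresh allocation */
};

static bool obj_cache_init(struct obj_cache *c, unsigned int max_nr,
			   unsigned int size, unsigned int init_bytes)
{
	c->entries = calloc(max_nr, sizeof(void *));
	if (!c->entries)
		return true;	/* like io_alloc_cache_init(): true means failure */
	c->nr_cached = 0;
	c->max_cached = max_nr;
	c->elem_size = size;
	c->init_clear = init_bytes;
	return false;
}

static void *obj_cache_alloc(struct obj_cache *c)
{
	void *obj;

	if (c->nr_cached)
		return c->entries[--c->nr_cached];
	obj = malloc(c->elem_size);
	if (obj && c->init_clear)
		memset(obj, 0, c->init_clear);
	return obj;
}

static void obj_cache_free(struct obj_cache *c, void *obj)
{
	if (c->nr_cached < c->max_cached)
		c->entries[c->nr_cached++] = obj;
	else
		free(obj);
}

In the diff itself, io_futex_cache_init() passes 0 for the new init_bytes argument, so the futex cache keeps its previous behaviour.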
struct io_kiocb *req; + bool found = false; + + lockdep_assert_held(&ctx->uring_lock); + + hlist_for_each_entry_safe(req, tmp, list, hash_node) { + if (!io_match_task_safe(req, tctx, cancel_all)) + continue; + hlist_del_init(&req->hash_node); + if (cancel(req)) + found = true; + } + + return found; +} + +int io_cancel_remove(struct io_ring_ctx *ctx, struct io_cancel_data *cd, + unsigned int issue_flags, struct hlist_head *list, + bool (*cancel)(struct io_kiocb *)) +{ + struct hlist_node *tmp; + struct io_kiocb *req; + int nr = 0; + + io_ring_submit_lock(ctx, issue_flags); + hlist_for_each_entry_safe(req, tmp, list, hash_node) { + if (!io_cancel_req_match(req, cd)) + continue; + if (cancel(req)) + nr++; + if (!(cd->flags & IORING_ASYNC_CANCEL_ALL)) + break; + } + io_ring_submit_unlock(ctx, issue_flags); + return nr ?: -ENOENT; +} diff --git a/io_uring/cancel.h b/io_uring/cancel.h index b33995e00ba9..43e9bb74e9d1 100644 --- a/io_uring/cancel.h +++ b/io_uring/cancel.h @@ -20,11 +20,18 @@ int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags); int io_try_cancel(struct io_uring_task *tctx, struct io_cancel_data *cd, unsigned int issue_flags); -void init_hash_table(struct io_hash_table *table, unsigned size); int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg); bool io_cancel_req_match(struct io_kiocb *req, struct io_cancel_data *cd); +bool io_cancel_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx, + struct hlist_head *list, bool cancel_all, + bool (*cancel)(struct io_kiocb *)); + +int io_cancel_remove(struct io_ring_ctx *ctx, struct io_cancel_data *cd, + unsigned int issue_flags, struct hlist_head *list, + bool (*cancel)(struct io_kiocb *)); + static inline bool io_cancel_match_sequence(struct io_kiocb *req, int sequence) { if (req->cancel_seq_set && sequence == req->work.cancel_seq) diff --git a/io_uring/cmd_net.c b/io_uring/cmd_net.c new file mode 100644 index 000000000000..e99170c7d41a --- /dev/null +++ b/io_uring/cmd_net.c @@ -0,0 +1,83 @@ +#include <asm/ioctls.h> +#include <linux/io_uring/net.h> +#include <net/sock.h> + +#include "uring_cmd.h" + +static inline int io_uring_cmd_getsockopt(struct socket *sock, + struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + const struct io_uring_sqe *sqe = cmd->sqe; + bool compat = !!(issue_flags & IO_URING_F_COMPAT); + int optlen, optname, level, err; + void __user *optval; + + level = READ_ONCE(sqe->level); + if (level != SOL_SOCKET) + return -EOPNOTSUPP; + + optval = u64_to_user_ptr(READ_ONCE(sqe->optval)); + optname = READ_ONCE(sqe->optname); + optlen = READ_ONCE(sqe->optlen); + + err = do_sock_getsockopt(sock, compat, level, optname, + USER_SOCKPTR(optval), + KERNEL_SOCKPTR(&optlen)); + if (err) + return err; + + /* On success, return optlen */ + return optlen; +} + +static inline int io_uring_cmd_setsockopt(struct socket *sock, + struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + const struct io_uring_sqe *sqe = cmd->sqe; + bool compat = !!(issue_flags & IO_URING_F_COMPAT); + int optname, optlen, level; + void __user *optval; + sockptr_t optval_s; + + optval = u64_to_user_ptr(READ_ONCE(sqe->optval)); + optname = READ_ONCE(sqe->optname); + optlen = READ_ONCE(sqe->optlen); + level = READ_ONCE(sqe->level); + optval_s = USER_SOCKPTR(optval); + + return do_sock_setsockopt(sock, compat, level, optname, optval_s, + optlen); +} + +int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ + struct socket *sock = cmd->file->private_data; + struct sock *sk = sock->sk; + struct proto 
*prot = READ_ONCE(sk->sk_prot); + int ret, arg = 0; + + if (!prot || !prot->ioctl) + return -EOPNOTSUPP; + + switch (cmd->cmd_op) { + case SOCKET_URING_OP_SIOCINQ: + ret = prot->ioctl(sk, SIOCINQ, &arg); + if (ret) + return ret; + return arg; + case SOCKET_URING_OP_SIOCOUTQ: + ret = prot->ioctl(sk, SIOCOUTQ, &arg); + if (ret) + return ret; + return arg; + case SOCKET_URING_OP_GETSOCKOPT: + return io_uring_cmd_getsockopt(sock, cmd, issue_flags); + case SOCKET_URING_OP_SETSOCKOPT: + return io_uring_cmd_setsockopt(sock, cmd, issue_flags); + default: + return -EOPNOTSUPP; + } +} +EXPORT_SYMBOL_GPL(io_uring_cmd_sock); diff --git a/io_uring/epoll.c b/io_uring/epoll.c index 89bff2068a19..8d4610246ba0 100644 --- a/io_uring/epoll.c +++ b/io_uring/epoll.c @@ -12,7 +12,6 @@ #include "io_uring.h" #include "epoll.h" -#if defined(CONFIG_EPOLL) struct io_epoll { struct file *file; int epfd; @@ -21,6 +20,12 @@ struct io_epoll { struct epoll_event event; }; +struct io_epoll_wait { + struct file *file; + int maxevents; + struct epoll_event __user *events; +}; + int io_epoll_ctl_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_epoll *epoll = io_kiocb_to_cmd(req, struct io_epoll); @@ -56,6 +61,32 @@ int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags) if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; +} + +int io_epoll_wait_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_epoll_wait *iew = io_kiocb_to_cmd(req, struct io_epoll_wait); + + if (sqe->off || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) + return -EINVAL; + + iew->maxevents = READ_ONCE(sqe->len); + iew->events = u64_to_user_ptr(READ_ONCE(sqe->addr)); + return 0; +} + +int io_epoll_wait(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_epoll_wait *iew = io_kiocb_to_cmd(req, struct io_epoll_wait); + int ret; + + ret = epoll_sendevents(req->file, iew->events, iew->maxevents); + if (ret == 0) + return -EAGAIN; + if (ret < 0) + req_set_fail(req); + + io_req_set_res(req, ret, 0); + return IOU_COMPLETE; } -#endif diff --git a/io_uring/epoll.h b/io_uring/epoll.h index 870cce11ba98..4111997c360b 100644 --- a/io_uring/epoll.h +++ b/io_uring/epoll.h @@ -3,4 +3,6 @@ #if defined(CONFIG_EPOLL) int io_epoll_ctl_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags); +int io_epoll_wait_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_epoll_wait(struct io_kiocb *req, unsigned int issue_flags); #endif diff --git a/io_uring/eventfd.c b/io_uring/eventfd.c index e37fddd5d9ce..78f8ab7db104 100644 --- a/io_uring/eventfd.c +++ b/io_uring/eventfd.c @@ -13,10 +13,12 @@ struct io_ev_fd { struct eventfd_ctx *cq_ev_fd; - unsigned int eventfd_async: 1; - struct rcu_head rcu; + unsigned int eventfd_async; + /* protected by ->completion_lock */ + unsigned last_cq_tail; refcount_t refs; atomic_t ops; + struct rcu_head rcu; }; enum { @@ -31,77 +33,83 @@ static void io_eventfd_free(struct rcu_head *rcu) kfree(ev_fd); } +static void io_eventfd_put(struct io_ev_fd *ev_fd) +{ + if (refcount_dec_and_test(&ev_fd->refs)) + call_rcu(&ev_fd->rcu, io_eventfd_free); +} + static void io_eventfd_do_signal(struct rcu_head *rcu) { struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu); eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE); + io_eventfd_put(ev_fd); +} - if (refcount_dec_and_test(&ev_fd->refs)) - io_eventfd_free(rcu); +/* + * Returns true if the caller 
should put the ev_fd reference, false if not. + */ +static bool __io_eventfd_signal(struct io_ev_fd *ev_fd) +{ + if (eventfd_signal_allowed()) { + eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE); + return true; + } + if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops)) { + call_rcu_hurry(&ev_fd->rcu, io_eventfd_do_signal); + return false; + } + return true; } -void io_eventfd_signal(struct io_ring_ctx *ctx) +/* + * Trigger if eventfd_async isn't set, or if it's set and the caller is + * an async worker. + */ +static bool io_eventfd_trigger(struct io_ev_fd *ev_fd) { - struct io_ev_fd *ev_fd = NULL; + return !ev_fd->eventfd_async || io_wq_current_is_worker(); +} + +void io_eventfd_signal(struct io_ring_ctx *ctx, bool cqe_event) +{ + bool skip = false; + struct io_ev_fd *ev_fd; if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED) return; guard(rcu)(); - - /* - * rcu_dereference ctx->io_ev_fd once and use it for both for checking - * and eventfd_signal - */ ev_fd = rcu_dereference(ctx->io_ev_fd); - /* - * Check again if ev_fd exists incase an io_eventfd_unregister call + * Check again if ev_fd exists in case an io_eventfd_unregister call * completed between the NULL check of ctx->io_ev_fd at the start of * the function and rcu_read_lock. */ - if (unlikely(!ev_fd)) + if (!ev_fd) return; - if (!refcount_inc_not_zero(&ev_fd->refs)) + if (!io_eventfd_trigger(ev_fd) || !refcount_inc_not_zero(&ev_fd->refs)) return; - if (ev_fd->eventfd_async && !io_wq_current_is_worker()) - goto out; - if (likely(eventfd_signal_allowed())) { - eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE); - } else { - if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops)) { - call_rcu_hurry(&ev_fd->rcu, io_eventfd_do_signal); - return; - } + if (cqe_event) { + /* + * Eventfd should only get triggered when at least one event + * has been posted. Some applications rely on the eventfd + * notification count only changing IFF a new CQE has been + * added to the CQ ring. There's no dependency on 1:1 + * relationship between how many times this function is called + * (and hence the eventfd count) and number of CQEs posted to + * the CQ ring. + */ + spin_lock(&ctx->completion_lock); + skip = ctx->cached_cq_tail == ev_fd->last_cq_tail; + ev_fd->last_cq_tail = ctx->cached_cq_tail; + spin_unlock(&ctx->completion_lock); } -out: - if (refcount_dec_and_test(&ev_fd->refs)) - call_rcu(&ev_fd->rcu, io_eventfd_free); -} - -void io_eventfd_flush_signal(struct io_ring_ctx *ctx) -{ - bool skip; - - spin_lock(&ctx->completion_lock); - - /* - * Eventfd should only get triggered when at least one event has been - * posted. Some applications rely on the eventfd notification count - * only changing IFF a new CQE has been added to the CQ ring. There's - * no depedency on 1:1 relationship between how many times this - * function is called (and hence the eventfd count) and number of CQEs - * posted to the CQ ring. 
- */ - skip = ctx->cached_cq_tail == ctx->evfd_last_cq_tail; - ctx->evfd_last_cq_tail = ctx->cached_cq_tail; - spin_unlock(&ctx->completion_lock); - if (skip) - return; - io_eventfd_signal(ctx); + if (skip || __io_eventfd_signal(ev_fd)) + io_eventfd_put(ev_fd); } int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, @@ -132,7 +140,7 @@ int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, } spin_lock(&ctx->completion_lock); - ctx->evfd_last_cq_tail = ctx->cached_cq_tail; + ev_fd->last_cq_tail = ctx->cached_cq_tail; spin_unlock(&ctx->completion_lock); ev_fd->eventfd_async = eventfd_async; @@ -152,8 +160,7 @@ int io_eventfd_unregister(struct io_ring_ctx *ctx) if (ev_fd) { ctx->has_evfd = false; rcu_assign_pointer(ctx->io_ev_fd, NULL); - if (refcount_dec_and_test(&ev_fd->refs)) - call_rcu(&ev_fd->rcu, io_eventfd_free); + io_eventfd_put(ev_fd); return 0; } diff --git a/io_uring/eventfd.h b/io_uring/eventfd.h index d394f49c6321..e2f1985c2cf9 100644 --- a/io_uring/eventfd.h +++ b/io_uring/eventfd.h @@ -4,5 +4,4 @@ int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, unsigned int eventfd_async); int io_eventfd_unregister(struct io_ring_ctx *ctx); -void io_eventfd_flush_signal(struct io_ring_ctx *ctx); -void io_eventfd_signal(struct io_ring_ctx *ctx); +void io_eventfd_signal(struct io_ring_ctx *ctx, bool cqe_event); diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c index 6b1247664b35..9798d6fb4ec7 100644 --- a/io_uring/fdinfo.c +++ b/io_uring/fdinfo.c @@ -15,44 +15,48 @@ #include "cancel.h" #include "rsrc.h" -#ifdef CONFIG_PROC_FS -static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id, - const struct cred *cred) +#ifdef CONFIG_NET_RX_BUSY_POLL +static __cold void common_tracking_show_fdinfo(struct io_ring_ctx *ctx, + struct seq_file *m, + const char *tracking_strategy) +{ + seq_puts(m, "NAPI:\tenabled\n"); + seq_printf(m, "napi tracking:\t%s\n", tracking_strategy); + seq_printf(m, "napi_busy_poll_dt:\t%llu\n", ctx->napi_busy_poll_dt); + if (ctx->napi_prefer_busy_poll) + seq_puts(m, "napi_prefer_busy_poll:\ttrue\n"); + else + seq_puts(m, "napi_prefer_busy_poll:\tfalse\n"); +} + +static __cold void napi_show_fdinfo(struct io_ring_ctx *ctx, + struct seq_file *m) { - struct user_namespace *uns = seq_user_ns(m); - struct group_info *gi; - kernel_cap_t cap; - int g; - - seq_printf(m, "%5d\n", id); - seq_put_decimal_ull(m, "\tUid:\t", from_kuid_munged(uns, cred->uid)); - seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->euid)); - seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->suid)); - seq_put_decimal_ull(m, "\t\t", from_kuid_munged(uns, cred->fsuid)); - seq_put_decimal_ull(m, "\n\tGid:\t", from_kgid_munged(uns, cred->gid)); - seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->egid)); - seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->sgid)); - seq_put_decimal_ull(m, "\t\t", from_kgid_munged(uns, cred->fsgid)); - seq_puts(m, "\n\tGroups:\t"); - gi = cred->group_info; - for (g = 0; g < gi->ngroups; g++) { - seq_put_decimal_ull(m, g ? 
" " : "", - from_kgid_munged(uns, gi->gid[g])); + unsigned int mode = READ_ONCE(ctx->napi_track_mode); + + switch (mode) { + case IO_URING_NAPI_TRACKING_INACTIVE: + seq_puts(m, "NAPI:\tdisabled\n"); + break; + case IO_URING_NAPI_TRACKING_DYNAMIC: + common_tracking_show_fdinfo(ctx, m, "dynamic"); + break; + case IO_URING_NAPI_TRACKING_STATIC: + common_tracking_show_fdinfo(ctx, m, "static"); + break; + default: + seq_printf(m, "NAPI:\tunknown mode (%u)\n", mode); } - seq_puts(m, "\n\tCapEff:\t"); - cap = cred->cap_effective; - seq_put_hex_ll(m, NULL, cap.val, 16); - seq_putc(m, '\n'); - return 0; } +#else +static inline void napi_show_fdinfo(struct io_ring_ctx *ctx, + struct seq_file *m) +{ +} +#endif -/* - * Caller holds a reference to the file already, we don't need to do - * anything else to get an extra reference. - */ -__cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) +static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) { - struct io_ring_ctx *ctx = file->private_data; struct io_overflow_cqe *ocqe; struct io_rings *r = ctx->rings; struct rusage sq_usage; @@ -66,7 +70,6 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) unsigned int sq_entries, cq_entries; int sq_pid = -1, sq_cpu = -1; u64 sq_total_time = 0, sq_work_time = 0; - bool has_lock; unsigned int i; if (ctx->flags & IORING_SETUP_CQE32) @@ -83,11 +86,11 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) seq_printf(m, "SqMask:\t0x%x\n", sq_mask); seq_printf(m, "SqHead:\t%u\n", sq_head); seq_printf(m, "SqTail:\t%u\n", sq_tail); - seq_printf(m, "CachedSqHead:\t%u\n", ctx->cached_sq_head); + seq_printf(m, "CachedSqHead:\t%u\n", data_race(ctx->cached_sq_head)); seq_printf(m, "CqMask:\t0x%x\n", cq_mask); seq_printf(m, "CqHead:\t%u\n", cq_head); seq_printf(m, "CqTail:\t%u\n", cq_tail); - seq_printf(m, "CachedCqTail:\t%u\n", ctx->cached_cq_tail); + seq_printf(m, "CachedCqTail:\t%u\n", data_race(ctx->cached_cq_tail)); seq_printf(m, "SQEs:\t%u\n", sq_tail - sq_head); sq_entries = min(sq_tail - sq_head, ctx->sq_entries); for (i = 0; i < sq_entries; i++) { @@ -136,28 +139,28 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) seq_printf(m, "\n"); } - /* - * Avoid ABBA deadlock between the seq lock and the io_uring mutex, - * since fdinfo case grabs it in the opposite direction of normal use - * cases. If we fail to get the lock, we just don't iterate any - * structures that could be going away outside the io_uring mutex. - */ - has_lock = mutex_trylock(&ctx->uring_lock); - - if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) { + if (ctx->flags & IORING_SETUP_SQPOLL) { struct io_sq_data *sq = ctx->sq_data; + struct task_struct *tsk; + rcu_read_lock(); + tsk = rcu_dereference(sq->thread); /* * sq->thread might be NULL if we raced with the sqpoll * thread termination. 
*/ - if (sq->thread) { + if (tsk) { + get_task_struct(tsk); + rcu_read_unlock(); + getrusage(tsk, RUSAGE_SELF, &sq_usage); + put_task_struct(tsk); sq_pid = sq->task_pid; sq_cpu = sq->sq_cpu; - getrusage(sq->thread, RUSAGE_SELF, &sq_usage); sq_total_time = (sq_usage.ru_stime.tv_sec * 1000000 + sq_usage.ru_stime.tv_usec); sq_work_time = sq->work_time; + } else { + rcu_read_unlock(); } } @@ -165,52 +168,40 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) seq_printf(m, "SqThreadCpu:\t%d\n", sq_cpu); seq_printf(m, "SqTotalTime:\t%llu\n", sq_total_time); seq_printf(m, "SqWorkTime:\t%llu\n", sq_work_time); - seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files); - for (i = 0; has_lock && i < ctx->nr_user_files; i++) { - struct file *f = io_file_from_index(&ctx->file_table, i); - - if (f) - seq_printf(m, "%5u: %s\n", i, file_dentry(f)->d_iname); + seq_printf(m, "UserFiles:\t%u\n", ctx->file_table.data.nr); + for (i = 0; i < ctx->file_table.data.nr; i++) { + struct file *f = NULL; + + if (ctx->file_table.data.nodes[i]) + f = io_slot_file(ctx->file_table.data.nodes[i]); + if (f) { + seq_printf(m, "%5u: ", i); + seq_file_path(m, f, " \t\n\\"); + seq_puts(m, "\n"); + } + } + seq_printf(m, "UserBufs:\t%u\n", ctx->buf_table.nr); + for (i = 0; i < ctx->buf_table.nr; i++) { + struct io_mapped_ubuf *buf = NULL; + + if (ctx->buf_table.nodes[i]) + buf = ctx->buf_table.nodes[i]->buf; + if (buf) + seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, buf->len); else seq_printf(m, "%5u: <none>\n", i); } - seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs); - for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) { - struct io_mapped_ubuf *buf = ctx->user_bufs[i]; - - seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, buf->len); - } - if (has_lock && !xa_empty(&ctx->personalities)) { - unsigned long index; - const struct cred *cred; - - seq_printf(m, "Personalities:\n"); - xa_for_each(&ctx->personalities, index, cred) - io_uring_show_cred(m, index, cred); - } seq_puts(m, "PollList:\n"); for (i = 0; i < (1U << ctx->cancel_table.hash_bits); i++) { struct io_hash_bucket *hb = &ctx->cancel_table.hbs[i]; - struct io_hash_bucket *hbl = &ctx->cancel_table_locked.hbs[i]; struct io_kiocb *req; - spin_lock(&hb->lock); hlist_for_each_entry(req, &hb->list, hash_node) seq_printf(m, " op=%d, task_works=%d\n", req->opcode, - task_work_pending(req->task)); - spin_unlock(&hb->lock); - - if (!has_lock) - continue; - hlist_for_each_entry(req, &hbl->list, hash_node) - seq_printf(m, " op=%d, task_works=%d\n", req->opcode, - task_work_pending(req->task)); + task_work_pending(req->tctx->task)); } - if (has_lock) - mutex_unlock(&ctx->uring_lock); - seq_puts(m, "CqOverflowList:\n"); spin_lock(&ctx->completion_lock); list_for_each_entry(ocqe, &ctx->cq_overflow_list, list) { @@ -221,18 +212,24 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) } spin_unlock(&ctx->completion_lock); + napi_show_fdinfo(ctx, m); +} -#ifdef CONFIG_NET_RX_BUSY_POLL - if (ctx->napi_enabled) { - seq_puts(m, "NAPI:\tenabled\n"); - seq_printf(m, "napi_busy_poll_dt:\t%llu\n", ctx->napi_busy_poll_dt); - if (ctx->napi_prefer_busy_poll) - seq_puts(m, "napi_prefer_busy_poll:\ttrue\n"); - else - seq_puts(m, "napi_prefer_busy_poll:\tfalse\n"); - } else { - seq_puts(m, "NAPI:\tdisabled\n"); +/* + * Caller holds a reference to the file already, we don't need to do + * anything else to get an extra reference. 
+ */ +__cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file) +{ + struct io_ring_ctx *ctx = file->private_data; + + /* + * Avoid ABBA deadlock between the seq lock and the io_uring mutex, + * since fdinfo case grabs it in the opposite direction of normal use + * cases. + */ + if (mutex_trylock(&ctx->uring_lock)) { + __io_uring_show_fdinfo(ctx, m); + mutex_unlock(&ctx->uring_lock); } -#endif } -#endif diff --git a/io_uring/filetable.c b/io_uring/filetable.c index 997c56d32ee6..a21660e3145a 100644 --- a/io_uring/filetable.c +++ b/io_uring/filetable.c @@ -36,27 +36,22 @@ static int io_file_bitmap_get(struct io_ring_ctx *ctx) return -ENFILE; } -bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files) +bool io_alloc_file_tables(struct io_ring_ctx *ctx, struct io_file_table *table, + unsigned nr_files) { - table->files = kvcalloc(nr_files, sizeof(table->files[0]), - GFP_KERNEL_ACCOUNT); - if (unlikely(!table->files)) + if (io_rsrc_data_alloc(&table->data, nr_files)) return false; - table->bitmap = bitmap_zalloc(nr_files, GFP_KERNEL_ACCOUNT); - if (unlikely(!table->bitmap)) { - kvfree(table->files); - return false; - } - - return true; + if (table->bitmap) + return true; + io_rsrc_data_free(ctx, &table->data); + return false; } -void io_free_file_tables(struct io_file_table *table) +void io_free_file_tables(struct io_ring_ctx *ctx, struct io_file_table *table) { - kvfree(table->files); + io_rsrc_data_free(ctx, &table->data); bitmap_free(table->bitmap); - table->files = NULL; table->bitmap = NULL; } @@ -64,32 +59,24 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file, u32 slot_index) __must_hold(&req->ctx->uring_lock) { - struct io_fixed_file *file_slot; - int ret; + struct io_rsrc_node *node; if (io_is_uring_fops(file)) return -EBADF; - if (!ctx->file_data) + if (!ctx->file_table.data.nr) return -ENXIO; - if (slot_index >= ctx->nr_user_files) + if (slot_index >= ctx->file_table.data.nr) return -EINVAL; - slot_index = array_index_nospec(slot_index, ctx->nr_user_files); - file_slot = io_fixed_file_slot(&ctx->file_table, slot_index); - - if (file_slot->file_ptr) { - ret = io_queue_rsrc_removal(ctx->file_data, slot_index, - io_slot_file(file_slot)); - if (ret) - return ret; + node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE); + if (!node) + return -ENOMEM; - file_slot->file_ptr = 0; - } else { + if (!io_reset_rsrc_node(ctx, &ctx->file_table.data, slot_index)) io_file_bitmap_set(&ctx->file_table, slot_index); - } - *io_get_tag_slot(ctx->file_data, slot_index) = 0; - io_fixed_file_set(file_slot, file); + ctx->file_table.data.nodes[slot_index] = node; + io_fixed_file_set(node, file); return 0; } @@ -134,25 +121,17 @@ int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags, int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset) { - struct io_fixed_file *file_slot; - int ret; + struct io_rsrc_node *node; - if (unlikely(!ctx->file_data)) + if (unlikely(!ctx->file_table.data.nr)) return -ENXIO; - if (offset >= ctx->nr_user_files) + if (offset >= ctx->file_table.data.nr) return -EINVAL; - offset = array_index_nospec(offset, ctx->nr_user_files); - file_slot = io_fixed_file_slot(&ctx->file_table, offset); - if (!file_slot->file_ptr) + node = io_rsrc_node_lookup(&ctx->file_table.data, offset); + if (!node) return -EBADF; - - ret = io_queue_rsrc_removal(ctx->file_data, offset, - io_slot_file(file_slot)); - if (ret) - return ret; - - file_slot->file_ptr = 0; + io_reset_rsrc_node(ctx, &ctx->file_table.data, offset); 
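The fdinfo rework above replaces the per-section has_lock checks with a single mutex_trylock() around the whole dump: fdinfo is entered with the seq_file lock held and would otherwise take uring_lock in the opposite order from normal submission paths, so the report is simply skipped when the ring mutex is contended. A rough pthread-based illustration of that trylock-or-skip pattern (userspace sketch, not the kernel code):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t ring_lock = PTHREAD_MUTEX_INITIALIZER;

/* Stands in for __io_uring_show_fdinfo(): assumes ring_lock is held. */
static void show_ring_state(void)
{
	printf("SqHead:\t...\nCqTail:\t...\n");
}

/*
 * Stands in for io_uring_show_fdinfo(): take the lock opportunistically
 * and skip the whole report if someone else holds it, instead of risking
 * an ABBA deadlock by blocking here.
 */
static void show_fdinfo(void)
{
	if (pthread_mutex_trylock(&ring_lock) == 0) {
		show_ring_state();
		pthread_mutex_unlock(&ring_lock);
	}
}

int main(void)
{
	show_fdinfo();
	return 0;
}

One behavioural consequence visible in the hunks above: where the old code still printed the lock-free parts of the report under contention, the new code prints nothing at all if the trylock fails.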
io_file_bitmap_clear(&ctx->file_table, offset); return 0; } @@ -167,7 +146,7 @@ int io_register_file_alloc_range(struct io_ring_ctx *ctx, return -EFAULT; if (check_add_overflow(range.off, range.len, &end)) return -EOVERFLOW; - if (range.resv || end > ctx->nr_user_files) + if (range.resv || end > ctx->file_table.data.nr) return -EINVAL; io_file_table_set_alloc_range(ctx, range.off, range.len); diff --git a/io_uring/filetable.h b/io_uring/filetable.h index b2435c4dca1f..7717ea9efd0e 100644 --- a/io_uring/filetable.h +++ b/io_uring/filetable.h @@ -4,9 +4,10 @@ #include <linux/file.h> #include <linux/io_uring_types.h> +#include "rsrc.h" -bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files); -void io_free_file_tables(struct io_file_table *table); +bool io_alloc_file_tables(struct io_ring_ctx *ctx, struct io_file_table *table, unsigned nr_files); +void io_free_file_tables(struct io_ring_ctx *ctx, struct io_file_table *table); int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags, struct file *file, unsigned int file_slot); @@ -33,50 +34,34 @@ static inline void io_file_bitmap_set(struct io_file_table *table, int bit) table->alloc_hint = bit + 1; } -static inline struct io_fixed_file * -io_fixed_file_slot(struct io_file_table *table, unsigned i) -{ - return &table->files[i]; -} - #define FFS_NOWAIT 0x1UL #define FFS_ISREG 0x2UL #define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG) -static inline unsigned int io_slot_flags(struct io_fixed_file *slot) +static inline unsigned int io_slot_flags(struct io_rsrc_node *node) { - return (slot->file_ptr & ~FFS_MASK) << REQ_F_SUPPORT_NOWAIT_BIT; -} -static inline struct file *io_slot_file(struct io_fixed_file *slot) -{ - return (struct file *)(slot->file_ptr & FFS_MASK); + return (node->file_ptr & ~FFS_MASK) << REQ_F_SUPPORT_NOWAIT_BIT; } -static inline struct file *io_file_from_index(struct io_file_table *table, - int index) +static inline struct file *io_slot_file(struct io_rsrc_node *node) { - return io_slot_file(io_fixed_file_slot(table, index)); + return (struct file *)(node->file_ptr & FFS_MASK); } -static inline void io_fixed_file_set(struct io_fixed_file *file_slot, +static inline void io_fixed_file_set(struct io_rsrc_node *node, struct file *file) { - file_slot->file_ptr = (unsigned long)file | + node->file_ptr = (unsigned long)file | (io_file_get_flags(file) >> REQ_F_SUPPORT_NOWAIT_BIT); } -static inline void io_reset_alloc_hint(struct io_ring_ctx *ctx) -{ - ctx->file_table.alloc_hint = ctx->file_alloc_start; -} - static inline void io_file_table_set_alloc_range(struct io_ring_ctx *ctx, unsigned off, unsigned len) { ctx->file_alloc_start = off; ctx->file_alloc_end = off + len; - io_reset_alloc_hint(ctx); + ctx->file_table.alloc_hint = ctx->file_alloc_start; } #endif diff --git a/io_uring/fs.c b/io_uring/fs.c index eccea851dd5a..37079a414eab 100644 --- a/io_uring/fs.c +++ b/io_uring/fs.c @@ -90,7 +90,7 @@ int io_renameat(struct io_kiocb *req, unsigned int issue_flags) req->flags &= ~REQ_F_NEED_CLEANUP; io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } void io_renameat_cleanup(struct io_kiocb *req) @@ -141,7 +141,7 @@ int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags) req->flags &= ~REQ_F_NEED_CLEANUP; io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } void io_unlinkat_cleanup(struct io_kiocb *req) @@ -185,7 +185,7 @@ int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags) req->flags &= ~REQ_F_NEED_CLEANUP; io_req_set_res(req, ret, 0); - return IOU_OK; + return 
IOU_COMPLETE; } void io_mkdirat_cleanup(struct io_kiocb *req) @@ -235,7 +235,7 @@ int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags) req->flags &= ~REQ_F_NEED_CLEANUP; io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } int io_linkat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -281,7 +281,7 @@ int io_linkat(struct io_kiocb *req, unsigned int issue_flags) req->flags &= ~REQ_F_NEED_CLEANUP; io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } void io_link_cleanup(struct io_kiocb *req) diff --git a/io_uring/futex.c b/io_uring/futex.c index 914848f46beb..692462d50c8c 100644 --- a/io_uring/futex.c +++ b/io_uring/futex.c @@ -14,10 +14,7 @@ struct io_futex { struct file *file; - union { - u32 __user *uaddr; - struct futex_waitv __user *uwaitv; - }; + void __user *uaddr; unsigned long futex_val; unsigned long futex_mask; unsigned long futexv_owned; @@ -36,7 +33,7 @@ struct io_futex_data { bool io_futex_cache_init(struct io_ring_ctx *ctx) { return io_alloc_cache_init(&ctx->futex_cache, IO_FUTEX_ALLOC_CACHE_MAX, - sizeof(struct io_futex_data)); + sizeof(struct io_futex_data), 0); } void io_futex_cache_free(struct io_ring_ctx *ctx) @@ -44,30 +41,28 @@ void io_futex_cache_free(struct io_ring_ctx *ctx) io_alloc_cache_free(&ctx->futex_cache, kfree); } -static void __io_futex_complete(struct io_kiocb *req, struct io_tw_state *ts) +static void __io_futex_complete(struct io_kiocb *req, io_tw_token_t tw) { req->async_data = NULL; hlist_del_init(&req->hash_node); - io_req_task_complete(req, ts); + io_req_task_complete(req, tw); } -static void io_futex_complete(struct io_kiocb *req, struct io_tw_state *ts) +static void io_futex_complete(struct io_kiocb *req, io_tw_token_t tw) { - struct io_futex_data *ifd = req->async_data; struct io_ring_ctx *ctx = req->ctx; - io_tw_lock(ctx, ts); - if (!io_alloc_cache_put(&ctx->futex_cache, ifd)) - kfree(ifd); - __io_futex_complete(req, ts); + io_tw_lock(ctx, tw); + io_cache_free(&ctx->futex_cache, req->async_data); + __io_futex_complete(req, tw); } -static void io_futexv_complete(struct io_kiocb *req, struct io_tw_state *ts) +static void io_futexv_complete(struct io_kiocb *req, io_tw_token_t tw) { struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex); struct futex_vector *futexv = req->async_data; - io_tw_lock(req->ctx, ts); + io_tw_lock(req->ctx, tw); if (!iof->futexv_unqueued) { int res; @@ -79,7 +74,7 @@ static void io_futexv_complete(struct io_kiocb *req, struct io_tw_state *ts) kfree(req->async_data); req->flags &= ~REQ_F_ASYNC_DATA; - __io_futex_complete(req, ts); + __io_futex_complete(req, tw); } static bool io_futexv_claim(struct io_futex *iof) @@ -90,7 +85,7 @@ static bool io_futexv_claim(struct io_futex *iof) return true; } -static bool __io_futex_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req) +static bool __io_futex_cancel(struct io_kiocb *req) { /* futex wake already done or in progress */ if (req->opcode == IORING_OP_FUTEX_WAIT) { @@ -116,49 +111,13 @@ static bool __io_futex_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req) int io_futex_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, unsigned int issue_flags) { - struct hlist_node *tmp; - struct io_kiocb *req; - int nr = 0; - - if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_FD_FIXED)) - return -ENOENT; - - io_ring_submit_lock(ctx, issue_flags); - hlist_for_each_entry_safe(req, tmp, &ctx->futex_list, hash_node) { - if (req->cqe.user_data != cd->data && - !(cd->flags & IORING_ASYNC_CANCEL_ANY)) - 
continue; - if (__io_futex_cancel(ctx, req)) - nr++; - if (!(cd->flags & IORING_ASYNC_CANCEL_ALL)) - break; - } - io_ring_submit_unlock(ctx, issue_flags); - - if (nr) - return nr; - - return -ENOENT; + return io_cancel_remove(ctx, cd, issue_flags, &ctx->futex_list, __io_futex_cancel); } -bool io_futex_remove_all(struct io_ring_ctx *ctx, struct task_struct *task, +bool io_futex_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx, bool cancel_all) { - struct hlist_node *tmp; - struct io_kiocb *req; - bool found = false; - - lockdep_assert_held(&ctx->uring_lock); - - hlist_for_each_entry_safe(req, tmp, &ctx->futex_list, hash_node) { - if (!io_match_task_safe(req, task, cancel_all)) - continue; - hlist_del_init(&req->hash_node); - __io_futex_cancel(ctx, req); - found = true; - } - - return found; + return io_cancel_remove_all(ctx, tctx, &ctx->futex_list, cancel_all, __io_futex_cancel); } int io_futex_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -186,6 +145,8 @@ int io_futex_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) !futex_validate_input(iof->futex_flags, iof->futex_mask)) return -EINVAL; + /* Mark as inflight, so file exit cancelation will find it */ + io_req_track_inflight(req); return 0; } @@ -224,13 +185,15 @@ int io_futexv_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (!futexv) return -ENOMEM; - ret = futex_parse_waitv(futexv, iof->uwaitv, iof->futex_nr, + ret = futex_parse_waitv(futexv, iof->uaddr, iof->futex_nr, io_futex_wakev_fn, req); if (ret) { kfree(futexv); return ret; } + /* Mark as inflight, so file exit cancelation will find it */ + io_req_track_inflight(req); iof->futexv_owned = 0; iof->futexv_unqueued = 0; req->flags |= REQ_F_ASYNC_DATA; @@ -251,17 +214,6 @@ static void io_futex_wake_fn(struct wake_q_head *wake_q, struct futex_q *q) io_req_task_work_add(req); } -static struct io_futex_data *io_alloc_ifd(struct io_ring_ctx *ctx) -{ - struct io_futex_data *ifd; - - ifd = io_alloc_cache_get(&ctx->futex_cache); - if (ifd) - return ifd; - - return kmalloc(sizeof(struct io_futex_data), GFP_NOWAIT); -} - int io_futexv_wait(struct io_kiocb *req, unsigned int issue_flags) { struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex); @@ -283,7 +235,7 @@ int io_futexv_wait(struct io_kiocb *req, unsigned int issue_flags) kfree(futexv); req->async_data = NULL; req->flags &= ~REQ_F_ASYNC_DATA; - return IOU_OK; + return IOU_COMPLETE; } /* @@ -322,7 +274,6 @@ int io_futex_wait(struct io_kiocb *req, unsigned int issue_flags) struct io_futex *iof = io_kiocb_to_cmd(req, struct io_futex); struct io_ring_ctx *ctx = req->ctx; struct io_futex_data *ifd = NULL; - struct futex_hash_bucket *hb; int ret; if (!iof->futex_mask) { @@ -331,7 +282,7 @@ int io_futex_wait(struct io_kiocb *req, unsigned int issue_flags) } io_ring_submit_lock(ctx, issue_flags); - ifd = io_alloc_ifd(ctx); + ifd = io_cache_alloc(&ctx->futex_cache, GFP_NOWAIT); if (!ifd) { ret = -ENOMEM; goto done_unlock; @@ -344,12 +295,11 @@ int io_futex_wait(struct io_kiocb *req, unsigned int issue_flags) ifd->req = req; ret = futex_wait_setup(iof->uaddr, iof->futex_val, iof->futex_flags, - &ifd->q, &hb); + &ifd->q, NULL, NULL); if (!ret) { hlist_add_head(&req->hash_node, &ctx->futex_list); io_ring_submit_unlock(ctx, issue_flags); - futex_queue(&ifd->q, hb); return IOU_ISSUE_SKIP_COMPLETE; } @@ -360,7 +310,7 @@ done: req_set_fail(req); io_req_set_res(req, ret, 0); kfree(ifd); - return IOU_OK; + return IOU_COMPLETE; } int io_futex_wake(struct io_kiocb *req, unsigned int issue_flags) @@ 
-377,5 +327,5 @@ int io_futex_wake(struct io_kiocb *req, unsigned int issue_flags) if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } diff --git a/io_uring/futex.h b/io_uring/futex.h index b8bb09873d57..d789fcf715e3 100644 --- a/io_uring/futex.h +++ b/io_uring/futex.h @@ -11,7 +11,7 @@ int io_futex_wake(struct io_kiocb *req, unsigned int issue_flags); #if defined(CONFIG_FUTEX) int io_futex_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, unsigned int issue_flags); -bool io_futex_remove_all(struct io_ring_ctx *ctx, struct task_struct *task, +bool io_futex_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx, bool cancel_all); bool io_futex_cache_init(struct io_ring_ctx *ctx); void io_futex_cache_free(struct io_ring_ctx *ctx); @@ -23,7 +23,7 @@ static inline int io_futex_cancel(struct io_ring_ctx *ctx, return 0; } static inline bool io_futex_remove_all(struct io_ring_ctx *ctx, - struct task_struct *task, bool cancel_all) + struct io_uring_task *tctx, bool cancel_all) { return false; } diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c index a38f36b68060..be91edf34f01 100644 --- a/io_uring/io-wq.c +++ b/io_uring/io-wq.c @@ -30,7 +30,6 @@ enum { IO_WORKER_F_UP = 0, /* up and active */ IO_WORKER_F_RUNNING = 1, /* account as running */ IO_WORKER_F_FREE = 2, /* worker on free list */ - IO_WORKER_F_BOUND = 3, /* is doing bounded work */ }; enum { @@ -46,12 +45,12 @@ enum { */ struct io_worker { refcount_t ref; - int create_index; unsigned long flags; struct hlist_nulls_node nulls_node; struct list_head all_list; struct task_struct *task; struct io_wq *wq; + struct io_wq_acct *acct; struct io_wq_work *cur_work; raw_spinlock_t lock; @@ -64,7 +63,7 @@ struct io_worker { union { struct rcu_head rcu; - struct work_struct work; + struct delayed_work work; }; }; @@ -77,10 +76,27 @@ struct io_worker { #define IO_WQ_NR_HASH_BUCKETS (1u << IO_WQ_HASH_ORDER) struct io_wq_acct { + /** + * Protects access to the worker lists. + */ + raw_spinlock_t workers_lock; + unsigned nr_workers; unsigned max_workers; - int index; atomic_t nr_running; + + /** + * The list of free workers. Protected by #workers_lock + * (write) and RCU (read). + */ + struct hlist_nulls_head free_list; + + /** + * The list of all workers. Protected by #workers_lock + * (write) and RCU (read). 
+ */ + struct list_head all_list; + raw_spinlock_t lock; struct io_wq_work_list work_list; unsigned long flags; @@ -98,9 +114,6 @@ enum { struct io_wq { unsigned long state; - free_work_fn *free_work; - io_wq_work_fn *do_work; - struct io_wq_hash *hash; atomic_t worker_refs; @@ -112,12 +125,6 @@ struct io_wq { struct io_wq_acct acct[IO_WQ_ACCT_NR]; - /* lock protects access to elements below */ - raw_spinlock_t lock; - - struct hlist_nulls_head free_list; - struct list_head all_list; - struct wait_queue_entry wait; struct io_wq_work *hash_tail[IO_WQ_NR_HASH_BUCKETS]; @@ -135,7 +142,7 @@ struct io_cb_cancel_data { bool cancel_all; }; -static bool create_io_worker(struct io_wq *wq, int index); +static bool create_io_worker(struct io_wq *wq, struct io_wq_acct *acct); static void io_wq_dec_running(struct io_worker *worker); static bool io_acct_cancel_pending_work(struct io_wq *wq, struct io_wq_acct *acct, @@ -143,6 +150,16 @@ static bool io_acct_cancel_pending_work(struct io_wq *wq, static void create_worker_cb(struct callback_head *cb); static void io_wq_cancel_tw_create(struct io_wq *wq); +static inline unsigned int __io_get_work_hash(unsigned int work_flags) +{ + return work_flags >> IO_WQ_HASH_SHIFT; +} + +static inline unsigned int io_get_work_hash(struct io_wq_work *work) +{ + return __io_get_work_hash(atomic_read(&work->flags)); +} + static bool io_worker_get(struct io_worker *worker) { return refcount_inc_not_zero(&worker->ref); @@ -160,14 +177,14 @@ static inline struct io_wq_acct *io_get_acct(struct io_wq *wq, bool bound) } static inline struct io_wq_acct *io_work_get_acct(struct io_wq *wq, - struct io_wq_work *work) + unsigned int work_flags) { - return io_get_acct(wq, !(atomic_read(&work->flags) & IO_WQ_WORK_UNBOUND)); + return io_get_acct(wq, !(work_flags & IO_WQ_WORK_UNBOUND)); } static inline struct io_wq_acct *io_wq_get_acct(struct io_worker *worker) { - return io_get_acct(worker->wq, test_bit(IO_WORKER_F_BOUND, &worker->flags)); + return worker->acct; } static void io_worker_ref_put(struct io_wq *wq) @@ -192,9 +209,9 @@ static void io_worker_cancel_cb(struct io_worker *worker) struct io_wq *wq = worker->wq; atomic_dec(&acct->nr_running); - raw_spin_lock(&wq->lock); + raw_spin_lock(&acct->workers_lock); acct->nr_workers--; - raw_spin_unlock(&wq->lock); + raw_spin_unlock(&acct->workers_lock); io_worker_ref_put(wq); clear_bit_unlock(0, &worker->create_state); io_worker_release(worker); @@ -213,6 +230,7 @@ static bool io_task_worker_match(struct callback_head *cb, void *data) static void io_worker_exit(struct io_worker *worker) { struct io_wq *wq = worker->wq; + struct io_wq_acct *acct = io_wq_get_acct(worker); while (1) { struct callback_head *cb = task_work_cancel_match(wq->task, @@ -226,11 +244,11 @@ static void io_worker_exit(struct io_worker *worker) io_worker_release(worker); wait_for_completion(&worker->ref_done); - raw_spin_lock(&wq->lock); + raw_spin_lock(&acct->workers_lock); if (test_bit(IO_WORKER_F_FREE, &worker->flags)) hlist_nulls_del_rcu(&worker->nulls_node); list_del_rcu(&worker->all_list); - raw_spin_unlock(&wq->lock); + raw_spin_unlock(&acct->workers_lock); io_wq_dec_running(worker); /* * this worker is a goner, clear ->worker_private to avoid any @@ -269,8 +287,7 @@ static inline bool io_acct_run_queue(struct io_wq_acct *acct) * Check head of free list for an available worker. If one isn't available, * caller must create one. 
*/ -static bool io_wq_activate_free_worker(struct io_wq *wq, - struct io_wq_acct *acct) +static bool io_acct_activate_free_worker(struct io_wq_acct *acct) __must_hold(RCU) { struct hlist_nulls_node *n; @@ -281,13 +298,9 @@ static bool io_wq_activate_free_worker(struct io_wq *wq, * activate. If a given worker is on the free_list but in the process * of exiting, keep trying. */ - hlist_nulls_for_each_entry_rcu(worker, n, &wq->free_list, nulls_node) { + hlist_nulls_for_each_entry_rcu(worker, n, &acct->free_list, nulls_node) { if (!io_worker_get(worker)) continue; - if (io_wq_get_acct(worker) != acct) { - io_worker_release(worker); - continue; - } /* * If the worker is already running, it's either already * starting work or finishing work. In either case, if it does @@ -314,16 +327,16 @@ static bool io_wq_create_worker(struct io_wq *wq, struct io_wq_acct *acct) if (unlikely(!acct->max_workers)) pr_warn_once("io-wq is not configured for unbound workers"); - raw_spin_lock(&wq->lock); + raw_spin_lock(&acct->workers_lock); if (acct->nr_workers >= acct->max_workers) { - raw_spin_unlock(&wq->lock); + raw_spin_unlock(&acct->workers_lock); return true; } acct->nr_workers++; - raw_spin_unlock(&wq->lock); + raw_spin_unlock(&acct->workers_lock); atomic_inc(&acct->nr_running); atomic_inc(&wq->worker_refs); - return create_io_worker(wq, acct->index); + return create_io_worker(wq, acct); } static void io_wq_inc_running(struct io_worker *worker) @@ -343,16 +356,16 @@ static void create_worker_cb(struct callback_head *cb) worker = container_of(cb, struct io_worker, create_work); wq = worker->wq; - acct = &wq->acct[worker->create_index]; - raw_spin_lock(&wq->lock); + acct = worker->acct; + raw_spin_lock(&acct->workers_lock); if (acct->nr_workers < acct->max_workers) { acct->nr_workers++; do_create = true; } - raw_spin_unlock(&wq->lock); + raw_spin_unlock(&acct->workers_lock); if (do_create) { - create_io_worker(wq, worker->create_index); + create_io_worker(wq, acct); } else { atomic_dec(&acct->nr_running); io_worker_ref_put(wq); @@ -384,7 +397,6 @@ static bool io_queue_worker_create(struct io_worker *worker, atomic_inc(&wq->worker_refs); init_task_work(&worker->create_work, func); - worker->create_index = acct->index; if (!task_work_add(wq->task, &worker->create_work, TWA_SIGNAL)) { /* * EXIT may have been set after checking it above, check after @@ -407,6 +419,30 @@ fail: return false; } +/* Defer if current and next work are both hashed to the same chain */ +static bool io_wq_hash_defer(struct io_wq_work *work, struct io_wq_acct *acct) +{ + unsigned int hash, work_flags; + struct io_wq_work *next; + + lockdep_assert_held(&acct->lock); + + work_flags = atomic_read(&work->flags); + if (!__io_wq_is_hashed(work_flags)) + return false; + + /* should not happen, io_acct_run_queue() said we had work */ + if (wq_list_empty(&acct->work_list)) + return true; + + hash = __io_get_work_hash(work_flags); + next = container_of(acct->work_list.first, struct io_wq_work, list); + work_flags = atomic_read(&next->flags); + if (!__io_wq_is_hashed(work_flags)) + return false; + return hash == __io_get_work_hash(work_flags); +} + static void io_wq_dec_running(struct io_worker *worker) { struct io_wq_acct *acct = io_wq_get_acct(worker); @@ -417,8 +453,14 @@ static void io_wq_dec_running(struct io_worker *worker) if (!atomic_dec_and_test(&acct->nr_running)) return; + if (!worker->cur_work) + return; if (!io_acct_run_queue(acct)) return; + if (io_wq_hash_defer(worker->cur_work, acct)) { + raw_spin_unlock(&acct->lock); + return; + } 
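The io_wq_hash_defer() helper added above stops io_wq_dec_running() from waking or creating a replacement worker when the work item just processed and the next pending item hash to the same serialisation chain, since a new worker could not make progress on that chain anyway. A self-contained sketch of the check, using a simplified flags layout (hash kept in the bits above a hypothetical WQ_HASH_SHIFT) rather than the kernel's io_wq_work:

#include <stdbool.h>
#include <stddef.h>

#define WQ_HASH_SHIFT	24		/* illustrative; not the kernel constant */
#define WQ_WORK_HASHED	(1u << 0)

struct work {
	unsigned int flags;
};

static unsigned int work_hash(unsigned int flags)
{
	return flags >> WQ_HASH_SHIFT;
}

/*
 * Defer worker creation if the just-completed work and the next pending
 * work are hashed to the same chain: an extra worker would only block on
 * the same hash.
 */
static bool hash_defer(const struct work *cur, const struct work *next_pending)
{
	if (!(cur->flags & WQ_WORK_HASHED))
		return false;
	if (!next_pending)
		return true;	/* queue drained meanwhile, nothing for a new worker */
	if (!(next_pending->flags & WQ_WORK_HASHED))
		return false;
	return work_hash(cur->flags) == work_hash(next_pending->flags);
}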
raw_spin_unlock(&acct->lock); atomic_inc(&acct->nr_running); @@ -430,33 +472,28 @@ static void io_wq_dec_running(struct io_worker *worker) * Worker will start processing some work. Move it to the busy list, if * it's currently on the freelist */ -static void __io_worker_busy(struct io_wq *wq, struct io_worker *worker) +static void __io_worker_busy(struct io_wq_acct *acct, struct io_worker *worker) { if (test_bit(IO_WORKER_F_FREE, &worker->flags)) { clear_bit(IO_WORKER_F_FREE, &worker->flags); - raw_spin_lock(&wq->lock); + raw_spin_lock(&acct->workers_lock); hlist_nulls_del_init_rcu(&worker->nulls_node); - raw_spin_unlock(&wq->lock); + raw_spin_unlock(&acct->workers_lock); } } /* * No work, worker going to sleep. Move to freelist. */ -static void __io_worker_idle(struct io_wq *wq, struct io_worker *worker) - __must_hold(wq->lock) +static void __io_worker_idle(struct io_wq_acct *acct, struct io_worker *worker) + __must_hold(acct->workers_lock) { if (!test_bit(IO_WORKER_F_FREE, &worker->flags)) { set_bit(IO_WORKER_F_FREE, &worker->flags); - hlist_nulls_add_head_rcu(&worker->nulls_node, &wq->free_list); + hlist_nulls_add_head_rcu(&worker->nulls_node, &acct->free_list); } } -static inline unsigned int io_get_work_hash(struct io_wq_work *work) -{ - return atomic_read(&work->flags) >> IO_WQ_HASH_SHIFT; -} - static bool io_wait_on_hash(struct io_wq *wq, unsigned int hash) { bool ret = false; @@ -475,26 +512,27 @@ static bool io_wait_on_hash(struct io_wq *wq, unsigned int hash) } static struct io_wq_work *io_get_next_work(struct io_wq_acct *acct, - struct io_worker *worker) + struct io_wq *wq) __must_hold(acct->lock) { struct io_wq_work_node *node, *prev; struct io_wq_work *work, *tail; unsigned int stall_hash = -1U; - struct io_wq *wq = worker->wq; wq_list_for_each(node, prev, &acct->work_list) { + unsigned int work_flags; unsigned int hash; work = container_of(node, struct io_wq_work, list); /* not hashed, can run anytime */ - if (!io_wq_is_hashed(work)) { + work_flags = atomic_read(&work->flags); + if (!__io_wq_is_hashed(work_flags)) { wq_list_del(&acct->work_list, node, prev); return work; } - hash = io_get_work_hash(work); + hash = __io_get_work_hash(work_flags); /* all items with this hash lie in [work, tail] */ tail = wq->hash_tail[hash]; @@ -564,7 +602,7 @@ static void io_worker_handle_work(struct io_wq_acct *acct, * can't make progress, any work completion or insertion will * clear the stalled flag. */ - work = io_get_next_work(acct, worker); + work = io_get_next_work(acct, wq); if (work) { /* * Make sure cancelation can find this, even before @@ -583,7 +621,7 @@ static void io_worker_handle_work(struct io_wq_acct *acct, if (!work) break; - __io_worker_busy(wq, worker); + __io_worker_busy(acct, worker); io_assign_current_work(worker, work); __set_current_state(TASK_RUNNING); @@ -591,17 +629,20 @@ static void io_worker_handle_work(struct io_wq_acct *acct, /* handle a whole dependent link */ do { struct io_wq_work *next_hashed, *linked; - unsigned int hash = io_get_work_hash(work); + unsigned int work_flags = atomic_read(&work->flags); + unsigned int hash = __io_wq_is_hashed(work_flags) + ? 
__io_get_work_hash(work_flags) + : -1U; next_hashed = wq_next_work(work); if (do_kill && - (atomic_read(&work->flags) & IO_WQ_WORK_UNBOUND)) + (work_flags & IO_WQ_WORK_UNBOUND)) atomic_or(IO_WQ_WORK_CANCEL, &work->flags); - wq->do_work(work); + io_wq_submit_work(work); io_assign_current_work(worker, NULL); - linked = wq->free_work(work); + linked = io_wq_free_work(work); work = next_hashed; if (!work && linked && !io_wq_is_hashed(linked)) { work = linked; @@ -634,7 +675,7 @@ static int io_wq_worker(void *data) struct io_wq_acct *acct = io_wq_get_acct(worker); struct io_wq *wq = worker->wq; bool exit_mask = false, last_timeout = false; - char buf[TASK_COMM_LEN]; + char buf[TASK_COMM_LEN] = {}; set_mask_bits(&worker->flags, 0, BIT(IO_WORKER_F_UP) | BIT(IO_WORKER_F_RUNNING)); @@ -654,20 +695,20 @@ static int io_wq_worker(void *data) while (io_acct_run_queue(acct)) io_worker_handle_work(acct, worker); - raw_spin_lock(&wq->lock); + raw_spin_lock(&acct->workers_lock); /* * Last sleep timed out. Exit if we're not the last worker, * or if someone modified our affinity. */ if (last_timeout && (exit_mask || acct->nr_workers > 1)) { acct->nr_workers--; - raw_spin_unlock(&wq->lock); + raw_spin_unlock(&acct->workers_lock); __set_current_state(TASK_RUNNING); break; } last_timeout = false; - __io_worker_idle(wq, worker); - raw_spin_unlock(&wq->lock); + __io_worker_idle(acct, worker); + raw_spin_unlock(&acct->workers_lock); if (io_run_task_work()) continue; ret = schedule_timeout(WORKER_IDLE_TIMEOUT); @@ -728,18 +769,18 @@ void io_wq_worker_sleeping(struct task_struct *tsk) io_wq_dec_running(worker); } -static void io_init_new_worker(struct io_wq *wq, struct io_worker *worker, +static void io_init_new_worker(struct io_wq *wq, struct io_wq_acct *acct, struct io_worker *worker, struct task_struct *tsk) { tsk->worker_private = worker; worker->task = tsk; set_cpus_allowed_ptr(tsk, wq->cpu_mask); - raw_spin_lock(&wq->lock); - hlist_nulls_add_head_rcu(&worker->nulls_node, &wq->free_list); - list_add_tail_rcu(&worker->all_list, &wq->all_list); + raw_spin_lock(&acct->workers_lock); + hlist_nulls_add_head_rcu(&worker->nulls_node, &acct->free_list); + list_add_tail_rcu(&worker->all_list, &acct->all_list); set_bit(IO_WORKER_F_FREE, &worker->flags); - raw_spin_unlock(&wq->lock); + raw_spin_unlock(&acct->workers_lock); wake_up_new_task(tsk); } @@ -770,25 +811,37 @@ static inline bool io_should_retry_thread(struct io_worker *worker, long err) } } +static void queue_create_worker_retry(struct io_worker *worker) +{ + /* + * We only bother retrying because there's a chance that the + * failure to create a worker is due to some temporary condition + * in the forking task (e.g. outstanding signal); give the task + * some time to clear that condition. 
+ */ + schedule_delayed_work(&worker->work, + msecs_to_jiffies(worker->init_retries * 5)); +} + static void create_worker_cont(struct callback_head *cb) { struct io_worker *worker; struct task_struct *tsk; struct io_wq *wq; + struct io_wq_acct *acct; worker = container_of(cb, struct io_worker, create_work); clear_bit_unlock(0, &worker->create_state); wq = worker->wq; + acct = io_wq_get_acct(worker); tsk = create_io_thread(io_wq_worker, worker, NUMA_NO_NODE); if (!IS_ERR(tsk)) { - io_init_new_worker(wq, worker, tsk); + io_init_new_worker(wq, acct, worker, tsk); io_worker_release(worker); return; } else if (!io_should_retry_thread(worker, PTR_ERR(tsk))) { - struct io_wq_acct *acct = io_wq_get_acct(worker); - atomic_dec(&acct->nr_running); - raw_spin_lock(&wq->lock); + raw_spin_lock(&acct->workers_lock); acct->nr_workers--; if (!acct->nr_workers) { struct io_cb_cancel_data match = { @@ -796,11 +849,11 @@ static void create_worker_cont(struct callback_head *cb) .cancel_all = true, }; - raw_spin_unlock(&wq->lock); + raw_spin_unlock(&acct->workers_lock); while (io_acct_cancel_pending_work(wq, acct, &match)) ; } else { - raw_spin_unlock(&wq->lock); + raw_spin_unlock(&acct->workers_lock); } io_worker_ref_put(wq); kfree(worker); @@ -809,21 +862,21 @@ static void create_worker_cont(struct callback_head *cb) /* re-create attempts grab a new worker ref, drop the existing one */ io_worker_release(worker); - schedule_work(&worker->work); + queue_create_worker_retry(worker); } static void io_workqueue_create(struct work_struct *work) { - struct io_worker *worker = container_of(work, struct io_worker, work); + struct io_worker *worker = container_of(work, struct io_worker, + work.work); struct io_wq_acct *acct = io_wq_get_acct(worker); if (!io_queue_worker_create(worker, acct, create_worker_cont)) kfree(worker); } -static bool create_io_worker(struct io_wq *wq, int index) +static bool create_io_worker(struct io_wq *wq, struct io_wq_acct *acct) { - struct io_wq_acct *acct = &wq->acct[index]; struct io_worker *worker; struct task_struct *tsk; @@ -833,30 +886,28 @@ static bool create_io_worker(struct io_wq *wq, int index) if (!worker) { fail: atomic_dec(&acct->nr_running); - raw_spin_lock(&wq->lock); + raw_spin_lock(&acct->workers_lock); acct->nr_workers--; - raw_spin_unlock(&wq->lock); + raw_spin_unlock(&acct->workers_lock); io_worker_ref_put(wq); return false; } refcount_set(&worker->ref, 1); worker->wq = wq; + worker->acct = acct; raw_spin_lock_init(&worker->lock); init_completion(&worker->ref_done); - if (index == IO_WQ_ACCT_BOUND) - set_bit(IO_WORKER_F_BOUND, &worker->flags); - tsk = create_io_thread(io_wq_worker, worker, NUMA_NO_NODE); if (!IS_ERR(tsk)) { - io_init_new_worker(wq, worker, tsk); + io_init_new_worker(wq, acct, worker, tsk); } else if (!io_should_retry_thread(worker, PTR_ERR(tsk))) { kfree(worker); goto fail; } else { - INIT_WORK(&worker->work, io_workqueue_create); - schedule_work(&worker->work); + INIT_DELAYED_WORK(&worker->work, io_workqueue_create); + queue_create_worker_retry(worker); } return true; @@ -866,14 +917,14 @@ fail: * Iterate the passed in list and call the specific function for each * worker that isn't exiting */ -static bool io_wq_for_each_worker(struct io_wq *wq, - bool (*func)(struct io_worker *, void *), - void *data) +static bool io_acct_for_each_worker(struct io_wq_acct *acct, + bool (*func)(struct io_worker *, void *), + void *data) { struct io_worker *worker; bool ret = false; - list_for_each_entry_rcu(worker, &wq->all_list, all_list) { + 
list_for_each_entry_rcu(worker, &acct->all_list, all_list) { if (io_worker_get(worker)) { /* no task if node is/was offline */ if (worker->task) @@ -887,6 +938,18 @@ static bool io_wq_for_each_worker(struct io_wq *wq, return ret; } +static bool io_wq_for_each_worker(struct io_wq *wq, + bool (*func)(struct io_worker *, void *), + void *data) +{ + for (int i = 0; i < IO_WQ_ACCT_NR; i++) { + if (!io_acct_for_each_worker(&wq->acct[i], func, data)) + return false; + } + + return true; +} + static bool io_wq_worker_wake(struct io_worker *worker, void *data) { __set_notify_signal(worker->task); @@ -898,24 +961,24 @@ static void io_run_cancel(struct io_wq_work *work, struct io_wq *wq) { do { atomic_or(IO_WQ_WORK_CANCEL, &work->flags); - wq->do_work(work); - work = wq->free_work(work); + io_wq_submit_work(work); + work = io_wq_free_work(work); } while (work); } -static void io_wq_insert_work(struct io_wq *wq, struct io_wq_work *work) +static void io_wq_insert_work(struct io_wq *wq, struct io_wq_acct *acct, + struct io_wq_work *work, unsigned int work_flags) { - struct io_wq_acct *acct = io_work_get_acct(wq, work); unsigned int hash; struct io_wq_work *tail; - if (!io_wq_is_hashed(work)) { + if (!__io_wq_is_hashed(work_flags)) { append: wq_list_add_tail(&work->list, &acct->work_list); return; } - hash = io_get_work_hash(work); + hash = __io_get_work_hash(work_flags); tail = wq->hash_tail[hash]; wq->hash_tail[hash] = work; if (!tail) @@ -931,8 +994,8 @@ static bool io_wq_work_match_item(struct io_wq_work *work, void *data) void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work) { - struct io_wq_acct *acct = io_work_get_acct(wq, work); unsigned int work_flags = atomic_read(&work->flags); + struct io_wq_acct *acct = io_work_get_acct(wq, work_flags); struct io_cb_cancel_data match = { .fn = io_wq_work_match_item, .data = work, @@ -951,12 +1014,12 @@ void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work) } raw_spin_lock(&acct->lock); - io_wq_insert_work(wq, work); + io_wq_insert_work(wq, acct, work, work_flags); clear_bit(IO_ACCT_STALLED_BIT, &acct->flags); raw_spin_unlock(&acct->lock); rcu_read_lock(); - do_create = !io_wq_activate_free_worker(wq, acct); + do_create = !io_acct_activate_free_worker(acct); rcu_read_unlock(); if (do_create && ((work_flags & IO_WQ_WORK_CONCURRENT) || @@ -967,12 +1030,12 @@ void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work) if (likely(did_create)) return; - raw_spin_lock(&wq->lock); + raw_spin_lock(&acct->workers_lock); if (acct->nr_workers) { - raw_spin_unlock(&wq->lock); + raw_spin_unlock(&acct->workers_lock); return; } - raw_spin_unlock(&wq->lock); + raw_spin_unlock(&acct->workers_lock); /* fatal condition, failed to create the first worker */ io_acct_cancel_pending_work(wq, acct, &match); @@ -1021,10 +1084,10 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data) } static inline void io_wq_remove_pending(struct io_wq *wq, + struct io_wq_acct *acct, struct io_wq_work *work, struct io_wq_work_node *prev) { - struct io_wq_acct *acct = io_work_get_acct(wq, work); unsigned int hash = io_get_work_hash(work); struct io_wq_work *prev_work = NULL; @@ -1051,7 +1114,7 @@ static bool io_acct_cancel_pending_work(struct io_wq *wq, work = container_of(node, struct io_wq_work, list); if (!match->fn(work, match->data)) continue; - io_wq_remove_pending(wq, work, prev); + io_wq_remove_pending(wq, acct, work, prev); raw_spin_unlock(&acct->lock); io_run_cancel(work, wq); match->nr_pending++; @@ -1079,11 +1142,22 @@ retry: } } +static void 
io_acct_cancel_running_work(struct io_wq_acct *acct, + struct io_cb_cancel_data *match) +{ + raw_spin_lock(&acct->workers_lock); + io_acct_for_each_worker(acct, io_wq_worker_cancel, match); + raw_spin_unlock(&acct->workers_lock); +} + static void io_wq_cancel_running_work(struct io_wq *wq, struct io_cb_cancel_data *match) { rcu_read_lock(); - io_wq_for_each_worker(wq, io_wq_worker_cancel, match); + + for (int i = 0; i < IO_WQ_ACCT_NR; i++) + io_acct_cancel_running_work(&wq->acct[i], match); + rcu_read_unlock(); } @@ -1106,16 +1180,14 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, * as an indication that we attempt to signal cancellation. The * completion will run normally in this case. * - * Do both of these while holding the wq->lock, to ensure that + * Do both of these while holding the acct->workers_lock, to ensure that * we'll find a work item regardless of state. */ io_wq_cancel_pending_work(wq, &match); if (match.nr_pending && !match.cancel_all) return IO_WQ_CANCEL_OK; - raw_spin_lock(&wq->lock); io_wq_cancel_running_work(wq, &match); - raw_spin_unlock(&wq->lock); if (match.nr_running && !match.cancel_all) return IO_WQ_CANCEL_RUNNING; @@ -1139,7 +1211,7 @@ static int io_wq_hash_wake(struct wait_queue_entry *wait, unsigned mode, struct io_wq_acct *acct = &wq->acct[i]; if (test_and_clear_bit(IO_ACCT_STALLED_BIT, &acct->flags)) - io_wq_activate_free_worker(wq, acct); + io_acct_activate_free_worker(acct); } rcu_read_unlock(); return 1; @@ -1150,8 +1222,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) int ret, i; struct io_wq *wq; - if (WARN_ON_ONCE(!data->free_work || !data->do_work)) - return ERR_PTR(-EINVAL); if (WARN_ON_ONCE(!bounded)) return ERR_PTR(-EINVAL); @@ -1161,8 +1231,6 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) refcount_inc(&data->hash->refs); wq->hash = data->hash; - wq->free_work = data->free_work; - wq->do_work = data->do_work; ret = -ENOMEM; @@ -1177,22 +1245,24 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) for (i = 0; i < IO_WQ_ACCT_NR; i++) { struct io_wq_acct *acct = &wq->acct[i]; - acct->index = i; atomic_set(&acct->nr_running, 0); + + raw_spin_lock_init(&acct->workers_lock); + INIT_HLIST_NULLS_HEAD(&acct->free_list, 0); + INIT_LIST_HEAD(&acct->all_list); + INIT_WQ_LIST(&acct->work_list); raw_spin_lock_init(&acct->lock); } - raw_spin_lock_init(&wq->lock); - INIT_HLIST_NULLS_HEAD(&wq->free_list, 0); - INIT_LIST_HEAD(&wq->all_list); - wq->task = get_task_struct(data->task); atomic_set(&wq->worker_refs, 1); init_completion(&wq->worker_done); ret = cpuhp_state_add_instance_nocalls(io_wq_online, &wq->cpuhp_node); - if (ret) + if (ret) { + put_task_struct(wq->task); goto err; + } return wq; err: @@ -1372,14 +1442,14 @@ int io_wq_max_workers(struct io_wq *wq, int *new_count) rcu_read_lock(); - raw_spin_lock(&wq->lock); for (i = 0; i < IO_WQ_ACCT_NR; i++) { acct = &wq->acct[i]; + raw_spin_lock(&acct->workers_lock); prev[i] = max_t(int, acct->max_workers, prev[i]); if (new_count[i]) acct->max_workers = new_count[i]; + raw_spin_unlock(&acct->workers_lock); } - raw_spin_unlock(&wq->lock); rcu_read_unlock(); for (i = 0; i < IO_WQ_ACCT_NR; i++) diff --git a/io_uring/io-wq.h b/io_uring/io-wq.h index b3b004a7b625..774abab54732 100644 --- a/io_uring/io-wq.h +++ b/io_uring/io-wq.h @@ -21,9 +21,6 @@ enum io_wq_cancel { IO_WQ_CANCEL_NOTFOUND, /* work not found */ }; -typedef struct io_wq_work *(free_work_fn)(struct io_wq_work *); -typedef void (io_wq_work_fn)(struct 
io_wq_work *); - struct io_wq_hash { refcount_t refs; unsigned long map; @@ -39,8 +36,6 @@ static inline void io_wq_put_hash(struct io_wq_hash *hash) struct io_wq_data { struct io_wq_hash *hash; struct task_struct *task; - io_wq_work_fn *do_work; - free_work_fn *free_work; }; struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data); @@ -54,9 +49,14 @@ int io_wq_cpu_affinity(struct io_uring_task *tctx, cpumask_var_t mask); int io_wq_max_workers(struct io_wq *wq, int *new_count); bool io_wq_worker_stopped(void); +static inline bool __io_wq_is_hashed(unsigned int work_flags) +{ + return work_flags & IO_WQ_WORK_HASHED; +} + static inline bool io_wq_is_hashed(struct io_wq_work *work) { - return atomic_read(&work->flags) & IO_WQ_WORK_HASHED; + return __io_wq_is_hashed(atomic_read(&work->flags)); } typedef bool (work_cancel_fn)(struct io_wq_work *, void *); diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index b2736e3491b8..5111ec040c53 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -51,7 +51,6 @@ #include <linux/sched/signal.h> #include <linux/fs.h> #include <linux/file.h> -#include <linux/fdtable.h> #include <linux/mm.h> #include <linux/mman.h> #include <linux/percpu.h> @@ -70,6 +69,7 @@ #include <linux/io_uring/cmd.h> #include <linux/audit.h> #include <linux/security.h> +#include <linux/jump_label.h> #include <asm/shmparam.h> #define CREATE_TRACE_POINTS @@ -97,6 +97,7 @@ #include "uring_cmd.h" #include "msg_ring.h" #include "memmap.h" +#include "zcrx.h" #include "timeout.h" #include "poll.h" @@ -104,36 +105,34 @@ #include "alloc_cache.h" #include "eventfd.h" -#define IORING_MAX_ENTRIES 32768 -#define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) - #define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \ IOSQE_IO_HARDLINK | IOSQE_ASYNC) #define SQE_VALID_FLAGS (SQE_COMMON_FLAGS | IOSQE_BUFFER_SELECT | \ IOSQE_IO_DRAIN | IOSQE_CQE_SKIP_SUCCESS) +#define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK) + #define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \ REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \ REQ_F_ASYNC_DATA) -#define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | REQ_F_LINK | REQ_F_HARDLINK |\ - IO_REQ_CLEAN_FLAGS) +#define IO_REQ_CLEAN_SLOW_FLAGS (REQ_F_REFCOUNT | IO_REQ_LINK_FLAGS | \ + REQ_F_REISSUE | IO_REQ_CLEAN_FLAGS) #define IO_TCTX_REFS_CACHE_NR (1U << 10) #define IO_COMPL_BATCH 32 #define IO_REQ_ALLOC_BATCH 8 +#define IO_LOCAL_TW_DEFAULT_MAX 20 struct io_defer_entry { struct list_head list; struct io_kiocb *req; - u32 seq; }; /* requests with any of those set should undergo io_disarm_next() */ #define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL) -#define IO_REQ_LINK_FLAGS (REQ_F_LINK | REQ_F_HARDLINK) /* * No waiters. 
It's larger than any valid value of the tw counter @@ -144,10 +143,14 @@ struct io_defer_entry { #define IO_CQ_WAKE_FORCE (IO_CQ_WAKE_INIT >> 1) static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, - struct task_struct *task, - bool cancel_all); + struct io_uring_task *tctx, + bool cancel_all, + bool is_sqpoll_thread); static void io_queue_sqe(struct io_kiocb *req); +static void __io_req_caches_free(struct io_ring_ctx *ctx); + +static __read_mostly DEFINE_STATIC_KEY_FALSE(io_key_has_sqarray); struct kmem_cache *req_cachep; static struct workqueue_struct *iou_wq __ro_after_init; @@ -156,7 +159,7 @@ static int __read_mostly sysctl_io_uring_disabled; static int __read_mostly sysctl_io_uring_group = -1; #ifdef CONFIG_SYSCTL -static struct ctl_table kernel_io_uring_disabled_table[] = { +static const struct ctl_table kernel_io_uring_disabled_table[] = { { .procname = "io_uring_disabled", .data = &sysctl_io_uring_disabled, @@ -201,12 +204,12 @@ static bool io_match_linked(struct io_kiocb *head) * As io_match_task() but protected against racing with linked timeouts. * User must not hold timeout_lock. */ -bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, +bool io_match_task_safe(struct io_kiocb *head, struct io_uring_task *tctx, bool cancel_all) { bool matched; - if (task && head->task != task) + if (tctx && head->tctx != tctx) return false; if (cancel_all) return true; @@ -215,9 +218,9 @@ bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, struct io_ring_ctx *ctx = head->ctx; /* protect against races with linked timeouts */ - spin_lock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); matched = io_match_linked(head); - spin_unlock_irq(&ctx->timeout_lock); + raw_spin_unlock_irq(&ctx->timeout_lock); } else { matched = io_match_linked(head); } @@ -253,7 +256,7 @@ static __cold void io_fallback_req_func(struct work_struct *work) percpu_ref_get(&ctx->refs); mutex_lock(&ctx->uring_lock); llist_for_each_entry_safe(req, tmp, node, io_task_work.node) - req->io_task_work.func(req, &ts); + req->io_task_work.func(req, ts); io_submit_flush_completions(ctx); mutex_unlock(&ctx->uring_lock); percpu_ref_put(&ctx->refs); @@ -261,18 +264,37 @@ static __cold void io_fallback_req_func(struct work_struct *work) static int io_alloc_hash_table(struct io_hash_table *table, unsigned bits) { - unsigned hash_buckets = 1U << bits; - size_t hash_size = hash_buckets * sizeof(table->hbs[0]); + unsigned int hash_buckets; + int i; - table->hbs = kmalloc(hash_size, GFP_KERNEL); - if (!table->hbs) - return -ENOMEM; + do { + hash_buckets = 1U << bits; + table->hbs = kvmalloc_array(hash_buckets, sizeof(table->hbs[0]), + GFP_KERNEL_ACCOUNT); + if (table->hbs) + break; + if (bits == 1) + return -ENOMEM; + bits--; + } while (1); table->hash_bits = bits; - init_hash_table(table, hash_buckets); + for (i = 0; i < hash_buckets; i++) + INIT_HLIST_HEAD(&table->hbs[i].list); return 0; } +static void io_free_alloc_caches(struct io_ring_ctx *ctx) +{ + io_alloc_cache_free(&ctx->apoll_cache, kfree); + io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); + io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free); + io_alloc_cache_free(&ctx->cmd_cache, io_cmd_cache_free); + io_alloc_cache_free(&ctx->msg_cache, kfree); + io_futex_cache_free(ctx); + io_rsrc_cache_free(ctx); +} + static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) { struct io_ring_ctx *ctx; @@ -294,32 +316,32 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct 
io_uring_params *p) hash_bits = clamp(hash_bits, 1, 8); if (io_alloc_hash_table(&ctx->cancel_table, hash_bits)) goto err; - if (io_alloc_hash_table(&ctx->cancel_table_locked, hash_bits)) - goto err; if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, 0, GFP_KERNEL)) goto err; ctx->flags = p->flags; + ctx->hybrid_poll_time = LLONG_MAX; atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT); init_waitqueue_head(&ctx->sqo_sq_wait); INIT_LIST_HEAD(&ctx->sqd_list); INIT_LIST_HEAD(&ctx->cq_overflow_list); - INIT_LIST_HEAD(&ctx->io_buffers_cache); - ret = io_alloc_cache_init(&ctx->rsrc_node_cache, IO_NODE_ALLOC_CACHE_MAX, - sizeof(struct io_rsrc_node)); - ret |= io_alloc_cache_init(&ctx->apoll_cache, IO_POLL_ALLOC_CACHE_MAX, - sizeof(struct async_poll)); + ret = io_alloc_cache_init(&ctx->apoll_cache, IO_POLL_ALLOC_CACHE_MAX, + sizeof(struct async_poll), 0); ret |= io_alloc_cache_init(&ctx->netmsg_cache, IO_ALLOC_CACHE_MAX, - sizeof(struct io_async_msghdr)); + sizeof(struct io_async_msghdr), + offsetof(struct io_async_msghdr, clear)); ret |= io_alloc_cache_init(&ctx->rw_cache, IO_ALLOC_CACHE_MAX, - sizeof(struct io_async_rw)); - ret |= io_alloc_cache_init(&ctx->uring_cache, IO_ALLOC_CACHE_MAX, - sizeof(struct uring_cache)); + sizeof(struct io_async_rw), + offsetof(struct io_async_rw, clear)); + ret |= io_alloc_cache_init(&ctx->cmd_cache, IO_ALLOC_CACHE_MAX, + sizeof(struct io_async_cmd), + sizeof(struct io_async_cmd)); spin_lock_init(&ctx->msg_lock); ret |= io_alloc_cache_init(&ctx->msg_cache, IO_ALLOC_CACHE_MAX, - sizeof(struct io_kiocb)); + sizeof(struct io_kiocb), 0); ret |= io_futex_cache_init(ctx); + ret |= io_rsrc_cache_init(ctx); if (ret) goto free_ref; init_completion(&ctx->ref_comp); @@ -327,19 +349,17 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) mutex_init(&ctx->uring_lock); init_waitqueue_head(&ctx->cq_wait); init_waitqueue_head(&ctx->poll_wq); - init_waitqueue_head(&ctx->rsrc_quiesce_wq); spin_lock_init(&ctx->completion_lock); - spin_lock_init(&ctx->timeout_lock); + raw_spin_lock_init(&ctx->timeout_lock); INIT_WQ_LIST(&ctx->iopoll_list); - INIT_LIST_HEAD(&ctx->io_buffers_comp); INIT_LIST_HEAD(&ctx->defer_list); INIT_LIST_HEAD(&ctx->timeout_list); INIT_LIST_HEAD(&ctx->ltimeout_list); - INIT_LIST_HEAD(&ctx->rsrc_ref_list); init_llist_head(&ctx->work_llist); INIT_LIST_HEAD(&ctx->tctx_list); ctx->submit_state.free_list.next = NULL; INIT_HLIST_HEAD(&ctx->waitid_list); + xa_init_flags(&ctx->zcrx_ctxs, XA_FLAGS_ALLOC); #ifdef CONFIG_FUTEX INIT_HLIST_HEAD(&ctx->futex_list); #endif @@ -347,52 +367,24 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_WQ_LIST(&ctx->submit_state.compl_reqs); INIT_HLIST_HEAD(&ctx->cancelable_uring_cmd); io_napi_init(ctx); + mutex_init(&ctx->mmap_lock); return ctx; free_ref: percpu_ref_exit(&ctx->refs); err: - io_alloc_cache_free(&ctx->rsrc_node_cache, kfree); - io_alloc_cache_free(&ctx->apoll_cache, kfree); - io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); - io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free); - io_alloc_cache_free(&ctx->uring_cache, kfree); - io_alloc_cache_free(&ctx->msg_cache, io_msg_cache_free); - io_futex_cache_free(ctx); - kfree(ctx->cancel_table.hbs); - kfree(ctx->cancel_table_locked.hbs); + io_free_alloc_caches(ctx); + kvfree(ctx->cancel_table.hbs); xa_destroy(&ctx->io_bl_xa); kfree(ctx); return NULL; } -static void io_account_cq_overflow(struct io_ring_ctx *ctx) -{ - struct io_rings *r = ctx->rings; - - WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 
1); - ctx->cq_extra--; -} - -static bool req_need_defer(struct io_kiocb *req, u32 seq) -{ - if (unlikely(req->flags & REQ_F_IO_DRAIN)) { - struct io_ring_ctx *ctx = req->ctx; - - return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail; - } - - return false; -} - static void io_clean_op(struct io_kiocb *req) { - if (req->flags & REQ_F_BUFFER_SELECTED) { - spin_lock(&req->ctx->completion_lock); - io_kbuf_drop(req); - spin_unlock(&req->ctx->completion_lock); - } + if (unlikely(req->flags & REQ_F_BUFFER_SELECTED)) + io_kbuf_drop_legacy(req); if (req->flags & REQ_F_NEED_CLEANUP) { const struct io_cold_def *def = &io_cold_defs[req->opcode]; @@ -405,11 +397,8 @@ static void io_clean_op(struct io_kiocb *req) kfree(req->apoll); req->apoll = NULL; } - if (req->flags & REQ_F_INFLIGHT) { - struct io_uring_task *tctx = req->task->io_uring; - - atomic_dec(&tctx->inflight_tracked); - } + if (req->flags & REQ_F_INFLIGHT) + atomic_dec(&req->tctx->inflight_tracked); if (req->flags & REQ_F_CREDS) put_cred(req->creds); if (req->flags & REQ_F_ASYNC_DATA) { @@ -419,11 +408,16 @@ static void io_clean_op(struct io_kiocb *req) req->flags &= ~IO_REQ_CLEAN_FLAGS; } -static inline void io_req_track_inflight(struct io_kiocb *req) +/* + * Mark the request as inflight, so that file cancelation will find it. + * Can be used if the file is an io_uring instance, or if the request itself + * relies on ->mm being alive for the duration of the request. + */ +inline void io_req_track_inflight(struct io_kiocb *req) { if (!(req->flags & REQ_F_INFLIGHT)) { req->flags |= REQ_F_INFLIGHT; - atomic_inc(&req->task->io_uring->inflight_tracked); + atomic_inc(&req->tctx->inflight_tracked); } } @@ -441,24 +435,6 @@ static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req) return req->link; } -static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) -{ - if (likely(!(req->flags & REQ_F_ARM_LTIMEOUT))) - return NULL; - return __io_prep_linked_timeout(req); -} - -static noinline void __io_arm_ltimeout(struct io_kiocb *req) -{ - io_queue_linked_timeout(__io_prep_linked_timeout(req)); -} - -static inline void io_arm_ltimeout(struct io_kiocb *req) -{ - if (unlikely(req->flags & REQ_F_ARM_LTIMEOUT)) - __io_arm_ltimeout(req); -} - static void io_prep_async_work(struct io_kiocb *req) { const struct io_issue_def *def = &io_issue_defs[req->opcode]; @@ -499,10 +475,10 @@ static void io_prep_async_link(struct io_kiocb *req) if (req->flags & REQ_F_LINK_TIMEOUT) { struct io_ring_ctx *ctx = req->ctx; - spin_lock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); io_for_each_link(cur, req) io_prep_async_work(cur); - spin_unlock_irq(&ctx->timeout_lock); + raw_spin_unlock_irq(&ctx->timeout_lock); } else { io_for_each_link(cur, req) io_prep_async_work(cur); @@ -511,11 +487,14 @@ static void io_prep_async_link(struct io_kiocb *req) static void io_queue_iowq(struct io_kiocb *req) { - struct io_kiocb *link = io_prep_linked_timeout(req); - struct io_uring_task *tctx = req->task->io_uring; + struct io_uring_task *tctx = req->tctx; BUG_ON(!tctx); - BUG_ON(!tctx->io_wq); + + if ((current->flags & PF_KTHREAD) || !tctx->io_wq) { + io_req_task_queue_fail(req, -ECANCELED); + return; + } /* init ->work of the whole link before punting */ io_prep_async_link(req); @@ -527,16 +506,14 @@ static void io_queue_iowq(struct io_kiocb *req) * procedure rather than attempt to run this request (or create a new * worker for it). 
*/ - if (WARN_ON_ONCE(!same_thread_group(req->task, current))) + if (WARN_ON_ONCE(!same_thread_group(tctx->task, current))) atomic_or(IO_WQ_WORK_CANCEL, &req->work.flags); trace_io_uring_queue_async_work(req, io_wq_is_hashed(&req->work)); io_wq_enqueue(tctx->io_wq, &req->work); - if (link) - io_queue_linked_timeout(link); } -static void io_req_queue_iowq_tw(struct io_kiocb *req, struct io_tw_state *ts) +static void io_req_queue_iowq_tw(struct io_kiocb *req, io_tw_token_t tw) { io_queue_iowq(req); } @@ -547,17 +524,36 @@ void io_req_queue_iowq(struct io_kiocb *req) io_req_task_work_add(req); } -static __cold void io_queue_deferred(struct io_ring_ctx *ctx) +static unsigned io_linked_nr(struct io_kiocb *req) { + struct io_kiocb *tmp; + unsigned nr = 0; + + io_for_each_link(tmp, req) + nr++; + return nr; +} + +static __cold noinline void io_queue_deferred(struct io_ring_ctx *ctx) +{ + bool drain_seen = false, first = true; + + lockdep_assert_held(&ctx->uring_lock); + __io_req_caches_free(ctx); + while (!list_empty(&ctx->defer_list)) { struct io_defer_entry *de = list_first_entry(&ctx->defer_list, struct io_defer_entry, list); - if (req_need_defer(de->req, de->seq)) - break; + drain_seen |= de->req->flags & REQ_F_IO_DRAIN; + if ((drain_seen || first) && ctx->nr_req_allocated != ctx->nr_drained) + return; + list_del_init(&de->list); + ctx->nr_drained -= io_linked_nr(de->req); io_req_task_queue(de->req); kfree(de); + first = false; } } @@ -567,13 +563,8 @@ void __io_commit_cqring_flush(struct io_ring_ctx *ctx) io_poll_wq_wake(ctx); if (ctx->off_timeout_used) io_flush_timeouts(ctx); - if (ctx->drain_active) { - spin_lock(&ctx->completion_lock); - io_queue_deferred(ctx); - spin_unlock(&ctx->completion_lock); - } if (ctx->has_evfd) - io_eventfd_flush_signal(ctx); + io_eventfd_signal(ctx, true); } static inline void __io_cq_lock(struct io_ring_ctx *ctx) @@ -647,6 +638,7 @@ static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool dying) * to care for a non-real case. 
*/ if (need_resched()) { + ctx->cqe_sentinel = ctx->cqe_cached; io_cq_unlock_post(ctx); mutex_unlock(&ctx->uring_lock); cond_resched(); @@ -675,30 +667,19 @@ static void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx) mutex_unlock(&ctx->uring_lock); } -/* can be called by any task */ -static void io_put_task_remote(struct task_struct *task) -{ - struct io_uring_task *tctx = task->io_uring; - - percpu_counter_sub(&tctx->inflight, 1); - if (unlikely(atomic_read(&tctx->in_cancel))) - wake_up(&tctx->wait); - put_task_struct(task); -} - -/* used by a task to put its own references */ -static void io_put_task_local(struct task_struct *task) -{ - task->io_uring->cached_refs++; -} - /* must to be called somewhat shortly after putting a request */ -static inline void io_put_task(struct task_struct *task) +static inline void io_put_task(struct io_kiocb *req) { - if (likely(task == current)) - io_put_task_local(task); - else - io_put_task_remote(task); + struct io_uring_task *tctx = req->tctx; + + if (likely(tctx->task == current)) { + tctx->cached_refs++; + } else { + percpu_counter_sub(&tctx->inflight, 1); + if (unlikely(atomic_read(&tctx->in_cancel))) + wake_up(&tctx->wait); + put_task_struct(tctx->task); + } } void io_task_refs_refill(struct io_uring_task *tctx) @@ -722,27 +703,20 @@ static __cold void io_uring_drop_tctx_refs(struct task_struct *task) } } -static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, - s32 res, u32 cflags, u64 extra1, u64 extra2) +static __cold bool io_cqring_add_overflow(struct io_ring_ctx *ctx, + struct io_overflow_cqe *ocqe) { - struct io_overflow_cqe *ocqe; - size_t ocq_size = sizeof(struct io_overflow_cqe); - bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32); - lockdep_assert_held(&ctx->completion_lock); - if (is_cqe32) - ocq_size += sizeof(struct io_uring_cqe); - - ocqe = kmalloc(ocq_size, GFP_ATOMIC | __GFP_ACCOUNT); - trace_io_uring_cqe_overflow(ctx, user_data, res, cflags, ocqe); if (!ocqe) { + struct io_rings *r = ctx->rings; + /* * If we're in ring overflow flush mode, or in task cancel mode, * or cannot allocate an overflow entry, then we need to drop it * on the floor. 
*/ - io_account_cq_overflow(ctx); + WRITE_ONCE(r->cq_overflow, READ_ONCE(r->cq_overflow) + 1); set_bit(IO_CHECK_CQ_DROPPED_BIT, &ctx->check_cq); return false; } @@ -751,23 +725,35 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, atomic_or(IORING_SQ_CQ_OVERFLOW, &ctx->rings->sq_flags); } - ocqe->cqe.user_data = user_data; - ocqe->cqe.res = res; - ocqe->cqe.flags = cflags; - if (is_cqe32) { - ocqe->cqe.big_cqe[0] = extra1; - ocqe->cqe.big_cqe[1] = extra2; - } list_add_tail(&ocqe->list, &ctx->cq_overflow_list); return true; } -static void io_req_cqe_overflow(struct io_kiocb *req) +static struct io_overflow_cqe *io_alloc_ocqe(struct io_ring_ctx *ctx, + struct io_cqe *cqe, + struct io_big_cqe *big_cqe, gfp_t gfp) { - io_cqring_event_overflow(req->ctx, req->cqe.user_data, - req->cqe.res, req->cqe.flags, - req->big_cqe.extra1, req->big_cqe.extra2); - memset(&req->big_cqe, 0, sizeof(req->big_cqe)); + struct io_overflow_cqe *ocqe; + size_t ocq_size = sizeof(struct io_overflow_cqe); + bool is_cqe32 = (ctx->flags & IORING_SETUP_CQE32); + + if (is_cqe32) + ocq_size += sizeof(struct io_uring_cqe); + + ocqe = kzalloc(ocq_size, gfp | __GFP_ACCOUNT); + trace_io_uring_cqe_overflow(ctx, cqe->user_data, cqe->res, cqe->flags, ocqe); + if (ocqe) { + ocqe->cqe.user_data = cqe->user_data; + ocqe->cqe.res = cqe->res; + ocqe->cqe.flags = cqe->flags; + if (is_cqe32 && big_cqe) { + ocqe->cqe.big_cqe[0] = big_cqe->extra1; + ocqe->cqe.big_cqe[1] = big_cqe->extra2; + } + } + if (big_cqe) + big_cqe->extra1 = big_cqe->extra2 = 0; + return ocqe; } /* @@ -812,16 +798,7 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, { struct io_uring_cqe *cqe; - ctx->cq_extra++; - - /* - * If we can't get a cq entry, userspace overflowed the - * submission (by quite a lot). Increment the overflow count in - * the ring. 
- */ if (likely(io_get_cqe(ctx, &cqe))) { - trace_io_uring_complete(ctx, NULL, user_data, res, cflags, 0, 0); - WRITE_ONCE(cqe->user_data, user_data); WRITE_ONCE(cqe->res, res); WRITE_ONCE(cqe->flags, cflags); @@ -830,21 +807,37 @@ static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, WRITE_ONCE(cqe->big_cqe[0], 0); WRITE_ONCE(cqe->big_cqe[1], 0); } + + trace_io_uring_complete(ctx, NULL, cqe); return true; } return false; } -static bool __io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, - u32 cflags) +static inline struct io_cqe io_init_cqe(u64 user_data, s32 res, u32 cflags) { - bool filled; + return (struct io_cqe) { .user_data = user_data, .res = res, .flags = cflags }; +} - filled = io_fill_cqe_aux(ctx, user_data, res, cflags); - if (!filled) - filled = io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0); +static __cold void io_cqe_overflow(struct io_ring_ctx *ctx, struct io_cqe *cqe, + struct io_big_cqe *big_cqe) +{ + struct io_overflow_cqe *ocqe; - return filled; + ocqe = io_alloc_ocqe(ctx, cqe, big_cqe, GFP_KERNEL); + spin_lock(&ctx->completion_lock); + io_cqring_add_overflow(ctx, ocqe); + spin_unlock(&ctx->completion_lock); +} + +static __cold bool io_cqe_overflow_locked(struct io_ring_ctx *ctx, + struct io_cqe *cqe, + struct io_big_cqe *big_cqe) +{ + struct io_overflow_cqe *ocqe; + + ocqe = io_alloc_ocqe(ctx, cqe, big_cqe, GFP_ATOMIC); + return io_cqring_add_overflow(ctx, ocqe); } bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) @@ -852,7 +845,12 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags bool filled; io_cq_lock(ctx); - filled = __io_post_aux_cqe(ctx, user_data, res, cflags); + filled = io_fill_cqe_aux(ctx, user_data, res, cflags); + if (unlikely(!filled)) { + struct io_cqe cqe = io_init_cqe(user_data, res, cflags); + + filled = io_cqe_overflow_locked(ctx, &cqe, NULL); + } io_cq_unlock_post(ctx); return filled; } @@ -863,10 +861,13 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags */ void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) { + lockdep_assert_held(&ctx->uring_lock); + lockdep_assert(ctx->lockless_cq); + if (!io_fill_cqe_aux(ctx, user_data, res, cflags)) { - spin_lock(&ctx->completion_lock); - io_cqring_event_overflow(ctx, user_data, res, cflags, 0, 0); - spin_unlock(&ctx->completion_lock); + struct io_cqe cqe = io_init_cqe(user_data, res, cflags); + + io_cqe_overflow(ctx, &cqe, NULL); } ctx->submit_state.cq_flush = true; } @@ -880,19 +881,33 @@ bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags) struct io_ring_ctx *ctx = req->ctx; bool posted; + /* + * If multishot has already posted deferred completions, ensure that + * those are flushed first before posting this one. If not, CQEs + * could get reordered. 
+ */ + if (!wq_list_empty(&ctx->submit_state.compl_reqs)) + __io_submit_flush_completions(ctx); + lockdep_assert(!io_wq_current_is_worker()); lockdep_assert_held(&ctx->uring_lock); - __io_cq_lock(ctx); - posted = io_fill_cqe_aux(ctx, req->cqe.user_data, res, cflags); + if (!ctx->lockless_cq) { + spin_lock(&ctx->completion_lock); + posted = io_fill_cqe_aux(ctx, req->cqe.user_data, res, cflags); + spin_unlock(&ctx->completion_lock); + } else { + posted = io_fill_cqe_aux(ctx, req->cqe.user_data, res, cflags); + } + ctx->submit_state.cq_flush = true; - __io_cq_unlock_post(ctx); return posted; } static void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) { struct io_ring_ctx *ctx = req->ctx; + bool completed = true; /* * All execution paths but io-wq use the deferred completions by @@ -905,19 +920,21 @@ static void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags) * Handle special CQ sync cases via task_work. DEFER_TASKRUN requires * the submitter task context, IOPOLL protects with uring_lock. */ - if (ctx->task_complete || (ctx->flags & IORING_SETUP_IOPOLL)) { + if (ctx->lockless_cq || (req->flags & REQ_F_REISSUE)) { +defer_complete: req->io_task_work.func = io_req_task_complete; io_req_task_work_add(req); return; } io_cq_lock(ctx); - if (!(req->flags & REQ_F_CQE_SKIP)) { - if (!io_fill_cqe_req(ctx, req)) - io_req_cqe_overflow(req); - } + if (!(req->flags & REQ_F_CQE_SKIP)) + completed = io_fill_cqe_req(ctx, req); io_cq_unlock_post(ctx); + if (!completed) + goto defer_complete; + /* * We don't free the request here because we know it's called from * io-wq only, which holds a reference, so it cannot be the last put. @@ -940,20 +957,6 @@ void io_req_defer_failed(struct io_kiocb *req, s32 res) } /* - * Don't initialise the fields below on every allocation, but do that in - * advance and keep them valid across allocations. - */ -static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx) -{ - req->ctx = ctx; - req->link = NULL; - req->async_data = NULL; - /* not necessary, but safer to zero */ - memset(&req->cqe, 0, sizeof(req->cqe)); - memset(&req->big_cqe, 0, sizeof(req->big_cqe)); -} - -/* * A request might get retired back into the request caches even before opcode * handlers and io_issue_sqe() are done with it, e.g. inline completion path. 
* Because of that, io_alloc_req() should be called only under ->uring_lock @@ -962,7 +965,7 @@ static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx) __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx) __must_hold(&ctx->uring_lock) { - gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; + gfp_t gfp = GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO; void *reqs[IO_REQ_ALLOC_BATCH]; int ret; @@ -980,10 +983,11 @@ __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx) } percpu_ref_get_many(&ctx->refs, ret); + ctx->nr_req_allocated += ret; + while (ret--) { struct io_kiocb *req = reqs[ret]; - io_preinit_req(req, ctx); io_req_add_to_cache(req, ctx); } return true; @@ -1025,7 +1029,7 @@ static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req) return nxt; } -static void ctx_flush_and_put(struct io_ring_ctx *ctx, struct io_tw_state *ts) +static void ctx_flush_and_put(struct io_ring_ctx *ctx, io_tw_token_t tw) { if (!ctx) return; @@ -1055,69 +1059,62 @@ struct llist_node *io_handle_tw_list(struct llist_node *node, io_task_work.node); if (req->ctx != ctx) { - ctx_flush_and_put(ctx, &ts); + ctx_flush_and_put(ctx, ts); ctx = req->ctx; mutex_lock(&ctx->uring_lock); percpu_ref_get(&ctx->refs); } INDIRECT_CALL_2(req->io_task_work.func, io_poll_task_func, io_req_rw_complete, - req, &ts); + req, ts); node = next; (*count)++; if (unlikely(need_resched())) { - ctx_flush_and_put(ctx, &ts); + ctx_flush_and_put(ctx, ts); ctx = NULL; cond_resched(); } } while (node && *count < max_entries); - ctx_flush_and_put(ctx, &ts); + ctx_flush_and_put(ctx, ts); return node; } -/** - * io_llist_xchg - swap all entries in a lock-less list - * @head: the head of lock-less list to delete all entries - * @new: new entry as the head of the list - * - * If list is empty, return NULL, otherwise, return the pointer to the first entry. - * The order of entries returned is from the newest to the oldest added one. 
- */ -static inline struct llist_node *io_llist_xchg(struct llist_head *head, - struct llist_node *new) -{ - return xchg(&head->first, new); -} - -static __cold void io_fallback_tw(struct io_uring_task *tctx, bool sync) +static __cold void __io_fallback_tw(struct llist_node *node, bool sync) { - struct llist_node *node = llist_del_all(&tctx->task_list); struct io_ring_ctx *last_ctx = NULL; struct io_kiocb *req; while (node) { req = container_of(node, struct io_kiocb, io_task_work.node); node = node->next; - if (sync && last_ctx != req->ctx) { + if (last_ctx != req->ctx) { if (last_ctx) { - flush_delayed_work(&last_ctx->fallback_work); + if (sync) + flush_delayed_work(&last_ctx->fallback_work); percpu_ref_put(&last_ctx->refs); } last_ctx = req->ctx; percpu_ref_get(&last_ctx->refs); } - if (llist_add(&req->io_task_work.node, - &req->ctx->fallback_llist)) - schedule_delayed_work(&req->ctx->fallback_work, 1); + if (llist_add(&req->io_task_work.node, &last_ctx->fallback_llist)) + schedule_delayed_work(&last_ctx->fallback_work, 1); } if (last_ctx) { - flush_delayed_work(&last_ctx->fallback_work); + if (sync) + flush_delayed_work(&last_ctx->fallback_work); percpu_ref_put(&last_ctx->refs); } } +static void io_fallback_tw(struct io_uring_task *tctx, bool sync) +{ + struct llist_node *node = llist_del_all(&tctx->task_list); + + __io_fallback_tw(node, sync); +} + struct llist_node *tctx_task_work_run(struct io_uring_task *tctx, unsigned int max_entries, unsigned int *count) @@ -1155,10 +1152,9 @@ void tctx_task_work(struct callback_head *cb) WARN_ON_ONCE(ret); } -static inline void io_req_local_work_add(struct io_kiocb *req, - struct io_ring_ctx *ctx, - unsigned flags) +static void io_req_local_work_add(struct io_kiocb *req, unsigned flags) { + struct io_ring_ctx *ctx = req->ctx; unsigned nr_wait, nr_tw, nr_tw_prev; struct llist_node *head; @@ -1169,7 +1165,7 @@ static inline void io_req_local_work_add(struct io_kiocb *req, * We don't know how many reuqests is there in the link and whether * they can even be queued lazily, fall back to non-lazy. 
*/ - if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) + if (req->flags & IO_REQ_LINK_FLAGS) flags &= ~IOU_F_TWQ_LAZY_WAKE; guard(rcu)(); @@ -1213,7 +1209,7 @@ static inline void io_req_local_work_add(struct io_kiocb *req, if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) atomic_or(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); if (ctx->has_evfd) - io_eventfd_signal(ctx); + io_eventfd_signal(ctx, false); } nr_wait = atomic_read(&ctx->cq_wait_nr); @@ -1228,7 +1224,7 @@ static inline void io_req_local_work_add(struct io_kiocb *req, static void io_req_normal_work_add(struct io_kiocb *req) { - struct io_uring_task *tctx = req->task->io_uring; + struct io_uring_task *tctx = req->tctx; struct io_ring_ctx *ctx = req->ctx; /* task_work already pending, we're done */ @@ -1240,14 +1236,11 @@ static void io_req_normal_work_add(struct io_kiocb *req) /* SQPOLL doesn't need the task_work added, it'll run it itself */ if (ctx->flags & IORING_SETUP_SQPOLL) { - struct io_sq_data *sqd = ctx->sq_data; - - if (sqd->thread) - __set_notify_signal(sqd->thread); + __set_notify_signal(tctx->task); return; } - if (likely(!task_work_add(req->task, &tctx->task_work, ctx->notify_method))) + if (likely(!task_work_add(tctx->task, &tctx->task_work, ctx->notify_method))) return; io_fallback_tw(tctx, false); @@ -1256,37 +1249,31 @@ static void io_req_normal_work_add(struct io_kiocb *req) void __io_req_task_work_add(struct io_kiocb *req, unsigned flags) { if (req->ctx->flags & IORING_SETUP_DEFER_TASKRUN) - io_req_local_work_add(req, req->ctx, flags); + io_req_local_work_add(req, flags); else io_req_normal_work_add(req); } -void io_req_task_work_add_remote(struct io_kiocb *req, struct io_ring_ctx *ctx, - unsigned flags) +void io_req_task_work_add_remote(struct io_kiocb *req, unsigned flags) { - if (WARN_ON_ONCE(!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))) + if (WARN_ON_ONCE(!(req->ctx->flags & IORING_SETUP_DEFER_TASKRUN))) return; - io_req_local_work_add(req, ctx, flags); + __io_req_task_work_add(req, flags); } static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx) { - struct llist_node *node; - - node = llist_del_all(&ctx->work_llist); - while (node) { - struct io_kiocb *req = container_of(node, struct io_kiocb, - io_task_work.node); + struct llist_node *node = llist_del_all(&ctx->work_llist); - node = node->next; - io_req_normal_work_add(req); - } + __io_fallback_tw(node, false); + node = llist_del_all(&ctx->retry_llist); + __io_fallback_tw(node, false); } static bool io_run_local_work_continue(struct io_ring_ctx *ctx, int events, int min_events) { - if (llist_empty(&ctx->work_llist)) + if (!io_local_work_pending(ctx)) return false; if (events < min_events) return true; @@ -1295,8 +1282,29 @@ static bool io_run_local_work_continue(struct io_ring_ctx *ctx, int events, return false; } -static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts, - int min_events) +static int __io_run_local_work_loop(struct llist_node **node, + io_tw_token_t tw, + int events) +{ + int ret = 0; + + while (*node) { + struct llist_node *next = (*node)->next; + struct io_kiocb *req = container_of(*node, struct io_kiocb, + io_task_work.node); + INDIRECT_CALL_2(req->io_task_work.func, + io_poll_task_func, io_req_rw_complete, + req, tw); + *node = next; + if (++ret >= events) + break; + } + + return ret; +} + +static int __io_run_local_work(struct io_ring_ctx *ctx, io_tw_token_t tw, + int min_events, int max_events) { struct llist_node *node; unsigned int loops = 0; @@ -1307,25 +1315,23 @@ static int 
__io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts, if (ctx->flags & IORING_SETUP_TASKRUN_FLAG) atomic_andnot(IORING_SQ_TASKRUN, &ctx->rings->sq_flags); again: + min_events -= ret; + ret = __io_run_local_work_loop(&ctx->retry_llist.first, tw, max_events); + if (ctx->retry_llist.first) + goto retry_done; + /* * llists are in reverse order, flip it back the right way before * running the pending items. */ - node = llist_reverse_order(io_llist_xchg(&ctx->work_llist, NULL)); - while (node) { - struct llist_node *next = node->next; - struct io_kiocb *req = container_of(node, struct io_kiocb, - io_task_work.node); - INDIRECT_CALL_2(req->io_task_work.func, - io_poll_task_func, io_req_rw_complete, - req, ts); - ret++; - node = next; - } + node = llist_reverse_order(llist_del_all(&ctx->work_llist)); + ret += __io_run_local_work_loop(&node, tw, max_events - ret); + ctx->retry_llist.first = node; loops++; if (io_run_local_work_continue(ctx, ret, min_events)) goto again; +retry_done: io_submit_flush_completions(ctx); if (io_run_local_work_continue(ctx, ret, min_events)) goto again; @@ -1339,33 +1345,34 @@ static inline int io_run_local_work_locked(struct io_ring_ctx *ctx, { struct io_tw_state ts = {}; - if (llist_empty(&ctx->work_llist)) + if (!io_local_work_pending(ctx)) return 0; - return __io_run_local_work(ctx, &ts, min_events); + return __io_run_local_work(ctx, ts, min_events, + max(IO_LOCAL_TW_DEFAULT_MAX, min_events)); } -static int io_run_local_work(struct io_ring_ctx *ctx, int min_events) +static int io_run_local_work(struct io_ring_ctx *ctx, int min_events, + int max_events) { struct io_tw_state ts = {}; int ret; mutex_lock(&ctx->uring_lock); - ret = __io_run_local_work(ctx, &ts, min_events); + ret = __io_run_local_work(ctx, ts, min_events, max_events); mutex_unlock(&ctx->uring_lock); return ret; } -static void io_req_task_cancel(struct io_kiocb *req, struct io_tw_state *ts) +static void io_req_task_cancel(struct io_kiocb *req, io_tw_token_t tw) { - io_tw_lock(req->ctx, ts); + io_tw_lock(req->ctx, tw); io_req_defer_failed(req, req->cqe.res); } -void io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts) +void io_req_task_submit(struct io_kiocb *req, io_tw_token_t tw) { - io_tw_lock(req->ctx, ts); - /* req->task == current here, checking PF_EXITING is safe */ - if (unlikely(req->task->flags & PF_EXITING)) + io_tw_lock(req->ctx, tw); + if (unlikely(io_should_terminate_tw())) io_req_defer_failed(req, -EFAULT); else if (req->flags & REQ_F_FORCE_ASYNC) io_queue_iowq(req); @@ -1394,6 +1401,16 @@ void io_queue_next(struct io_kiocb *req) io_req_task_queue(nxt); } +static inline void io_req_put_rsrc_nodes(struct io_kiocb *req) +{ + if (req->file_node) { + io_put_rsrc_node(req->ctx, req->file_node); + req->file_node = NULL; + } + if (req->flags & REQ_F_BUF_NODE) + io_put_rsrc_node(req->ctx, req->buf_node); +} + static void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node) __must_hold(&ctx->uring_lock) @@ -1403,6 +1420,12 @@ static void io_free_batch_list(struct io_ring_ctx *ctx, comp_list); if (unlikely(req->flags & IO_REQ_CLEAN_SLOW_FLAGS)) { + if (req->flags & REQ_F_REISSUE) { + node = req->comp_list.next; + req->flags &= ~REQ_F_REISSUE; + io_queue_iowq(req); + continue; + } if (req->flags & REQ_F_REFCOUNT) { node = req->comp_list.next; if (!req_ref_put_and_test(req)) @@ -1413,8 +1436,7 @@ static void io_free_batch_list(struct io_ring_ctx *ctx, if (apoll->double_poll) kfree(apoll->double_poll); - if (!io_alloc_cache_put(&ctx->apoll_cache, apoll)) - 
kfree(apoll); + io_cache_free(&ctx->apoll_cache, apoll); req->flags &= ~REQ_F_POLLED; } if (req->flags & IO_REQ_LINK_FLAGS) @@ -1423,8 +1445,8 @@ static void io_free_batch_list(struct io_ring_ctx *ctx, io_clean_op(req); } io_put_file(req); - io_put_rsrc_node(ctx, req->rsrc_node); - io_put_task(req->task); + io_req_put_rsrc_nodes(req); + io_put_task(req); node = req->comp_list.next; io_req_add_to_cache(req, ctx); @@ -1442,15 +1464,17 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx) struct io_kiocb *req = container_of(node, struct io_kiocb, comp_list); - if (!(req->flags & REQ_F_CQE_SKIP) && + /* + * Requests marked with REQUEUE should not post a CQE, they + * will go through the io-wq retry machinery and post one + * later. + */ + if (!(req->flags & (REQ_F_CQE_SKIP | REQ_F_REISSUE)) && unlikely(!io_fill_cqe_req(ctx, req))) { - if (ctx->lockless_cq) { - spin_lock(&ctx->completion_lock); - io_req_cqe_overflow(req); - spin_unlock(&ctx->completion_lock); - } else { - io_req_cqe_overflow(req); - } + if (ctx->lockless_cq) + io_cqe_overflow(ctx, &req->cqe, &req->big_cqe); + else + io_cqe_overflow_locked(ctx, &req->cqe, &req->big_cqe); } } __io_cq_unlock_post(ctx); @@ -1459,6 +1483,10 @@ void __io_submit_flush_completions(struct io_ring_ctx *ctx) io_free_batch_list(ctx, state->compl_reqs.first); INIT_WQ_LIST(&state->compl_reqs); } + + if (unlikely(ctx->drain_active)) + io_queue_deferred(ctx); + ctx->submit_state.cq_flush = false; } @@ -1495,13 +1523,18 @@ static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx) } } mutex_unlock(&ctx->uring_lock); + + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) + io_move_task_work_from_local(ctx); } -static int io_iopoll_check(struct io_ring_ctx *ctx, long min) +static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned int min_events) { unsigned int nr_events = 0; unsigned long check_cq; + min_events = min(min_events, ctx->cq_entries); + lockdep_assert_held(&ctx->uring_lock); if (!io_allowed_run_tw(ctx)) @@ -1543,7 +1576,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) io_task_work_pending(ctx)) { u32 tail = ctx->cached_cq_tail; - (void) io_run_local_work_locked(ctx, min); + (void) io_run_local_work_locked(ctx, min_events); if (task_work_pending(current) || wq_list_empty(&ctx->iopoll_list)) { @@ -1556,7 +1589,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) wq_list_empty(&ctx->iopoll_list)) break; } - ret = io_do_iopoll(ctx, !min); + ret = io_do_iopoll(ctx, !min_events); if (unlikely(ret < 0)) return ret; @@ -1566,12 +1599,12 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) break; nr_events += ret; - } while (nr_events < min); + } while (nr_events < min_events); return 0; } -void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts) +void io_req_task_complete(struct io_kiocb *req, io_tw_token_t tw) { io_req_complete_defer(req); } @@ -1635,6 +1668,8 @@ io_req_flags_t io_file_get_flags(struct file *file) { io_req_flags_t res = 0; + BUILD_BUG_ON(REQ_F_ISREG_BIT != REQ_F_SUPPORT_NOWAIT_BIT + 1); + if (S_ISREG(file_inode(file)->i_mode)) res |= REQ_F_ISREG; if ((file->f_flags & O_NONBLOCK) || (file->f_mode & FMODE_NOWAIT)) @@ -1642,69 +1677,28 @@ io_req_flags_t io_file_get_flags(struct file *file) return res; } -bool io_alloc_async_data(struct io_kiocb *req) -{ - const struct io_issue_def *def = &io_issue_defs[req->opcode]; - - WARN_ON_ONCE(!def->async_size); - req->async_data = kmalloc(def->async_size, GFP_KERNEL); - if (req->async_data) { - req->flags |= REQ_F_ASYNC_DATA; - 
return false; - } - return true; -} - -static u32 io_get_sequence(struct io_kiocb *req) -{ - u32 seq = req->ctx->cached_sq_head; - struct io_kiocb *cur; - - /* need original cached_sq_head, but it was increased for each req */ - io_for_each_link(cur, req) - seq--; - return seq; -} - static __cold void io_drain_req(struct io_kiocb *req) __must_hold(&ctx->uring_lock) { struct io_ring_ctx *ctx = req->ctx; + bool drain = req->flags & IOSQE_IO_DRAIN; struct io_defer_entry *de; - int ret; - u32 seq = io_get_sequence(req); - /* Still need defer if there is pending req in defer list. */ - spin_lock(&ctx->completion_lock); - if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) { - spin_unlock(&ctx->completion_lock); -queue: - ctx->drain_active = false; - io_req_task_queue(req); - return; - } - spin_unlock(&ctx->completion_lock); - - io_prep_async_link(req); - de = kmalloc(sizeof(*de), GFP_KERNEL); + de = kmalloc(sizeof(*de), GFP_KERNEL_ACCOUNT); if (!de) { - ret = -ENOMEM; - io_req_defer_failed(req, ret); + io_req_defer_failed(req, -ENOMEM); return; } - spin_lock(&ctx->completion_lock); - if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) { - spin_unlock(&ctx->completion_lock); - kfree(de); - goto queue; - } - + io_prep_async_link(req); trace_io_uring_defer(req); de->req = req; - de->seq = seq; + + ctx->nr_drained += io_linked_nr(req); list_add_tail(&de->list, &ctx->defer_list); - spin_unlock(&ctx->completion_lock); + io_queue_deferred(ctx); + if (!drain && list_empty(&ctx->defer_list)) + ctx->drain_active = false; } static bool io_assign_file(struct io_kiocb *req, const struct io_issue_def *def, @@ -1721,17 +1715,22 @@ static bool io_assign_file(struct io_kiocb *req, const struct io_issue_def *def, return !!req->file; } -static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) +#define REQ_ISSUE_SLOW_FLAGS (REQ_F_CREDS | REQ_F_ARM_LTIMEOUT) + +static inline int __io_issue_sqe(struct io_kiocb *req, + unsigned int issue_flags, + const struct io_issue_def *def) { - const struct io_issue_def *def = &io_issue_defs[req->opcode]; const struct cred *creds = NULL; + struct io_kiocb *link = NULL; int ret; - if (unlikely(!io_assign_file(req, def, issue_flags))) - return -EBADF; - - if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred())) - creds = override_creds(req->creds); + if (unlikely(req->flags & REQ_ISSUE_SLOW_FLAGS)) { + if ((req->flags & REQ_F_CREDS) && req->creds != current_cred()) + creds = override_creds(req->creds); + if (req->flags & REQ_F_ARM_LTIMEOUT) + link = __io_prep_linked_timeout(req); + } if (!def->audit_skip) audit_uring_entry(req->opcode); @@ -1741,10 +1740,27 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) if (!def->audit_skip) audit_uring_exit(!ret, ret); - if (creds) - revert_creds(creds); + if (unlikely(creds || link)) { + if (creds) + revert_creds(creds); + if (link) + io_queue_linked_timeout(link); + } + + return ret; +} + +static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) +{ + const struct io_issue_def *def = &io_issue_defs[req->opcode]; + int ret; + + if (unlikely(!io_assign_file(req, def, issue_flags))) + return -EBADF; + + ret = __io_issue_sqe(req, issue_flags, def); - if (ret == IOU_OK) { + if (ret == IOU_COMPLETE) { if (issue_flags & IO_URING_F_COMPLETE_DEFER) io_req_complete_defer(req); else @@ -1755,7 +1771,6 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) if (ret == IOU_ISSUE_SKIP_COMPLETE) { ret = 0; - io_arm_ltimeout(req); /* If the 
op doesn't have a file, we're not polling for it */ if ((req->ctx->flags & IORING_SETUP_IOPOLL) && def->iopoll_queue) @@ -1764,11 +1779,23 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) return ret; } -int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts) +int io_poll_issue(struct io_kiocb *req, io_tw_token_t tw) { - io_tw_lock(req->ctx, ts); - return io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_MULTISHOT| - IO_URING_F_COMPLETE_DEFER); + const unsigned int issue_flags = IO_URING_F_NONBLOCK | + IO_URING_F_MULTISHOT | + IO_URING_F_COMPLETE_DEFER; + int ret; + + io_tw_lock(req->ctx, tw); + + WARN_ON_ONCE(!req->file); + if (WARN_ON_ONCE(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EFAULT; + + ret = __io_issue_sqe(req, issue_flags, &io_issue_defs[req->opcode]); + + WARN_ON_ONCE(ret == IOU_ISSUE_SKIP_COMPLETE); + return ret; } struct io_wq_work *io_wq_free_work(struct io_wq_work *work) @@ -1776,7 +1803,7 @@ struct io_wq_work *io_wq_free_work(struct io_wq_work *work) struct io_kiocb *req = container_of(work, struct io_kiocb, work); struct io_kiocb *nxt = NULL; - if (req_ref_put_and_test(req)) { + if (req_ref_put_and_test_atomic(req)) { if (req->flags & IO_REQ_LINK_FLAGS) nxt = io_req_find_next(req); io_free_req(req); @@ -1792,14 +1819,12 @@ void io_wq_submit_work(struct io_wq_work *work) bool needs_poll = false; int ret = 0, err = -ECANCELED; - /* one will be dropped by ->io_wq_free_work() after returning to io-wq */ + /* one will be dropped by io_wq_free_work() after returning to io-wq */ if (!(req->flags & REQ_F_REFCOUNT)) __io_req_set_refcount(req, 2); else req_ref_get(req); - io_arm_ltimeout(req); - /* either cancelled or io-wq is dying, so don't touch tctx->iowq */ if (atomic_read(&work->flags) & IO_WQ_WORK_CANCEL) { fail: @@ -1820,7 +1845,7 @@ fail: * Don't allow any multishot execution from io-wq. It's more restrictive * than necessary and also cleaner. 
*/ - if (req->flags & REQ_F_APOLL_MULTISHOT) { + if (req->flags & (REQ_F_MULTISHOT|REQ_F_APOLL_MULTISHOT)) { err = -EBADFD; if (!io_file_can_poll(req)) goto fail; @@ -1831,7 +1856,7 @@ fail: goto fail; return; } else { - req->flags &= ~REQ_F_APOLL_MULTISHOT; + req->flags &= ~(REQ_F_APOLL_MULTISHOT|REQ_F_MULTISHOT); } } @@ -1886,20 +1911,17 @@ inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd, unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; - struct io_fixed_file *slot; + struct io_rsrc_node *node; struct file *file = NULL; io_ring_submit_lock(ctx, issue_flags); - - if (unlikely((unsigned int)fd >= ctx->nr_user_files)) - goto out; - fd = array_index_nospec(fd, ctx->nr_user_files); - slot = io_fixed_file_slot(&ctx->file_table, fd); - if (!req->rsrc_node) - __io_req_set_rsrc_node(req, ctx); - req->flags |= io_slot_flags(slot); - file = io_slot_file(slot); -out: + node = io_rsrc_node_lookup(&ctx->file_table.data, fd); + if (node) { + node->refs++; + req->file_node = node; + req->flags |= io_slot_flags(node); + file = io_slot_file(node); + } io_ring_submit_unlock(ctx, issue_flags); return file; } @@ -1919,15 +1941,11 @@ struct file *io_file_get_normal(struct io_kiocb *req, int fd) static void io_queue_async(struct io_kiocb *req, int ret) __must_hold(&req->ctx->uring_lock) { - struct io_kiocb *linked_timeout; - if (ret != -EAGAIN || (req->flags & REQ_F_NOWAIT)) { io_req_defer_failed(req, ret); return; } - linked_timeout = io_prep_linked_timeout(req); - switch (io_arm_poll_handler(req, 0)) { case IO_APOLL_READY: io_kbuf_recycle(req, 0); @@ -1940,9 +1958,6 @@ static void io_queue_async(struct io_kiocb *req, int ret) case IO_APOLL_OK: break; } - - if (linked_timeout) - io_queue_linked_timeout(linked_timeout); } static inline void io_queue_sqe(struct io_kiocb *req) @@ -2002,9 +2017,8 @@ static inline bool io_check_restriction(struct io_ring_ctx *ctx, return true; } -static void io_init_req_drain(struct io_kiocb *req) +static void io_init_drain(struct io_ring_ctx *ctx) { - struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *head = ctx->submit_state.link.head; ctx->drain_active = true; @@ -2037,21 +2051,22 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, int personality; u8 opcode; - /* req is partially pre-initialised, see io_preinit_req() */ + req->ctx = ctx; req->opcode = opcode = READ_ONCE(sqe->opcode); /* same numerical values with corresponding REQ_F_*, safe to copy */ sqe_flags = READ_ONCE(sqe->flags); req->flags = (__force io_req_flags_t) sqe_flags; req->cqe.user_data = READ_ONCE(sqe->user_data); req->file = NULL; - req->rsrc_node = NULL; - req->task = current; + req->tctx = current->io_uring; req->cancel_seq_set = false; if (unlikely(opcode >= IORING_OP_LAST)) { req->opcode = 0; return io_init_fail_req(req, -EINVAL); } + opcode = array_index_nospec(opcode, IORING_OP_LAST); + def = &io_issue_defs[opcode]; if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) { /* enforce forwards compatibility on users */ @@ -2067,7 +2082,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, if (sqe_flags & IOSQE_IO_DRAIN) { if (ctx->drain_disabled) return io_init_fail_req(req, -EOPNOTSUPP); - io_init_req_drain(req); + io_init_drain(ctx); } } if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) { @@ -2263,17 +2278,15 @@ static bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe) unsigned mask = ctx->sq_entries - 1; unsigned head = ctx->cached_sq_head++ & mask; - if (!(ctx->flags & 
IORING_SETUP_NO_SQARRAY)) { + if (static_branch_unlikely(&io_key_has_sqarray) && + (!(ctx->flags & IORING_SETUP_NO_SQARRAY))) { head = READ_ONCE(ctx->sq_array[head]); if (unlikely(head >= ctx->sq_entries)) { - /* drop invalid entries */ - spin_lock(&ctx->completion_lock); - ctx->cq_extra--; - spin_unlock(&ctx->completion_lock); WRITE_ONCE(ctx->rings->sq_dropped, READ_ONCE(ctx->rings->sq_dropped) + 1); return false; } + head = array_index_nospec(head, ctx->sq_entries); } /* @@ -2358,9 +2371,9 @@ static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode, int io_run_task_work_sig(struct io_ring_ctx *ctx) { - if (!llist_empty(&ctx->work_llist)) { + if (io_local_work_pending(ctx)) { __set_current_state(TASK_RUNNING); - if (io_run_local_work(ctx, INT_MAX) > 0) + if (io_run_local_work(ctx, INT_MAX, IO_LOCAL_TW_DEFAULT_MAX) > 0) return 0; } if (io_run_task_work() > 0) @@ -2426,7 +2439,7 @@ static enum hrtimer_restart io_cqring_min_timer_wakeup(struct hrtimer *timer) goto out_wake; } - iowq->t.function = io_cqring_timer_wakeup; + hrtimer_update_function(&iowq->t, io_cqring_timer_wakeup); hrtimer_set_expires(timer, iowq->timeout); return HRTIMER_RESTART; out_wake: @@ -2438,13 +2451,14 @@ static int io_cqring_schedule_timeout(struct io_wait_queue *iowq, { ktime_t timeout; - hrtimer_init_on_stack(&iowq->t, clock_id, HRTIMER_MODE_ABS); if (iowq->min_timeout) { timeout = ktime_add_ns(iowq->min_timeout, start_time); - iowq->t.function = io_cqring_min_timer_wakeup; + hrtimer_setup_on_stack(&iowq->t, io_cqring_min_timer_wakeup, clock_id, + HRTIMER_MODE_ABS); } else { timeout = iowq->timeout; - iowq->t.function = io_cqring_timer_wakeup; + hrtimer_setup_on_stack(&iowq->t, io_cqring_timer_wakeup, clock_id, + HRTIMER_MODE_ABS); } hrtimer_set_expires_range_ns(&iowq->t, timeout, 0); @@ -2460,8 +2474,18 @@ static int io_cqring_schedule_timeout(struct io_wait_queue *iowq, return READ_ONCE(iowq->hit_timeout) ? -ETIME : 0; } +struct ext_arg { + size_t argsz; + struct timespec64 ts; + const sigset_t __user *sig; + ktime_t min_time; + bool ts_set; + bool iowait; +}; + static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx, struct io_wait_queue *iowq, + struct ext_arg *ext_arg, ktime_t start_time) { int ret = 0; @@ -2471,7 +2495,7 @@ static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx, * can take into account that the task is waiting for IO - turns out * to be important for low QD IO. 
*/ - if (current_pending_io()) + if (ext_arg->iowait && current_pending_io()) current->in_iowait = 1; if (iowq->timeout != KTIME_MAX || iowq->min_timeout) ret = io_cqring_schedule_timeout(iowq, ctx->clockid, start_time); @@ -2484,11 +2508,12 @@ static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx, /* If this returns > 0, the caller should retry */ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, struct io_wait_queue *iowq, + struct ext_arg *ext_arg, ktime_t start_time) { if (unlikely(READ_ONCE(ctx->check_cq))) return 1; - if (unlikely(!llist_empty(&ctx->work_llist))) + if (unlikely(io_local_work_pending(ctx))) return 1; if (unlikely(task_work_pending(current))) return 1; @@ -2497,16 +2522,9 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, if (unlikely(io_should_wake(iowq))) return 0; - return __io_cqring_wait_schedule(ctx, iowq, start_time); + return __io_cqring_wait_schedule(ctx, iowq, ext_arg, start_time); } -struct ext_arg { - size_t argsz; - struct __kernel_timespec __user *ts; - const sigset_t __user *sig; - ktime_t min_time; -}; - /* * Wait until events become available, if we don't already have some. The * application must reap them itself, as they reside on the shared cq ring. @@ -2519,10 +2537,13 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags, ktime_t start_time; int ret; + min_events = min_t(int, min_events, ctx->cq_entries); + if (!io_allowed_run_tw(ctx)) return -EEXIST; - if (!llist_empty(&ctx->work_llist)) - io_run_local_work(ctx, min_events); + if (io_local_work_pending(ctx)) + io_run_local_work(ctx, min_events, + max(IO_LOCAL_TW_DEFAULT_MAX, min_events)); io_run_task_work(); if (unlikely(test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq))) @@ -2542,13 +2563,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags, iowq.timeout = KTIME_MAX; start_time = io_get_time(ctx); - if (ext_arg->ts) { - struct timespec64 ts; - - if (get_timespec64(&ts, ext_arg->ts)) - return -EFAULT; - - iowq.timeout = timespec64_to_ktime(ts); + if (ext_arg->ts_set) { + iowq.timeout = timespec64_to_ktime(ext_arg->ts); if (!(flags & IORING_ENTER_ABS_TIMER)) iowq.timeout = ktime_add(iowq.timeout, start_time); } @@ -2588,7 +2604,7 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags, TASK_INTERRUPTIBLE); } - ret = io_cqring_wait_schedule(ctx, &iowq, start_time); + ret = io_cqring_wait_schedule(ctx, &iowq, ext_arg, start_time); __set_current_state(TASK_RUNNING); atomic_set(&ctx->cq_wait_nr, IO_CQ_WAKE_INIT); @@ -2597,8 +2613,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags, * If we got woken because of task_work being processed, run it * now rather than let the caller do another wait loop. */ - if (!llist_empty(&ctx->work_llist)) - io_run_local_work(ctx, nr_wait); + if (io_local_work_pending(ctx)) + io_run_local_work(ctx, nr_wait, nr_wait); io_run_task_work(); /* @@ -2638,42 +2654,16 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags, return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? 
ret : 0; } -static void *io_rings_map(struct io_ring_ctx *ctx, unsigned long uaddr, - size_t size) -{ - return __io_uaddr_map(&ctx->ring_pages, &ctx->n_ring_pages, uaddr, - size); -} - -static void *io_sqes_map(struct io_ring_ctx *ctx, unsigned long uaddr, - size_t size) -{ - return __io_uaddr_map(&ctx->sqe_pages, &ctx->n_sqe_pages, uaddr, - size); -} - static void io_rings_free(struct io_ring_ctx *ctx) { - if (!(ctx->flags & IORING_SETUP_NO_MMAP)) { - io_pages_unmap(ctx->rings, &ctx->ring_pages, &ctx->n_ring_pages, - true); - io_pages_unmap(ctx->sq_sqes, &ctx->sqe_pages, &ctx->n_sqe_pages, - true); - } else { - io_pages_free(&ctx->ring_pages, ctx->n_ring_pages); - ctx->n_ring_pages = 0; - io_pages_free(&ctx->sqe_pages, ctx->n_sqe_pages); - ctx->n_sqe_pages = 0; - vunmap(ctx->rings); - vunmap(ctx->sq_sqes); - } - + io_free_region(ctx, &ctx->sq_region); + io_free_region(ctx, &ctx->ring_region); ctx->rings = NULL; ctx->sq_sqes = NULL; } -static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries, - unsigned int cq_entries, size_t *sq_offset) +unsigned long rings_size(unsigned int flags, unsigned int sq_entries, + unsigned int cq_entries, size_t *sq_offset) { struct io_rings *rings; size_t off, sq_array_size; @@ -2681,7 +2671,7 @@ static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries off = struct_size(rings, cqes, cq_entries); if (off == SIZE_MAX) return SIZE_MAX; - if (ctx->flags & IORING_SETUP_CQE32) { + if (flags & IORING_SETUP_CQE32) { if (check_shl_overflow(off, 1, &off)) return SIZE_MAX; } @@ -2692,7 +2682,7 @@ static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries return SIZE_MAX; #endif - if (ctx->flags & IORING_SETUP_NO_SQARRAY) { + if (flags & IORING_SETUP_NO_SQARRAY) { *sq_offset = SIZE_MAX; return off; } @@ -2709,72 +2699,68 @@ static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries return off; } -static void io_req_caches_free(struct io_ring_ctx *ctx) +static __cold void __io_req_caches_free(struct io_ring_ctx *ctx) { struct io_kiocb *req; int nr = 0; - mutex_lock(&ctx->uring_lock); - while (!io_req_cache_empty(ctx)) { req = io_extract_req(ctx); kmem_cache_free(req_cachep, req); nr++; } - if (nr) + if (nr) { + ctx->nr_req_allocated -= nr; percpu_ref_put_many(&ctx->refs, nr); - mutex_unlock(&ctx->uring_lock); + } +} + +static __cold void io_req_caches_free(struct io_ring_ctx *ctx) +{ + guard(mutex)(&ctx->uring_lock); + __io_req_caches_free(ctx); } static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) { io_sq_thread_finish(ctx); - /* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */ - if (WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list))) - return; mutex_lock(&ctx->uring_lock); - if (ctx->buf_data) - __io_sqe_buffers_unregister(ctx); - if (ctx->file_data) - __io_sqe_files_unregister(ctx); + io_sqe_buffers_unregister(ctx); + io_sqe_files_unregister(ctx); + io_unregister_zcrx_ifqs(ctx); io_cqring_overflow_kill(ctx); io_eventfd_unregister(ctx); - io_alloc_cache_free(&ctx->apoll_cache, kfree); - io_alloc_cache_free(&ctx->netmsg_cache, io_netmsg_cache_free); - io_alloc_cache_free(&ctx->rw_cache, io_rw_cache_free); - io_alloc_cache_free(&ctx->uring_cache, kfree); - io_alloc_cache_free(&ctx->msg_cache, io_msg_cache_free); - io_futex_cache_free(ctx); + io_free_alloc_caches(ctx); io_destroy_buffers(ctx); + io_free_region(ctx, &ctx->param_region); mutex_unlock(&ctx->uring_lock); if (ctx->sq_creds) put_cred(ctx->sq_creds); if (ctx->submitter_task) 
put_task_struct(ctx->submitter_task); - /* there are no registered resources left, nobody uses it */ - if (ctx->rsrc_node) - io_rsrc_node_destroy(ctx, ctx->rsrc_node); - - WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list)); WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list)); - io_alloc_cache_free(&ctx->rsrc_node_cache, kfree); if (ctx->mm_account) { mmdrop(ctx->mm_account); ctx->mm_account = NULL; } io_rings_free(ctx); + if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) + static_branch_dec(&io_key_has_sqarray); + percpu_ref_exit(&ctx->refs); free_uid(ctx->user); io_req_caches_free(ctx); + + WARN_ON_ONCE(ctx->nr_req_allocated); + if (ctx->hash_map) io_wq_put_hash(ctx->hash_map); io_napi_free(ctx); - kfree(ctx->cancel_table.hbs); - kfree(ctx->cancel_table_locked.hbs); + kvfree(ctx->cancel_table.hbs); xa_destroy(&ctx->io_bl_xa); kfree(ctx); } @@ -2825,13 +2811,12 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait) if (unlikely(!ctx->poll_activated)) io_activate_pollwq(ctx); - - poll_wait(file, &ctx->poll_wq, wait); /* - * synchronizes with barrier from wq_has_sleeper call in - * io_commit_cqring + * provides mb() which pairs with barrier from wq_has_sleeper + * call in io_commit_cqring */ - smp_rmb(); + poll_wait(file, &ctx->poll_wq, wait); + if (!io_sqring_full(ctx)) mask |= EPOLLOUT | EPOLLWRNORM; @@ -2906,11 +2891,17 @@ static __cold void io_ring_exit_work(struct work_struct *work) io_cqring_overflow_kill(ctx); mutex_unlock(&ctx->uring_lock); } + if (!xa_empty(&ctx->zcrx_ctxs)) { + mutex_lock(&ctx->uring_lock); + io_shutdown_zcrx_ifqs(ctx); + mutex_unlock(&ctx->uring_lock); + } if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) io_move_task_work_from_local(ctx); - while (io_uring_try_cancel_requests(ctx, NULL, true)) + /* The SQPOLL thread never reaches this path */ + while (io_uring_try_cancel_requests(ctx, NULL, true, false)) cond_resched(); if (ctx->sq_data) { @@ -2918,7 +2909,7 @@ static __cold void io_ring_exit_work(struct work_struct *work) struct task_struct *tsk; io_sq_thread_park(sqd); - tsk = sqd->thread; + tsk = sqpoll_task_locked(sqd); if (tsk && tsk->io_uring && tsk->io_uring->io_wq) io_wq_cancel_cb(tsk->io_uring->io_wq, io_cancel_ctx_cb, ctx, true); @@ -3013,7 +3004,7 @@ static int io_uring_release(struct inode *inode, struct file *file) } struct io_task_cancel { - struct task_struct *task; + struct io_uring_task *tctx; bool all; }; @@ -3022,30 +3013,29 @@ static bool io_cancel_task_cb(struct io_wq_work *work, void *data) struct io_kiocb *req = container_of(work, struct io_kiocb, work); struct io_task_cancel *cancel = data; - return io_match_task_safe(req, cancel->task, cancel->all); + return io_match_task_safe(req, cancel->tctx, cancel->all); } static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx, - struct task_struct *task, + struct io_uring_task *tctx, bool cancel_all) { struct io_defer_entry *de; LIST_HEAD(list); - spin_lock(&ctx->completion_lock); list_for_each_entry_reverse(de, &ctx->defer_list, list) { - if (io_match_task_safe(de->req, task, cancel_all)) { + if (io_match_task_safe(de->req, tctx, cancel_all)) { list_cut_position(&list, &ctx->defer_list, &de->list); break; } } - spin_unlock(&ctx->completion_lock); if (list_empty(&list)) return false; while (!list_empty(&list)) { de = list_first_entry(&list, struct io_defer_entry, list); list_del_init(&de->list); + ctx->nr_drained -= io_linked_nr(de->req); io_req_task_queue_fail(de->req, -ECANCELED); kfree(de); } @@ -3077,11 +3067,11 @@ static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx) } 
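A note on the cancellation rework below: matching is now keyed off the per-task io_uring context (req->tctx / struct io_uring_task) rather than struct task_struct, and io_uring_try_cancel_requests() takes an explicit is_sqpoll_thread argument instead of comparing ctx->sq_data->thread against current. A minimal sketch of the resulting call sites, mirroring the ones updated later in this diff:

	/* regular task tearing down its own requests */
	io_uring_try_cancel_requests(ctx, current->io_uring, cancel_all, false);

	/* called from the SQPOLL thread itself, which does its own iopoll reaping */
	io_uring_try_cancel_requests(ctx, current->io_uring, cancel_all, true);
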
static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, - struct task_struct *task, - bool cancel_all) + struct io_uring_task *tctx, + bool cancel_all, + bool is_sqpoll_thread) { - struct io_task_cancel cancel = { .task = task, .all = cancel_all, }; - struct io_uring_task *tctx = task ? task->io_uring : NULL; + struct io_task_cancel cancel = { .tctx = tctx, .all = cancel_all, }; enum io_wq_cancel cret; bool ret = false; @@ -3095,9 +3085,9 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, if (!ctx->rings) return false; - if (!task) { + if (!tctx) { ret |= io_uring_try_cancel_iowq(ctx); - } else if (tctx && tctx->io_wq) { + } else if (tctx->io_wq) { /* * Cancels requests of all rings, not only @ctx, but * it's fine as the task is in exit/exec. @@ -3109,7 +3099,7 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, /* SQPOLL thread does its own polling */ if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) || - (ctx->sq_data && ctx->sq_data->thread == current)) { + is_sqpoll_thread) { while (!wq_list_empty(&ctx->iopoll_list)) { io_iopoll_try_reap_events(ctx); ret = true; @@ -3119,16 +3109,16 @@ static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx, if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) && io_allowed_defer_tw_run(ctx)) - ret |= io_run_local_work(ctx, INT_MAX) > 0; - ret |= io_cancel_defer_files(ctx, task, cancel_all); + ret |= io_run_local_work(ctx, INT_MAX, INT_MAX) > 0; mutex_lock(&ctx->uring_lock); - ret |= io_poll_remove_all(ctx, task, cancel_all); - ret |= io_waitid_remove_all(ctx, task, cancel_all); - ret |= io_futex_remove_all(ctx, task, cancel_all); - ret |= io_uring_try_cancel_uring_cmd(ctx, task, cancel_all); + ret |= io_cancel_defer_files(ctx, tctx, cancel_all); + ret |= io_poll_remove_all(ctx, tctx, cancel_all); + ret |= io_waitid_remove_all(ctx, tctx, cancel_all); + ret |= io_futex_remove_all(ctx, tctx, cancel_all); + ret |= io_uring_try_cancel_uring_cmd(ctx, tctx, cancel_all); mutex_unlock(&ctx->uring_lock); - ret |= io_kill_timeouts(ctx, task, cancel_all); - if (task) + ret |= io_kill_timeouts(ctx, tctx, cancel_all); + if (tctx) ret |= io_run_task_work() > 0; else ret |= flush_delayed_work(&ctx->fallback_work); @@ -3155,7 +3145,7 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) s64 inflight; DEFINE_WAIT(wait); - WARN_ON_ONCE(sqd && sqd->thread != current); + WARN_ON_ONCE(sqd && sqpoll_task_locked(sqd) != current); if (!current->io_uring) return; @@ -3181,13 +3171,16 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) if (node->ctx->sq_data) continue; loop |= io_uring_try_cancel_requests(node->ctx, - current, cancel_all); + current->io_uring, + cancel_all, + false); } } else { list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) loop |= io_uring_try_cancel_requests(ctx, - current, - cancel_all); + current->io_uring, + cancel_all, + true); } if (loop) { @@ -3199,7 +3192,7 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) io_run_task_work(); io_uring_drop_tctx_refs(current); xa_for_each(&tctx->xa, index, node) { - if (!llist_empty(&node->ctx->work_llist)) { + if (io_local_work_pending(node->ctx)) { WARN_ON_ONCE(node->ctx->submitter_task && node->ctx->submitter_task != current); goto end_wait; @@ -3230,34 +3223,81 @@ end_wait: void __io_uring_cancel(bool cancel_all) { + io_uring_unreg_ringfd(); io_uring_cancel_generic(cancel_all, NULL); } -static int io_validate_ext_arg(unsigned flags, const void __user 
*argp, size_t argsz) +static struct io_uring_reg_wait *io_get_ext_arg_reg(struct io_ring_ctx *ctx, + const struct io_uring_getevents_arg __user *uarg) { - if (flags & IORING_ENTER_EXT_ARG) { - struct io_uring_getevents_arg arg; + unsigned long size = sizeof(struct io_uring_reg_wait); + unsigned long offset = (uintptr_t)uarg; + unsigned long end; - if (argsz != sizeof(arg)) - return -EINVAL; - if (copy_from_user(&arg, argp, sizeof(arg))) - return -EFAULT; - } + if (unlikely(offset % sizeof(long))) + return ERR_PTR(-EFAULT); + + /* also protects from NULL ->cq_wait_arg as the size would be 0 */ + if (unlikely(check_add_overflow(offset, size, &end) || + end > ctx->cq_wait_size)) + return ERR_PTR(-EFAULT); + + offset = array_index_nospec(offset, ctx->cq_wait_size - size); + return ctx->cq_wait_arg + offset; +} + +static int io_validate_ext_arg(struct io_ring_ctx *ctx, unsigned flags, + const void __user *argp, size_t argsz) +{ + struct io_uring_getevents_arg arg; + + if (!(flags & IORING_ENTER_EXT_ARG)) + return 0; + if (flags & IORING_ENTER_EXT_ARG_REG) + return -EINVAL; + if (argsz != sizeof(arg)) + return -EINVAL; + if (copy_from_user(&arg, argp, sizeof(arg))) + return -EFAULT; return 0; } -static int io_get_ext_arg(unsigned flags, const void __user *argp, - struct ext_arg *ext_arg) +static int io_get_ext_arg(struct io_ring_ctx *ctx, unsigned flags, + const void __user *argp, struct ext_arg *ext_arg) { + const struct io_uring_getevents_arg __user *uarg = argp; struct io_uring_getevents_arg arg; + ext_arg->iowait = !(flags & IORING_ENTER_NO_IOWAIT); + /* * If EXT_ARG isn't set, then we have no timespec and the argp pointer * is just a pointer to the sigset_t. */ if (!(flags & IORING_ENTER_EXT_ARG)) { ext_arg->sig = (const sigset_t __user *) argp; - ext_arg->ts = NULL; + return 0; + } + + if (flags & IORING_ENTER_EXT_ARG_REG) { + struct io_uring_reg_wait *w; + + if (ext_arg->argsz != sizeof(struct io_uring_reg_wait)) + return -EINVAL; + w = io_get_ext_arg_reg(ctx, argp); + if (IS_ERR(w)) + return PTR_ERR(w); + + if (w->flags & ~IORING_REG_WAIT_TS) + return -EINVAL; + ext_arg->min_time = READ_ONCE(w->min_wait_usec) * NSEC_PER_USEC; + ext_arg->sig = u64_to_user_ptr(READ_ONCE(w->sigmask)); + ext_arg->argsz = READ_ONCE(w->sigmask_sz); + if (w->flags & IORING_REG_WAIT_TS) { + ext_arg->ts.tv_sec = READ_ONCE(w->ts.tv_sec); + ext_arg->ts.tv_nsec = READ_ONCE(w->ts.tv_nsec); + ext_arg->ts_set = true; + } return 0; } @@ -3267,13 +3307,32 @@ static int io_get_ext_arg(unsigned flags, const void __user *argp, */ if (ext_arg->argsz != sizeof(arg)) return -EINVAL; - if (copy_from_user(&arg, argp, sizeof(arg))) +#ifdef CONFIG_64BIT + if (!user_access_begin(uarg, sizeof(*uarg))) + return -EFAULT; + unsafe_get_user(arg.sigmask, &uarg->sigmask, uaccess_end); + unsafe_get_user(arg.sigmask_sz, &uarg->sigmask_sz, uaccess_end); + unsafe_get_user(arg.min_wait_usec, &uarg->min_wait_usec, uaccess_end); + unsafe_get_user(arg.ts, &uarg->ts, uaccess_end); + user_access_end(); +#else + if (copy_from_user(&arg, uarg, sizeof(arg))) return -EFAULT; +#endif ext_arg->min_time = arg.min_wait_usec * NSEC_PER_USEC; ext_arg->sig = u64_to_user_ptr(arg.sigmask); ext_arg->argsz = arg.sigmask_sz; - ext_arg->ts = u64_to_user_ptr(arg.ts); + if (arg.ts) { + if (get_timespec64(&ext_arg->ts, u64_to_user_ptr(arg.ts))) + return -EFAULT; + ext_arg->ts_set = true; + } return 0; +#ifdef CONFIG_64BIT +uaccess_end: + user_access_end(); + return -EFAULT; +#endif } SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, @@ -3287,7 +3346,9 
@@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP | IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG | IORING_ENTER_REGISTERED_RING | - IORING_ENTER_ABS_TIMER))) + IORING_ENTER_ABS_TIMER | + IORING_ENTER_EXT_ARG_REG | + IORING_ENTER_NO_IOWAIT))) return -EINVAL; /* @@ -3370,23 +3431,17 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, */ mutex_lock(&ctx->uring_lock); iopoll_locked: - ret2 = io_validate_ext_arg(flags, argp, argsz); - if (likely(!ret2)) { - min_complete = min(min_complete, - ctx->cq_entries); + ret2 = io_validate_ext_arg(ctx, flags, argp, argsz); + if (likely(!ret2)) ret2 = io_iopoll_check(ctx, min_complete); - } mutex_unlock(&ctx->uring_lock); } else { struct ext_arg ext_arg = { .argsz = argsz }; - ret2 = io_get_ext_arg(flags, argp, &ext_arg); - if (likely(!ret2)) { - min_complete = min(min_complete, - ctx->cq_entries); + ret2 = io_get_ext_arg(ctx, flags, argp, &ext_arg); + if (likely(!ret2)) ret2 = io_cqring_wait(ctx, min_complete, flags, &ext_arg); - } } if (!ret) { @@ -3429,27 +3484,31 @@ bool io_is_uring_fops(struct file *file) static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, struct io_uring_params *p) { + struct io_uring_region_desc rd; struct io_rings *rings; size_t size, sq_array_offset; - void *ptr; + int ret; /* make sure these are sane, as we already accounted them */ ctx->sq_entries = p->sq_entries; ctx->cq_entries = p->cq_entries; - size = rings_size(ctx, p->sq_entries, p->cq_entries, &sq_array_offset); + size = rings_size(ctx->flags, p->sq_entries, p->cq_entries, + &sq_array_offset); if (size == SIZE_MAX) return -EOVERFLOW; - if (!(ctx->flags & IORING_SETUP_NO_MMAP)) - rings = io_pages_map(&ctx->ring_pages, &ctx->n_ring_pages, size); - else - rings = io_rings_map(ctx, p->cq_off.user_addr, size); - - if (IS_ERR(rings)) - return PTR_ERR(rings); + memset(&rd, 0, sizeof(rd)); + rd.size = PAGE_ALIGN(size); + if (ctx->flags & IORING_SETUP_NO_MMAP) { + rd.user_addr = p->cq_off.user_addr; + rd.flags |= IORING_MEM_REGION_TYPE_USER; + } + ret = io_create_region(ctx, &ctx->ring_region, &rd, IORING_OFF_CQ_RING); + if (ret) + return ret; + ctx->rings = rings = io_region_get_ptr(&ctx->ring_region); - ctx->rings = rings; if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) ctx->sq_array = (u32 *)((char *)rings + sq_array_offset); rings->sq_ring_mask = p->sq_entries - 1; @@ -3466,17 +3525,18 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, return -EOVERFLOW; } - if (!(ctx->flags & IORING_SETUP_NO_MMAP)) - ptr = io_pages_map(&ctx->sqe_pages, &ctx->n_sqe_pages, size); - else - ptr = io_sqes_map(ctx, p->sq_off.user_addr, size); - - if (IS_ERR(ptr)) { + memset(&rd, 0, sizeof(rd)); + rd.size = PAGE_ALIGN(size); + if (ctx->flags & IORING_SETUP_NO_MMAP) { + rd.user_addr = p->sq_off.user_addr; + rd.flags |= IORING_MEM_REGION_TYPE_USER; + } + ret = io_create_region(ctx, &ctx->sq_region, &rd, IORING_OFF_SQES); + if (ret) { io_rings_free(ctx); - return PTR_ERR(ptr); + return ret; } - - ctx->sq_sqes = ptr; + ctx->sq_sqes = io_region_get_ptr(&ctx->sq_region); return 0; } @@ -3503,14 +3563,46 @@ static struct file *io_uring_get_file(struct io_ring_ctx *ctx) O_RDWR | O_CLOEXEC, NULL); } -static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, - struct io_uring_params __user *params) +static int io_uring_sanitise_params(struct io_uring_params *p) { - struct io_ring_ctx *ctx; - struct io_uring_task *tctx; - struct file *file; - int ret; + 
unsigned flags = p->flags; + + /* There is no way to mmap rings without a real fd */ + if ((flags & IORING_SETUP_REGISTERED_FD_ONLY) && + !(flags & IORING_SETUP_NO_MMAP)) + return -EINVAL; + + if (flags & IORING_SETUP_SQPOLL) { + /* IPI related flags don't make sense with SQPOLL */ + if (flags & (IORING_SETUP_COOP_TASKRUN | + IORING_SETUP_TASKRUN_FLAG | + IORING_SETUP_DEFER_TASKRUN)) + return -EINVAL; + } + + if (flags & IORING_SETUP_TASKRUN_FLAG) { + if (!(flags & (IORING_SETUP_COOP_TASKRUN | + IORING_SETUP_DEFER_TASKRUN))) + return -EINVAL; + } + + /* HYBRID_IOPOLL only valid with IOPOLL */ + if ((flags & IORING_SETUP_HYBRID_IOPOLL) && !(flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + + /* + * For DEFER_TASKRUN we require the completion task to be the same as + * the submission task. This implies that there is only one submitter. + */ + if ((flags & IORING_SETUP_DEFER_TASKRUN) && + !(flags & IORING_SETUP_SINGLE_ISSUER)) + return -EINVAL; + + return 0; +} +int io_uring_fill_params(unsigned entries, struct io_uring_params *p) +{ if (!entries) return -EINVAL; if (entries > IORING_MAX_ENTRIES) { @@ -3519,10 +3611,6 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, entries = IORING_MAX_ENTRIES; } - if ((p->flags & IORING_SETUP_REGISTERED_FD_ONLY) - && !(p->flags & IORING_SETUP_NO_MMAP)) - return -EINVAL; - /* * Use twice as many entries for the CQ ring. It's possible for the * application to drive a higher depth than the size of the SQ ring, @@ -3552,6 +3640,46 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, p->cq_entries = 2 * p->sq_entries; } + p->sq_off.head = offsetof(struct io_rings, sq.head); + p->sq_off.tail = offsetof(struct io_rings, sq.tail); + p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask); + p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries); + p->sq_off.flags = offsetof(struct io_rings, sq_flags); + p->sq_off.dropped = offsetof(struct io_rings, sq_dropped); + p->sq_off.resv1 = 0; + if (!(p->flags & IORING_SETUP_NO_MMAP)) + p->sq_off.user_addr = 0; + + p->cq_off.head = offsetof(struct io_rings, cq.head); + p->cq_off.tail = offsetof(struct io_rings, cq.tail); + p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask); + p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries); + p->cq_off.overflow = offsetof(struct io_rings, cq_overflow); + p->cq_off.cqes = offsetof(struct io_rings, cqes); + p->cq_off.flags = offsetof(struct io_rings, cq_flags); + p->cq_off.resv1 = 0; + if (!(p->flags & IORING_SETUP_NO_MMAP)) + p->cq_off.user_addr = 0; + + return 0; +} + +static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, + struct io_uring_params __user *params) +{ + struct io_ring_ctx *ctx; + struct io_uring_task *tctx; + struct file *file; + int ret; + + ret = io_uring_sanitise_params(p); + if (ret) + return ret; + + ret = io_uring_fill_params(entries, p); + if (unlikely(ret)) + return ret; + ctx = io_ring_ctx_alloc(p); if (!ctx) return -ENOMEM; @@ -3559,6 +3687,9 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, ctx->clockid = CLOCK_MONOTONIC; ctx->clock_offset = 0; + if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) + static_branch_inc(&io_key_has_sqarray); + if ((ctx->flags & IORING_SETUP_DEFER_TASKRUN) && !(ctx->flags & IORING_SETUP_IOPOLL) && !(ctx->flags & IORING_SETUP_SQPOLL)) @@ -3592,32 +3723,10 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, * For SQPOLL, we just need a wakeup, always. 
For !SQPOLL, if * COOP_TASKRUN is set, then IPIs are never needed by the app. */ - ret = -EINVAL; - if (ctx->flags & IORING_SETUP_SQPOLL) { - /* IPI related flags don't make sense with SQPOLL */ - if (ctx->flags & (IORING_SETUP_COOP_TASKRUN | - IORING_SETUP_TASKRUN_FLAG | - IORING_SETUP_DEFER_TASKRUN)) - goto err; - ctx->notify_method = TWA_SIGNAL_NO_IPI; - } else if (ctx->flags & IORING_SETUP_COOP_TASKRUN) { + if (ctx->flags & (IORING_SETUP_SQPOLL|IORING_SETUP_COOP_TASKRUN)) ctx->notify_method = TWA_SIGNAL_NO_IPI; - } else { - if (ctx->flags & IORING_SETUP_TASKRUN_FLAG && - !(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) - goto err; + else ctx->notify_method = TWA_SIGNAL; - } - - /* - * For DEFER_TASKRUN we require the completion task to be the same as the - * submission task. This implies that there is only one submitter, so enforce - * that. - */ - if (ctx->flags & IORING_SETUP_DEFER_TASKRUN && - !(ctx->flags & IORING_SETUP_SINGLE_ISSUER)) { - goto err; - } /* * This is just grabbed for accounting purposes. When a process exits, @@ -3632,37 +3741,13 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, if (ret) goto err; - ret = io_sq_offload_create(ctx, p); - if (ret) - goto err; + if (!(p->flags & IORING_SETUP_NO_SQARRAY)) + p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings; - ret = io_rsrc_init(ctx); + ret = io_sq_offload_create(ctx, p); if (ret) goto err; - p->sq_off.head = offsetof(struct io_rings, sq.head); - p->sq_off.tail = offsetof(struct io_rings, sq.tail); - p->sq_off.ring_mask = offsetof(struct io_rings, sq_ring_mask); - p->sq_off.ring_entries = offsetof(struct io_rings, sq_ring_entries); - p->sq_off.flags = offsetof(struct io_rings, sq_flags); - p->sq_off.dropped = offsetof(struct io_rings, sq_dropped); - if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) - p->sq_off.array = (char *)ctx->sq_array - (char *)ctx->rings; - p->sq_off.resv1 = 0; - if (!(ctx->flags & IORING_SETUP_NO_MMAP)) - p->sq_off.user_addr = 0; - - p->cq_off.head = offsetof(struct io_rings, cq.head); - p->cq_off.tail = offsetof(struct io_rings, cq.tail); - p->cq_off.ring_mask = offsetof(struct io_rings, cq_ring_mask); - p->cq_off.ring_entries = offsetof(struct io_rings, cq_ring_entries); - p->cq_off.overflow = offsetof(struct io_rings, cq_overflow); - p->cq_off.cqes = offsetof(struct io_rings, cqes); - p->cq_off.flags = offsetof(struct io_rings, cq_flags); - p->cq_off.resv1 = 0; - if (!(ctx->flags & IORING_SETUP_NO_MMAP)) - p->cq_off.user_addr = 0; - p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP | IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS | IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL | @@ -3670,7 +3755,8 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS | IORING_FEAT_RSRC_TAGS | IORING_FEAT_CQE_SKIP | IORING_FEAT_LINKED_FILE | IORING_FEAT_REG_REG_RING | - IORING_FEAT_RECVSEND_BUNDLE | IORING_FEAT_MIN_TIMEOUT; + IORING_FEAT_RECVSEND_BUNDLE | IORING_FEAT_MIN_TIMEOUT | + IORING_FEAT_RW_ATTR | IORING_FEAT_NO_IOWAIT; if (copy_to_user(params, p, sizeof(*p))) { ret = -EFAULT; @@ -3738,35 +3824,42 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) IORING_SETUP_SQE128 | IORING_SETUP_CQE32 | IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN | IORING_SETUP_NO_MMAP | IORING_SETUP_REGISTERED_FD_ONLY | - IORING_SETUP_NO_SQARRAY)) + IORING_SETUP_NO_SQARRAY | IORING_SETUP_HYBRID_IOPOLL)) return -EINVAL; return io_uring_create(entries, &p, params); } 
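The new IORING_FEAT_NO_IOWAIT feature bit pairs with the IORING_ENTER_NO_IOWAIT enter flag handled earlier in this diff: when the flag is passed, the CQ wait path no longer marks the task as in_iowait. A minimal userspace sketch of the handshake, assuming raw syscalls and a uapi header recent enough to define both constants (liburing wrappers not shown):

	#include <linux/io_uring.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	/* Wait for one CQE; opt out of iowait accounting only when the kernel
	 * advertises support, since older kernels reject unknown enter flags. */
	static int wait_one_cqe(int ring_fd, const struct io_uring_params *p)
	{
		unsigned int flags = IORING_ENTER_GETEVENTS;

		if (p->features & IORING_FEAT_NO_IOWAIT)
			flags |= IORING_ENTER_NO_IOWAIT;
		return syscall(__NR_io_uring_enter, ring_fd, 0, 1, flags, NULL, 0);
	}
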
-static inline bool io_uring_allowed(void) +static inline int io_uring_allowed(void) { int disabled = READ_ONCE(sysctl_io_uring_disabled); kgid_t io_uring_group; if (disabled == 2) - return false; + return -EPERM; if (disabled == 0 || capable(CAP_SYS_ADMIN)) - return true; + goto allowed_lsm; io_uring_group = make_kgid(&init_user_ns, sysctl_io_uring_group); if (!gid_valid(io_uring_group)) - return false; + return -EPERM; - return in_group_p(io_uring_group); + if (!in_group_p(io_uring_group)) + return -EPERM; + +allowed_lsm: + return security_uring_allowed(); } SYSCALL_DEFINE2(io_uring_setup, u32, entries, struct io_uring_params __user *, params) { - if (!io_uring_allowed()) - return -EPERM; + int ret; + + ret = io_uring_allowed(); + if (ret) + return ret; return io_uring_setup(entries, params); } @@ -3776,6 +3869,8 @@ static int __init io_uring_init(void) struct kmem_cache_args kmem_args = { .useroffset = offsetof(struct io_kiocb, cmd.data), .usersize = sizeof_field(struct io_kiocb, cmd.data), + .freeptr_offset = offsetof(struct io_kiocb, work), + .use_freeptr_offset = true, }; #define __BUILD_BUG_VERIFY_OFFSET_SIZE(stype, eoffset, esize, ename) do { \ @@ -3826,9 +3921,13 @@ static int __init io_uring_init(void) BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in); BUILD_BUG_SQE_ELEM(44, __u32, file_index); BUILD_BUG_SQE_ELEM(44, __u16, addr_len); + BUILD_BUG_SQE_ELEM(44, __u8, write_stream); + BUILD_BUG_SQE_ELEM(45, __u8, __pad4[0]); BUILD_BUG_SQE_ELEM(46, __u16, __pad3[0]); BUILD_BUG_SQE_ELEM(48, __u64, addr3); BUILD_BUG_SQE_ELEM_SIZE(48, 0, cmd); + BUILD_BUG_SQE_ELEM(48, __u64, attr_ptr); + BUILD_BUG_SQE_ELEM(56, __u64, attr_type_mask); BUILD_BUG_SQE_ELEM(56, __u64, __pad2); BUILD_BUG_ON(sizeof(struct io_uring_files_update) != @@ -3855,6 +3954,9 @@ static int __init io_uring_init(void) io_uring_optable_init(); + /* imu->dir is u8 */ + BUILD_BUG_ON((IO_IMU_DEST | IO_IMU_SOURCE) > U8_MAX); + /* * Allow user copy in the per-command field, which starts after the * file in io_kiocb and until the opcode field. The openat2 handling @@ -3865,10 +3967,9 @@ static int __init io_uring_init(void) req_cachep = kmem_cache_create("io_kiocb", sizeof(struct io_kiocb), &kmem_args, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU); - io_buf_cachep = KMEM_CACHE(io_buffer, - SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); iou_wq = alloc_workqueue("iou_exit", WQ_UNBOUND, 64); + BUG_ON(!iou_wq); #ifdef CONFIG_SYSCTL register_sysctl_init("kernel", kernel_io_uring_disabled_table); diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 70b6675941ff..66c1ca73f55e 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -8,31 +8,35 @@ #include <linux/poll.h> #include <linux/io_uring_types.h> #include <uapi/linux/eventpoll.h> +#include "alloc_cache.h" #include "io-wq.h" #include "slist.h" #include "filetable.h" +#include "opdef.h" #ifndef CREATE_TRACE_POINTS #include <trace/events/io_uring.h> #endif enum { - IOU_OK = 0, + IOU_COMPLETE = 0, + IOU_ISSUE_SKIP_COMPLETE = -EIOCBQUEUED, /* + * The request has more work to do and should be retried. io_uring will + * attempt to wait on the file for eligible opcodes, but otherwise + * it'll be handed to iowq for blocking execution. It works for normal + * requests as well as for the multi shot mode. + */ + IOU_RETRY = -EAGAIN, + + /* * Requeue the task_work to restart operations on this request. The * actual value isn't important, should just be not an otherwise * valid error code, yet less than -MAX_ERRNO and valid internally. 
*/ IOU_REQUEUE = -3072, - - /* - * Intended only when both IO_URING_F_MULTISHOT is passed - * to indicate to the poll runner that multishot should be - * removed and the result is set on req->cqe.res. - */ - IOU_STOP_MULTISHOT = -ECANCELED, }; struct io_wait_queue { @@ -65,6 +69,12 @@ static inline bool io_should_wake(struct io_wait_queue *iowq) return dist >= 0 || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts; } +#define IORING_MAX_ENTRIES 32768 +#define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) + +unsigned long rings_size(unsigned int flags, unsigned int sq_entries, + unsigned int cq_entries, size_t *sq_offset); +int io_uring_fill_params(unsigned entries, struct io_uring_params *p); bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow); int io_run_task_work_sig(struct io_ring_ctx *ctx); void io_req_defer_failed(struct io_kiocb *req, s32 res); @@ -73,30 +83,27 @@ void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags) bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags); void __io_commit_cqring_flush(struct io_ring_ctx *ctx); +void io_req_track_inflight(struct io_kiocb *req); struct file *io_file_get_normal(struct io_kiocb *req, int fd); struct file *io_file_get_fixed(struct io_kiocb *req, int fd, unsigned issue_flags); void __io_req_task_work_add(struct io_kiocb *req, unsigned flags); -void io_req_task_work_add_remote(struct io_kiocb *req, struct io_ring_ctx *ctx, - unsigned flags); -bool io_alloc_async_data(struct io_kiocb *req); +void io_req_task_work_add_remote(struct io_kiocb *req, unsigned flags); void io_req_task_queue(struct io_kiocb *req); -void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts); +void io_req_task_complete(struct io_kiocb *req, io_tw_token_t tw); void io_req_task_queue_fail(struct io_kiocb *req, int ret); -void io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts); +void io_req_task_submit(struct io_kiocb *req, io_tw_token_t tw); struct llist_node *io_handle_tw_list(struct llist_node *node, unsigned int *count, unsigned int max_entries); struct llist_node *tctx_task_work_run(struct io_uring_task *tctx, unsigned int max_entries, unsigned int *count); void tctx_task_work(struct callback_head *cb); __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); -int io_uring_alloc_task_context(struct task_struct *task, - struct io_ring_ctx *ctx); int io_ring_add_registered_file(struct io_uring_task *tctx, struct file *file, int start, int end); void io_req_queue_iowq(struct io_kiocb *req); -int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts); +int io_poll_issue(struct io_kiocb *req, io_tw_token_t tw); int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr); int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin); void __io_submit_flush_completions(struct io_ring_ctx *ctx); @@ -109,7 +116,7 @@ void io_queue_next(struct io_kiocb *req); void io_task_refs_refill(struct io_uring_task *tctx); bool __io_alloc_req_refill(struct io_ring_ctx *ctx); -bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task, +bool io_match_task_safe(struct io_kiocb *head, struct io_uring_task *tctx, bool cancel_all); void io_activate_pollwq(struct io_ring_ctx *ctx); @@ -119,6 +126,9 @@ static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx) #if defined(CONFIG_PROVE_LOCKING) lockdep_assert(in_task()); + if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) + lockdep_assert_held(&ctx->uring_lock); + if (ctx->flags & IORING_SETUP_IOPOLL) { 
lockdep_assert_held(&ctx->uring_lock); } else if (!ctx->task_complete) { @@ -130,14 +140,17 @@ static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx) * Not from an SQE, as those cannot be submitted, but via * updating tagged resources. */ - if (ctx->submitter_task->flags & PF_EXITING) - lockdep_assert(current_work()); - else + if (!percpu_ref_is_dying(&ctx->refs)) lockdep_assert(current == ctx->submitter_task); } #endif } +static inline bool io_is_compat(struct io_ring_ctx *ctx) +{ + return IS_ENABLED(CONFIG_COMPAT) && unlikely(ctx->compat); +} + static inline void io_req_task_work_add(struct io_kiocb *req) { __io_req_task_work_add(req, 0); @@ -176,6 +189,15 @@ static inline bool io_get_cqe(struct io_ring_ctx *ctx, struct io_uring_cqe **ret return io_get_cqe_overflow(ctx, ret, false); } +static inline bool io_defer_get_uncommited_cqe(struct io_ring_ctx *ctx, + struct io_uring_cqe **cqe_ret) +{ + io_lockdep_assert_cq_locked(ctx); + + ctx->submit_state.cq_flush = true; + return io_get_cqe(ctx, cqe_ret); +} + static __always_inline bool io_fill_cqe_req(struct io_ring_ctx *ctx, struct io_kiocb *req) { @@ -189,16 +211,15 @@ static __always_inline bool io_fill_cqe_req(struct io_ring_ctx *ctx, if (unlikely(!io_get_cqe(ctx, &cqe))) return false; - if (trace_io_uring_complete_enabled()) - trace_io_uring_complete(req->ctx, req, req->cqe.user_data, - req->cqe.res, req->cqe.flags, - req->big_cqe.extra1, req->big_cqe.extra2); memcpy(cqe, &req->cqe, sizeof(*cqe)); if (ctx->flags & IORING_SETUP_CQE32) { memcpy(cqe->big_cqe, &req->big_cqe, sizeof(*cqe)); memset(&req->big_cqe, 0, sizeof(req->big_cqe)); } + + if (trace_io_uring_complete_enabled()) + trace_io_uring_complete(req->ctx, req, cqe); return true; } @@ -217,6 +238,22 @@ static inline void io_req_set_res(struct io_kiocb *req, s32 res, u32 cflags) req->cqe.flags = cflags; } +static inline void *io_uring_alloc_async_data(struct io_alloc_cache *cache, + struct io_kiocb *req) +{ + if (cache) { + req->async_data = io_cache_alloc(cache, GFP_KERNEL); + } else { + const struct io_issue_def *def = &io_issue_defs[req->opcode]; + + WARN_ON_ONCE(!def->async_size); + req->async_data = kmalloc(def->async_size, GFP_KERNEL); + } + if (req->async_data) + req->flags |= REQ_F_ASYNC_DATA; + return req->async_data; +} + static inline bool req_has_async_data(struct io_kiocb *req) { return req->flags & REQ_F_ASYNC_DATA; @@ -342,12 +379,17 @@ static inline int io_run_task_work(void) return ret; } +static inline bool io_local_work_pending(struct io_ring_ctx *ctx) +{ + return !llist_empty(&ctx->work_llist) || !llist_empty(&ctx->retry_llist); +} + static inline bool io_task_work_pending(struct io_ring_ctx *ctx) { - return task_work_pending(current) || !llist_empty(&ctx->work_llist); + return task_work_pending(current) || io_local_work_pending(ctx); } -static inline void io_tw_lock(struct io_ring_ctx *ctx, struct io_tw_state *ts) +static inline void io_tw_lock(struct io_ring_ctx *ctx, io_tw_token_t tw) { lockdep_assert_held(&ctx->uring_lock); } @@ -369,7 +411,7 @@ static inline void io_req_complete_defer(struct io_kiocb *req) static inline void io_commit_cqring_flush(struct io_ring_ctx *ctx) { - if (unlikely(ctx->off_timeout_used || ctx->drain_active || + if (unlikely(ctx->off_timeout_used || ctx->has_evfd || ctx->poll_activated)) __io_commit_cqring_flush(ctx); } @@ -389,7 +431,6 @@ static inline bool io_req_cache_empty(struct io_ring_ctx *ctx) } extern struct kmem_cache *req_cachep; -extern struct kmem_cache *io_buf_cachep; static inline struct io_kiocb 
*io_extract_req(struct io_ring_ctx *ctx) { @@ -421,6 +462,19 @@ static inline bool io_allowed_run_tw(struct io_ring_ctx *ctx) ctx->submitter_task == current); } +/* + * Terminate the request if either of these conditions are true: + * + * 1) It's being executed by the original task, but that task is marked + * with PF_EXITING as it's exiting. + * 2) PF_KTHREAD is set, in which case the invoker of the task_work is + * our fallback task_work. + */ +static inline bool io_should_terminate_tw(void) +{ + return current->flags & (PF_KTHREAD | PF_EXITING); +} + static inline void io_req_queue_tw_complete(struct io_kiocb *req, s32 res) { io_req_set_res(req, res, 0); @@ -466,6 +520,6 @@ enum { static inline bool io_has_work(struct io_ring_ctx *ctx) { return test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq) || - !llist_empty(&ctx->work_llist); + io_local_work_pending(ctx); } #endif diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index d407576ddfb7..f2d2cc319faa 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -20,7 +20,8 @@ /* BIDs are addressed by a 16-bit field in a CQE */ #define MAX_BIDS_PER_BGID (1 << 16) -struct kmem_cache *io_buf_cachep; +/* Mapped buffer ring, return io_uring_buf from head */ +#define io_ring_head_to_buf(br, head, mask) &(br)->bufs[(head) & (mask)] struct io_provide_buf { struct file *file; @@ -31,6 +32,41 @@ struct io_provide_buf { __u16 bid; }; +static bool io_kbuf_inc_commit(struct io_buffer_list *bl, int len) +{ + while (len) { + struct io_uring_buf *buf; + u32 this_len; + + buf = io_ring_head_to_buf(bl->buf_ring, bl->head, bl->mask); + this_len = min_t(int, len, buf->len); + buf->len -= this_len; + if (buf->len) { + buf->addr += this_len; + return false; + } + bl->head++; + len -= this_len; + } + return true; +} + +bool io_kbuf_commit(struct io_kiocb *req, + struct io_buffer_list *bl, int len, int nr) +{ + if (unlikely(!(req->flags & REQ_F_BUFFERS_COMMIT))) + return true; + + req->flags &= ~REQ_F_BUFFERS_COMMIT; + + if (unlikely(len < 0)) + return true; + if (bl->flags & IOBL_INC) + return io_kbuf_inc_commit(bl, len); + bl->head += nr; + return true; +} + static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx, unsigned int bgid) { @@ -45,13 +81,22 @@ static int io_buffer_add_list(struct io_ring_ctx *ctx, /* * Store buffer group ID and finally mark the list as visible. * The normal lookup doesn't care about the visibility as we're - * always under the ->uring_lock, but the RCU lookup from mmap does. + * always under the ->uring_lock, but lookups from mmap do. */ bl->bgid = bgid; - atomic_set(&bl->refs, 1); + guard(mutex)(&ctx->mmap_lock); return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL)); } +void io_kbuf_drop_legacy(struct io_kiocb *req) +{ + if (WARN_ON_ONCE(!(req->flags & REQ_F_BUFFER_SELECTED))) + return; + req->flags &= ~REQ_F_BUFFER_SELECTED; + kfree(req->kbuf); + req->kbuf = NULL; +} + bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags) { struct io_ring_ctx *ctx = req->ctx; @@ -63,40 +108,13 @@ bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags) buf = req->kbuf; bl = io_buffer_get_list(ctx, buf->bgid); list_add(&buf->list, &bl->buf_list); + bl->nbufs++; req->flags &= ~REQ_F_BUFFER_SELECTED; - req->buf_index = buf->bgid; io_ring_submit_unlock(ctx, issue_flags); return true; } -void __io_put_kbuf(struct io_kiocb *req, int len, unsigned issue_flags) -{ - /* - * We can add this buffer back to two lists: - * - * 1) The io_buffers_cache list. This one is protected by the - * ctx->uring_lock. 
If we already hold this lock, add back to this - * list as we can grab it from issue as well. - * 2) The io_buffers_comp list. This one is protected by the - * ctx->completion_lock. - * - * We migrate buffers from the comp_list to the issue cache list - * when we need one. - */ - if (issue_flags & IO_URING_F_UNLOCKED) { - struct io_ring_ctx *ctx = req->ctx; - - spin_lock(&ctx->completion_lock); - __io_put_kbuf_list(req, len, &ctx->io_buffers_comp); - spin_unlock(&ctx->completion_lock); - } else { - lockdep_assert_held(&req->ctx->uring_lock); - - __io_put_kbuf_list(req, len, &req->ctx->io_buffers_cache); - } -} - static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len, struct io_buffer_list *bl) { @@ -105,6 +123,7 @@ static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len, kbuf = list_first_entry(&bl->buf_list, struct io_buffer, list); list_del(&kbuf->list); + bl->nbufs--; if (*len == 0 || *len > kbuf->len) *len = kbuf->len; if (list_empty(&bl->buf_list)) @@ -139,6 +158,7 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, struct io_uring_buf_ring *br = bl->buf_ring; __u16 tail, head = bl->head; struct io_uring_buf *buf; + void __user *ret; tail = smp_load_acquire(&br->tail); if (unlikely(tail == head)) @@ -153,6 +173,7 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, req->flags |= REQ_F_BUFFER_RING | REQ_F_BUFFERS_COMMIT; req->buf_list = bl; req->buf_index = buf->bid; + ret = u64_to_user_ptr(buf->addr); if (issue_flags & IO_URING_F_UNLOCKED || !io_file_can_poll(req)) { /* @@ -168,11 +189,11 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, io_kbuf_commit(req, bl, *len, 1); req->buf_list = NULL; } - return u64_to_user_ptr(buf->addr); + return ret; } void __user *io_buffer_select(struct io_kiocb *req, size_t *len, - unsigned int issue_flags) + unsigned buf_group, unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; struct io_buffer_list *bl; @@ -180,7 +201,7 @@ void __user *io_buffer_select(struct io_kiocb *req, size_t *len, io_ring_submit_lock(req->ctx, issue_flags); - bl = io_buffer_get_list(ctx, req->buf_index); + bl = io_buffer_get_list(ctx, buf_group); if (likely(bl)) { if (bl->flags & IOBL_BUF_RING) ret = io_ring_buffer_select(req, len, bl, issue_flags); @@ -212,25 +233,14 @@ static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg, buf = io_ring_head_to_buf(br, head, bl->mask); if (arg->max_len) { u32 len = READ_ONCE(buf->len); + size_t needed; if (unlikely(!len)) return -ENOBUFS; - /* - * Limit incremental buffers to 1 segment. No point trying - * to peek ahead and map more than we need, when the buffers - * themselves should be large when setup with - * IOU_PBUF_RING_INC. 
- */ - if (bl->flags & IOBL_INC) { - nr_avail = 1; - } else { - size_t needed; - - needed = (arg->max_len + len - 1) / len; - needed = min_not_zero(needed, (size_t) PEEK_MAX_IMPORT); - if (nr_avail > needed) - nr_avail = needed; - } + needed = (arg->max_len + len - 1) / len; + needed = min_not_zero(needed, (size_t) PEEK_MAX_IMPORT); + if (nr_avail > needed) + nr_avail = needed; } /* @@ -260,8 +270,12 @@ static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg, /* truncate end piece, if needed, for non partial buffers */ if (len > arg->max_len) { len = arg->max_len; - if (!(bl->flags & IOBL_INC)) + if (!(bl->flags & IOBL_INC)) { + arg->partial_map = 1; + if (iov != arg->iovs) + break; buf->len = len; + } } iov->iov_base = u64_to_user_ptr(buf->addr); @@ -292,7 +306,7 @@ int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg, int ret = -ENOENT; io_ring_submit_lock(ctx, issue_flags); - bl = io_buffer_get_list(ctx, req->buf_index); + bl = io_buffer_get_list(ctx, arg->buf_group); if (unlikely(!bl)) goto out_unlock; @@ -325,7 +339,7 @@ int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg) lockdep_assert_held(&ctx->uring_lock); - bl = io_buffer_get_list(ctx, req->buf_index); + bl = io_buffer_get_list(ctx, arg->buf_group); if (unlikely(!bl)) return -ENOENT; @@ -340,82 +354,88 @@ int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg) return io_provided_buffers_select(req, &arg->max_len, bl, arg->iovs); } -static int __io_remove_buffers(struct io_ring_ctx *ctx, - struct io_buffer_list *bl, unsigned nbufs) +static inline bool __io_put_kbuf_ring(struct io_kiocb *req, int len, int nr) { - unsigned i = 0; + struct io_buffer_list *bl = req->buf_list; + bool ret = true; - /* shouldn't happen */ - if (!nbufs) - return 0; + if (bl) + ret = io_kbuf_commit(req, bl, len, nr); - if (bl->flags & IOBL_BUF_RING) { - i = bl->buf_ring->tail - bl->head; - if (bl->buf_nr_pages) { - int j; + req->flags &= ~REQ_F_BUFFER_RING; + return ret; +} - if (!(bl->flags & IOBL_MMAP)) { - for (j = 0; j < bl->buf_nr_pages; j++) - unpin_user_page(bl->buf_pages[j]); - } - io_pages_unmap(bl->buf_ring, &bl->buf_pages, - &bl->buf_nr_pages, bl->flags & IOBL_MMAP); - bl->flags &= ~IOBL_MMAP; - } - /* make sure it's seen as empty */ - INIT_LIST_HEAD(&bl->buf_list); - bl->flags &= ~IOBL_BUF_RING; - return i; +unsigned int __io_put_kbufs(struct io_kiocb *req, int len, int nbufs) +{ + unsigned int ret; + + ret = IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT); + + if (unlikely(!(req->flags & REQ_F_BUFFER_RING))) { + io_kbuf_drop_legacy(req); + return ret; } + if (!__io_put_kbuf_ring(req, len, nbufs)) + ret |= IORING_CQE_F_BUF_MORE; + return ret; +} + +static int io_remove_buffers_legacy(struct io_ring_ctx *ctx, + struct io_buffer_list *bl, + unsigned long nbufs) +{ + unsigned long i = 0; + struct io_buffer *nxt; + /* protects io_buffers_cache */ lockdep_assert_held(&ctx->uring_lock); + WARN_ON_ONCE(bl->flags & IOBL_BUF_RING); - while (!list_empty(&bl->buf_list)) { - struct io_buffer *nxt; - + for (i = 0; i < nbufs && !list_empty(&bl->buf_list); i++) { nxt = list_first_entry(&bl->buf_list, struct io_buffer, list); - list_move(&nxt->list, &ctx->io_buffers_cache); - if (++i == nbufs) - return i; + list_del(&nxt->list); + bl->nbufs--; + kfree(nxt); cond_resched(); } - return i; } -void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl) +static void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl) { - if (atomic_dec_and_test(&bl->refs)) { - 
__io_remove_buffers(ctx, bl, -1U); - kfree_rcu(bl, rcu); - } + if (bl->flags & IOBL_BUF_RING) + io_free_region(ctx, &bl->region); + else + io_remove_buffers_legacy(ctx, bl, -1U); + + kfree(bl); } void io_destroy_buffers(struct io_ring_ctx *ctx) { struct io_buffer_list *bl; - struct list_head *item, *tmp; - struct io_buffer *buf; - unsigned long index; - xa_for_each(&ctx->io_bl_xa, index, bl) { - xa_erase(&ctx->io_bl_xa, bl->bgid); + while (1) { + unsigned long index = 0; + + scoped_guard(mutex, &ctx->mmap_lock) { + bl = xa_find(&ctx->io_bl_xa, &index, ULONG_MAX, XA_PRESENT); + if (bl) + xa_erase(&ctx->io_bl_xa, bl->bgid); + } + if (!bl) + break; io_put_bl(ctx, bl); } +} - /* - * Move deferred locked entries to cache before pruning - */ - spin_lock(&ctx->completion_lock); - if (!list_empty(&ctx->io_buffers_comp)) - list_splice_init(&ctx->io_buffers_comp, &ctx->io_buffers_cache); - spin_unlock(&ctx->completion_lock); - - list_for_each_safe(item, tmp, &ctx->io_buffers_cache) { - buf = list_entry(item, struct io_buffer, list); - kmem_cache_free(io_buf_cachep, buf); - } +static void io_destroy_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl) +{ + scoped_guard(mutex, &ctx->mmap_lock) + WARN_ON_ONCE(xa_erase(&ctx->io_bl_xa, bl->bgid) != bl); + io_put_bl(ctx, bl); } int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -437,30 +457,6 @@ int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } -int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) -{ - struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf); - struct io_ring_ctx *ctx = req->ctx; - struct io_buffer_list *bl; - int ret = 0; - - io_ring_submit_lock(ctx, issue_flags); - - ret = -ENOENT; - bl = io_buffer_get_list(ctx, p->bgid); - if (bl) { - ret = -EINVAL; - /* can't use provide/remove buffers command on mapped buffers */ - if (!(bl->flags & IOBL_BUF_RING)) - ret = __io_remove_buffers(ctx, bl, p->nbufs); - } - io_ring_submit_unlock(ctx, issue_flags); - if (ret < 0) - req_set_fail(req); - io_req_set_res(req, ret, 0); - return IOU_OK; -} - int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { unsigned long size, tmp_check; @@ -476,14 +472,14 @@ int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe p->nbufs = tmp; p->addr = READ_ONCE(sqe->addr); p->len = READ_ONCE(sqe->len); + if (!p->len) + return -EINVAL; if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs, &size)) return -EOVERFLOW; if (check_add_overflow((unsigned long)p->addr, size, &tmp_check)) return -EOVERFLOW; - - size = (unsigned long)p->len * p->nbufs; if (!access_ok(u64_to_user_ptr(p->addr), size)) return -EFAULT; @@ -497,67 +493,29 @@ int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe return 0; } -#define IO_BUFFER_ALLOC_BATCH 64 - -static int io_refill_buffer_cache(struct io_ring_ctx *ctx) -{ - struct io_buffer *bufs[IO_BUFFER_ALLOC_BATCH]; - int allocated; - - /* - * Completions that don't happen inline (eg not under uring_lock) will - * add to ->io_buffers_comp. If we don't have any free buffers, check - * the completion list and splice those entries first. 
- */ - if (!list_empty_careful(&ctx->io_buffers_comp)) { - spin_lock(&ctx->completion_lock); - if (!list_empty(&ctx->io_buffers_comp)) { - list_splice_init(&ctx->io_buffers_comp, - &ctx->io_buffers_cache); - spin_unlock(&ctx->completion_lock); - return 0; - } - spin_unlock(&ctx->completion_lock); - } - - /* - * No free buffers and no completion entries either. Allocate a new - * batch of buffer entries and add those to our freelist. - */ - - allocated = kmem_cache_alloc_bulk(io_buf_cachep, GFP_KERNEL_ACCOUNT, - ARRAY_SIZE(bufs), (void **) bufs); - if (unlikely(!allocated)) { - /* - * Bulk alloc is all-or-nothing. If we fail to get a batch, - * retry single alloc to be on the safe side. - */ - bufs[0] = kmem_cache_alloc(io_buf_cachep, GFP_KERNEL); - if (!bufs[0]) - return -ENOMEM; - allocated = 1; - } - - while (allocated) - list_add_tail(&bufs[--allocated]->list, &ctx->io_buffers_cache); - - return 0; -} - static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf, struct io_buffer_list *bl) { struct io_buffer *buf; u64 addr = pbuf->addr; - int i, bid = pbuf->bid; + int ret = -ENOMEM, i, bid = pbuf->bid; for (i = 0; i < pbuf->nbufs; i++) { - if (list_empty(&ctx->io_buffers_cache) && - io_refill_buffer_cache(ctx)) + /* + * Nonsensical to have more than sizeof(bid) buffers in a + * buffer list, as the application then has no way of knowing + * which duplicate bid refers to what buffer. + */ + if (bl->nbufs == USHRT_MAX) { + ret = -EOVERFLOW; break; - buf = list_first_entry(&ctx->io_buffers_cache, struct io_buffer, - list); - list_move_tail(&buf->list, &bl->buf_list); + } + buf = kmalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT); + if (!buf) + break; + + list_add_tail(&buf->list, &bl->buf_list); + bl->nbufs++; buf->addr = addr; buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT); buf->bid = bid; @@ -567,145 +525,75 @@ static int io_add_buffers(struct io_ring_ctx *ctx, struct io_provide_buf *pbuf, cond_resched(); } - return i ? 0 : -ENOMEM; + return i ? 0 : ret; } -int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) +static int __io_manage_buffers_legacy(struct io_kiocb *req, + struct io_buffer_list *bl) { struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf); - struct io_ring_ctx *ctx = req->ctx; - struct io_buffer_list *bl; - int ret = 0; - - io_ring_submit_lock(ctx, issue_flags); + int ret; - bl = io_buffer_get_list(ctx, p->bgid); - if (unlikely(!bl)) { + if (!bl) { + if (req->opcode != IORING_OP_PROVIDE_BUFFERS) + return -ENOENT; bl = kzalloc(sizeof(*bl), GFP_KERNEL_ACCOUNT); - if (!bl) { - ret = -ENOMEM; - goto err; - } + if (!bl) + return -ENOMEM; + INIT_LIST_HEAD(&bl->buf_list); - ret = io_buffer_add_list(ctx, bl, p->bgid); + ret = io_buffer_add_list(req->ctx, bl, p->bgid); if (ret) { - /* - * Doesn't need rcu free as it was never visible, but - * let's keep it consistent throughout. 
- */ - kfree_rcu(bl, rcu); - goto err; + kfree(bl); + return ret; } } - /* can't add buffers via this command for a mapped buffer ring */ - if (bl->flags & IOBL_BUF_RING) { - ret = -EINVAL; - goto err; - } + /* can't use provide/remove buffers command on mapped buffers */ + if (bl->flags & IOBL_BUF_RING) + return -EINVAL; + if (req->opcode == IORING_OP_PROVIDE_BUFFERS) + return io_add_buffers(req->ctx, p, bl); + return io_remove_buffers_legacy(req->ctx, bl, p->nbufs); +} + +int io_manage_buffers_legacy(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf); + struct io_ring_ctx *ctx = req->ctx; + struct io_buffer_list *bl; + int ret; - ret = io_add_buffers(ctx, p, bl); -err: + io_ring_submit_lock(ctx, issue_flags); + bl = io_buffer_get_list(ctx, p->bgid); + ret = __io_manage_buffers_legacy(req, bl); io_ring_submit_unlock(ctx, issue_flags); if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; -} - -static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg, - struct io_buffer_list *bl) -{ - struct io_uring_buf_ring *br = NULL; - struct page **pages; - int nr_pages, ret; - - pages = io_pin_pages(reg->ring_addr, - flex_array_size(br, bufs, reg->ring_entries), - &nr_pages); - if (IS_ERR(pages)) - return PTR_ERR(pages); - - br = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); - if (!br) { - ret = -ENOMEM; - goto error_unpin; - } - -#ifdef SHM_COLOUR - /* - * On platforms that have specific aliasing requirements, SHM_COLOUR - * is set and we must guarantee that the kernel and user side align - * nicely. We cannot do that if IOU_PBUF_RING_MMAP isn't set and - * the application mmap's the provided ring buffer. Fail the request - * if we, by chance, don't end up with aligned addresses. The app - * should use IOU_PBUF_RING_MMAP instead, and liburing will handle - * this transparently. 
- */ - if ((reg->ring_addr | (unsigned long) br) & (SHM_COLOUR - 1)) { - ret = -EINVAL; - goto error_unpin; - } -#endif - bl->buf_pages = pages; - bl->buf_nr_pages = nr_pages; - bl->buf_ring = br; - bl->flags |= IOBL_BUF_RING; - bl->flags &= ~IOBL_MMAP; - return 0; -error_unpin: - unpin_user_pages(pages, nr_pages); - kvfree(pages); - vunmap(br); - return ret; -} - -static int io_alloc_pbuf_ring(struct io_ring_ctx *ctx, - struct io_uring_buf_reg *reg, - struct io_buffer_list *bl) -{ - size_t ring_size; - - ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring); - - bl->buf_ring = io_pages_map(&bl->buf_pages, &bl->buf_nr_pages, ring_size); - if (IS_ERR(bl->buf_ring)) { - bl->buf_ring = NULL; - return -ENOMEM; - } - - bl->flags |= (IOBL_BUF_RING | IOBL_MMAP); - return 0; + return IOU_COMPLETE; } int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) { struct io_uring_buf_reg reg; - struct io_buffer_list *bl, *free_bl = NULL; + struct io_buffer_list *bl; + struct io_uring_region_desc rd; + struct io_uring_buf_ring *br; + unsigned long mmap_offset; + unsigned long ring_size; int ret; lockdep_assert_held(&ctx->uring_lock); if (copy_from_user(®, arg, sizeof(reg))) return -EFAULT; - - if (reg.resv[0] || reg.resv[1] || reg.resv[2]) + if (!mem_is_zero(reg.resv, sizeof(reg.resv))) return -EINVAL; if (reg.flags & ~(IOU_PBUF_RING_MMAP | IOU_PBUF_RING_INC)) return -EINVAL; - if (!(reg.flags & IOU_PBUF_RING_MMAP)) { - if (!reg.ring_addr) - return -EFAULT; - if (reg.ring_addr & ~PAGE_MASK) - return -EINVAL; - } else { - if (reg.ring_addr) - return -EINVAL; - } - if (!is_power_of_2(reg.ring_entries)) return -EINVAL; - /* cannot disambiguate full vs empty due to head/tail size */ if (reg.ring_entries >= 65536) return -EINVAL; @@ -715,28 +603,55 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) /* if mapped buffer ring OR classic exists, don't allow */ if (bl->flags & IOBL_BUF_RING || !list_empty(&bl->buf_list)) return -EEXIST; - } else { - free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL); - if (!bl) - return -ENOMEM; + io_destroy_bl(ctx, bl); } - if (!(reg.flags & IOU_PBUF_RING_MMAP)) - ret = io_pin_pbuf_ring(®, bl); - else - ret = io_alloc_pbuf_ring(ctx, ®, bl); + bl = kzalloc(sizeof(*bl), GFP_KERNEL_ACCOUNT); + if (!bl) + return -ENOMEM; - if (!ret) { - bl->nr_entries = reg.ring_entries; - bl->mask = reg.ring_entries - 1; - if (reg.flags & IOU_PBUF_RING_INC) - bl->flags |= IOBL_INC; + mmap_offset = (unsigned long)reg.bgid << IORING_OFF_PBUF_SHIFT; + ring_size = flex_array_size(br, bufs, reg.ring_entries); - io_buffer_add_list(ctx, bl, reg.bgid); - return 0; + memset(&rd, 0, sizeof(rd)); + rd.size = PAGE_ALIGN(ring_size); + if (!(reg.flags & IOU_PBUF_RING_MMAP)) { + rd.user_addr = reg.ring_addr; + rd.flags |= IORING_MEM_REGION_TYPE_USER; } + ret = io_create_region_mmap_safe(ctx, &bl->region, &rd, mmap_offset); + if (ret) + goto fail; + br = io_region_get_ptr(&bl->region); - kfree_rcu(free_bl, rcu); +#ifdef SHM_COLOUR + /* + * On platforms that have specific aliasing requirements, SHM_COLOUR + * is set and we must guarantee that the kernel and user side align + * nicely. We cannot do that if IOU_PBUF_RING_MMAP isn't set and + * the application mmap's the provided ring buffer. Fail the request + * if we, by chance, don't end up with aligned addresses. The app + * should use IOU_PBUF_RING_MMAP instead, and liburing will handle + * this transparently. 
+ */ + if (!(reg.flags & IOU_PBUF_RING_MMAP) && + ((reg.ring_addr | (unsigned long)br) & (SHM_COLOUR - 1))) { + ret = -EINVAL; + goto fail; + } +#endif + + bl->nr_entries = reg.ring_entries; + bl->mask = reg.ring_entries - 1; + bl->flags |= IOBL_BUF_RING; + bl->buf_ring = br; + if (reg.flags & IOU_PBUF_RING_INC) + bl->flags |= IOBL_INC; + io_buffer_add_list(ctx, bl, reg.bgid); + return 0; +fail: + io_free_region(ctx, &bl->region); + kfree(bl); return ret; } @@ -749,9 +664,7 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) if (copy_from_user(®, arg, sizeof(reg))) return -EFAULT; - if (reg.resv[0] || reg.resv[1] || reg.resv[2]) - return -EINVAL; - if (reg.flags) + if (!mem_is_zero(reg.resv, sizeof(reg.resv)) || reg.flags) return -EINVAL; bl = io_buffer_get_list(ctx, reg.bgid); @@ -760,7 +673,9 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) if (!(bl->flags & IOBL_BUF_RING)) return -EINVAL; - xa_erase(&ctx->io_bl_xa, bl->bgid); + scoped_guard(mutex, &ctx->mmap_lock) + xa_erase(&ctx->io_bl_xa, bl->bgid); + io_put_bl(ctx, bl); return 0; } @@ -769,14 +684,11 @@ int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg) { struct io_uring_buf_status buf_status; struct io_buffer_list *bl; - int i; if (copy_from_user(&buf_status, arg, sizeof(buf_status))) return -EFAULT; - - for (i = 0; i < ARRAY_SIZE(buf_status.resv); i++) - if (buf_status.resv[i]) - return -EINVAL; + if (!mem_is_zero(buf_status.resv, sizeof(buf_status.resv))) + return -EINVAL; bl = io_buffer_get_list(ctx, buf_status.buf_group); if (!bl) @@ -791,50 +703,15 @@ int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg) return 0; } -struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx, - unsigned long bgid) -{ - struct io_buffer_list *bl; - bool ret; - - /* - * We have to be a bit careful here - we're inside mmap and cannot grab - * the uring_lock. This means the buffer_list could be simultaneously - * going away, if someone is trying to be sneaky. Look it up under rcu - * so we know it's not going away, and attempt to grab a reference to - * it. If the ref is already zero, then fail the mapping. If successful, - * the caller will call io_put_bl() to drop the the reference at at the - * end. This may then safely free the buffer_list (and drop the pages) - * at that point, vm_insert_pages() would've already grabbed the - * necessary vma references. 
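
The reworked io_register_pbuf_ring() above validates the struct io_uring_buf_reg (reserved fields zero, power-of-two ring_entries below 65536, only IOU_PBUF_RING_MMAP and IOU_PBUF_RING_INC allowed in flags) and then backs the ring with an io_uring_region_desc, either pinning the application's memory or allocating kernel pages that userspace later mmap()s at the bgid-derived offset. As a rough application-side illustration of the user-memory path, not part of this patch: the sketch uses liburing's io_uring_register_buf_ring() and io_uring_buf_ring_init() helpers, and the function name, 64-entry count and buffer group 0 are arbitrary choices.

#include <liburing.h>
#include <stdlib.h>

/* Sketch: register a 64-entry provided buffer ring for buffer group 0.
 * Assumes "ring" was already initialised with io_uring_queue_init(). */
static struct io_uring_buf_ring *setup_pbuf_ring(struct io_uring *ring)
{
	struct io_uring_buf_reg reg = { };
	struct io_uring_buf_ring *br;
	size_t ring_sz = 64 * sizeof(struct io_uring_buf);

	if (posix_memalign((void **)&br, 4096, ring_sz))
		return NULL;

	reg.ring_addr = (unsigned long)br;	/* page-aligned user memory */
	reg.ring_entries = 64;			/* power of two, below 65536 */
	reg.bgid = 0;				/* buffer group id */

	if (io_uring_register_buf_ring(ring, &reg, 0)) {
		free(br);
		return NULL;
	}
	io_uring_buf_ring_init(br);
	return br;
}

With IOU_PBUF_RING_MMAP the application instead leaves ring_addr at zero and mmap()s the kernel allocation at the IORING_OFF_PBUF_RING offset combined with the bgid shifted by IORING_OFF_PBUF_SHIFT, which is the mmap_offset computed above.
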
- */ - rcu_read_lock(); - bl = xa_load(&ctx->io_bl_xa, bgid); - /* must be a mmap'able buffer ring and have pages */ - ret = false; - if (bl && bl->flags & IOBL_MMAP) - ret = atomic_inc_not_zero(&bl->refs); - rcu_read_unlock(); - - if (ret) - return bl; - - return ERR_PTR(-EINVAL); -} - -int io_pbuf_mmap(struct file *file, struct vm_area_struct *vma) +struct io_mapped_region *io_pbuf_get_region(struct io_ring_ctx *ctx, + unsigned int bgid) { - struct io_ring_ctx *ctx = file->private_data; - loff_t pgoff = vma->vm_pgoff << PAGE_SHIFT; struct io_buffer_list *bl; - int bgid, ret; - bgid = (pgoff & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; - bl = io_pbuf_get_bl(ctx, bgid); - if (IS_ERR(bl)) - return PTR_ERR(bl); + lockdep_assert_held(&ctx->mmap_lock); - ret = io_uring_mmap_pages(ctx, vma, bl->buf_pages, bl->buf_nr_pages); - io_put_bl(ctx, bl); - return ret; + bl = xa_load(&ctx->io_bl_xa, bgid); + if (!bl || !(bl->flags & IOBL_BUF_RING)) + return NULL; + return &bl->region; } diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index 36aadfe5ac00..723d0361898e 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -3,15 +3,13 @@ #define IOU_KBUF_H #include <uapi/linux/io_uring.h> +#include <linux/io_uring_types.h> enum { /* ring mapped provided buffers */ IOBL_BUF_RING = 1, - /* ring mapped provided buffers, but mmap'ed by application */ - IOBL_MMAP = 2, /* buffers are consumed incrementally rather than always fully */ - IOBL_INC = 4, - + IOBL_INC = 2, }; struct io_buffer_list { @@ -21,12 +19,11 @@ struct io_buffer_list { */ union { struct list_head buf_list; - struct { - struct page **buf_pages; - struct io_uring_buf_ring *buf_ring; - }; - struct rcu_head rcu; + struct io_uring_buf_ring *buf_ring; }; + /* count of classic/legacy buffers in buffer list */ + int nbufs; + __u16 bgid; /* below is for ring provided buffers */ @@ -37,7 +34,7 @@ struct io_buffer_list { __u16 flags; - atomic_t refs; + struct io_mapped_region region; }; struct io_buffer { @@ -61,33 +58,34 @@ struct buf_sel_arg { size_t max_len; unsigned short nr_iovs; unsigned short mode; + unsigned short buf_group; + unsigned short partial_map; }; void __user *io_buffer_select(struct io_kiocb *req, size_t *len, - unsigned int issue_flags); + unsigned buf_group, unsigned int issue_flags); int io_buffers_select(struct io_kiocb *req, struct buf_sel_arg *arg, unsigned int issue_flags); int io_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg); void io_destroy_buffers(struct io_ring_ctx *ctx); int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); -int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags); - int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); -int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags); +int io_manage_buffers_legacy(struct io_kiocb *req, unsigned int issue_flags); int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg); int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg); -void __io_put_kbuf(struct io_kiocb *req, int len, unsigned issue_flags); - bool io_kbuf_recycle_legacy(struct io_kiocb *req, unsigned issue_flags); +void io_kbuf_drop_legacy(struct io_kiocb *req); + +unsigned int __io_put_kbufs(struct io_kiocb *req, int len, int nbufs); +bool io_kbuf_commit(struct io_kiocb *req, + struct io_buffer_list *bl, int len, int nr); -void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl); -struct 
io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx, - unsigned long bgid); -int io_pbuf_mmap(struct file *file, struct vm_area_struct *vma); +struct io_mapped_region *io_pbuf_get_region(struct io_ring_ctx *ctx, + unsigned int bgid); static inline bool io_kbuf_recycle_ring(struct io_kiocb *req) { @@ -99,7 +97,6 @@ static inline bool io_kbuf_recycle_ring(struct io_kiocb *req) * to monopolize the buffer. */ if (req->buf_list) { - req->buf_index = req->buf_list->bgid; req->flags &= ~(REQ_F_BUFFER_RING|REQ_F_BUFFERS_COMMIT); return true; } @@ -124,100 +121,19 @@ static inline bool io_kbuf_recycle(struct io_kiocb *req, unsigned issue_flags) return false; } -/* Mapped buffer ring, return io_uring_buf from head */ -#define io_ring_head_to_buf(br, head, mask) &(br)->bufs[(head) & (mask)] - -static inline bool io_kbuf_commit(struct io_kiocb *req, - struct io_buffer_list *bl, int len, int nr) -{ - if (unlikely(!(req->flags & REQ_F_BUFFERS_COMMIT))) - return true; - - req->flags &= ~REQ_F_BUFFERS_COMMIT; - - if (unlikely(len < 0)) - return true; - - if (bl->flags & IOBL_INC) { - struct io_uring_buf *buf; - - buf = io_ring_head_to_buf(bl->buf_ring, bl->head, bl->mask); - if (WARN_ON_ONCE(len > buf->len)) - len = buf->len; - buf->len -= len; - if (buf->len) { - buf->addr += len; - return false; - } - } - - bl->head += nr; - return true; -} - -static inline bool __io_put_kbuf_ring(struct io_kiocb *req, int len, int nr) -{ - struct io_buffer_list *bl = req->buf_list; - bool ret = true; - - if (bl) { - ret = io_kbuf_commit(req, bl, len, nr); - req->buf_index = bl->bgid; - } - req->flags &= ~REQ_F_BUFFER_RING; - return ret; -} - -static inline void __io_put_kbuf_list(struct io_kiocb *req, int len, - struct list_head *list) -{ - if (req->flags & REQ_F_BUFFER_RING) { - __io_put_kbuf_ring(req, len, 1); - } else { - req->buf_index = req->kbuf->bgid; - list_add(&req->kbuf->list, list); - req->flags &= ~REQ_F_BUFFER_SELECTED; - } -} - -static inline void io_kbuf_drop(struct io_kiocb *req) -{ - lockdep_assert_held(&req->ctx->completion_lock); - - if (!(req->flags & (REQ_F_BUFFER_SELECTED|REQ_F_BUFFER_RING))) - return; - - /* len == 0 is fine here, non-ring will always drop all of it */ - __io_put_kbuf_list(req, 0, &req->ctx->io_buffers_comp); -} - -static inline unsigned int __io_put_kbufs(struct io_kiocb *req, int len, - int nbufs, unsigned issue_flags) -{ - unsigned int ret; - - if (!(req->flags & (REQ_F_BUFFER_RING | REQ_F_BUFFER_SELECTED))) - return 0; - - ret = IORING_CQE_F_BUFFER | (req->buf_index << IORING_CQE_BUFFER_SHIFT); - if (req->flags & REQ_F_BUFFER_RING) { - if (!__io_put_kbuf_ring(req, len, nbufs)) - ret |= IORING_CQE_F_BUF_MORE; - } else { - __io_put_kbuf(req, len, issue_flags); - } - return ret; -} - static inline unsigned int io_put_kbuf(struct io_kiocb *req, int len, unsigned issue_flags) { - return __io_put_kbufs(req, len, 1, issue_flags); + if (!(req->flags & (REQ_F_BUFFER_RING | REQ_F_BUFFER_SELECTED))) + return 0; + return __io_put_kbufs(req, len, 1); } static inline unsigned int io_put_kbufs(struct io_kiocb *req, int len, int nbufs, unsigned issue_flags) { - return __io_put_kbufs(req, len, nbufs, issue_flags); + if (!(req->flags & (REQ_F_BUFFER_RING | REQ_F_BUFFER_SELECTED))) + return 0; + return __io_put_kbufs(req, len, nbufs); } #endif diff --git a/io_uring/memmap.c b/io_uring/memmap.c index a0f32a255fd1..725dc0bec24c 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -12,6 +12,8 @@ #include "memmap.h" #include "kbuf.h" +#include "rsrc.h" +#include "zcrx.h" static void 
*io_mem_alloc_compound(struct page **pages, int nr_pages, size_t size, gfp_t gfp) @@ -35,111 +37,24 @@ static void *io_mem_alloc_compound(struct page **pages, int nr_pages, return page_address(page); } -static void *io_mem_alloc_single(struct page **pages, int nr_pages, size_t size, - gfp_t gfp) -{ - void *ret; - int i; - - for (i = 0; i < nr_pages; i++) { - pages[i] = alloc_page(gfp); - if (!pages[i]) - goto err; - } - - ret = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); - if (ret) - return ret; -err: - while (i--) - put_page(pages[i]); - return ERR_PTR(-ENOMEM); -} - -void *io_pages_map(struct page ***out_pages, unsigned short *npages, - size_t size) -{ - gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN; - struct page **pages; - int nr_pages; - void *ret; - - nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; - pages = kvmalloc_array(nr_pages, sizeof(struct page *), gfp); - if (!pages) - return ERR_PTR(-ENOMEM); - - ret = io_mem_alloc_compound(pages, nr_pages, size, gfp); - if (!IS_ERR(ret)) - goto done; - - ret = io_mem_alloc_single(pages, nr_pages, size, gfp); - if (!IS_ERR(ret)) { -done: - *out_pages = pages; - *npages = nr_pages; - return ret; - } - - kvfree(pages); - *out_pages = NULL; - *npages = 0; - return ret; -} - -void io_pages_unmap(void *ptr, struct page ***pages, unsigned short *npages, - bool put_pages) -{ - bool do_vunmap = false; - - if (!ptr) - return; - - if (put_pages && *npages) { - struct page **to_free = *pages; - int i; - - /* - * Only did vmap for the non-compound multiple page case. - * For the compound page, we just need to put the head. - */ - if (PageCompound(to_free[0])) - *npages = 1; - else if (*npages > 1) - do_vunmap = true; - for (i = 0; i < *npages; i++) - put_page(to_free[i]); - } - if (do_vunmap) - vunmap(ptr); - kvfree(*pages); - *pages = NULL; - *npages = 0; -} - -void io_pages_free(struct page ***pages, int npages) -{ - struct page **page_array = *pages; - - if (!page_array) - return; - - unpin_user_pages(page_array, npages); - kvfree(page_array); - *pages = NULL; -} - struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages) { unsigned long start, end, nr_pages; struct page **pages; int ret; - end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (check_add_overflow(uaddr, len, &end)) + return ERR_PTR(-EOVERFLOW); + if (check_add_overflow(end, PAGE_SIZE - 1, &end)) + return ERR_PTR(-EOVERFLOW); + + end = end >> PAGE_SHIFT; start = uaddr >> PAGE_SHIFT; nr_pages = end - start; if (WARN_ON_ONCE(!nr_pages)) return ERR_PTR(-EINVAL); + if (WARN_ON_ONCE(nr_pages > INT_MAX)) + return ERR_PTR(-EOVERFLOW); pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); if (!pages) @@ -164,89 +79,256 @@ struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages) return ERR_PTR(ret); } -void *__io_uaddr_map(struct page ***pages, unsigned short *npages, - unsigned long uaddr, size_t size) +enum { + /* memory was vmap'ed for the kernel, freeing the region vunmap's it */ + IO_REGION_F_VMAP = 1, + /* memory is provided by user and pinned by the kernel */ + IO_REGION_F_USER_PROVIDED = 2, + /* only the first page in the array is ref'ed */ + IO_REGION_F_SINGLE_REF = 4, +}; + +void io_free_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr) { - struct page **page_array; - unsigned int nr_pages; - void *page_addr; + if (mr->pages) { + long nr_refs = mr->nr_pages; - *npages = 0; + if (mr->flags & IO_REGION_F_SINGLE_REF) + nr_refs = 1; - if (uaddr & (PAGE_SIZE - 1) || !size) - return ERR_PTR(-EINVAL); + 
if (mr->flags & IO_REGION_F_USER_PROVIDED) + unpin_user_pages(mr->pages, nr_refs); + else + release_pages(mr->pages, nr_refs); - nr_pages = 0; - page_array = io_pin_pages(uaddr, size, &nr_pages); - if (IS_ERR(page_array)) - return page_array; + kvfree(mr->pages); + } + if ((mr->flags & IO_REGION_F_VMAP) && mr->ptr) + vunmap(mr->ptr); + if (mr->nr_pages && ctx->user) + __io_unaccount_mem(ctx->user, mr->nr_pages); - page_addr = vmap(page_array, nr_pages, VM_MAP, PAGE_KERNEL); - if (page_addr) { - *pages = page_array; - *npages = nr_pages; - return page_addr; + memset(mr, 0, sizeof(*mr)); +} + +static int io_region_init_ptr(struct io_mapped_region *mr) +{ + struct io_imu_folio_data ifd; + void *ptr; + + if (io_check_coalesce_buffer(mr->pages, mr->nr_pages, &ifd)) { + if (ifd.nr_folios == 1 && !PageHighMem(mr->pages[0])) { + mr->ptr = page_address(mr->pages[0]); + return 0; + } } + ptr = vmap(mr->pages, mr->nr_pages, VM_MAP, PAGE_KERNEL); + if (!ptr) + return -ENOMEM; - io_pages_free(&page_array, nr_pages); - return ERR_PTR(-ENOMEM); + mr->ptr = ptr; + mr->flags |= IO_REGION_F_VMAP; + return 0; } -static void *io_uring_validate_mmap_request(struct file *file, loff_t pgoff, - size_t sz) +static int io_region_pin_pages(struct io_ring_ctx *ctx, + struct io_mapped_region *mr, + struct io_uring_region_desc *reg) +{ + unsigned long size = mr->nr_pages << PAGE_SHIFT; + struct page **pages; + int nr_pages; + + pages = io_pin_pages(reg->user_addr, size, &nr_pages); + if (IS_ERR(pages)) + return PTR_ERR(pages); + if (WARN_ON_ONCE(nr_pages != mr->nr_pages)) + return -EFAULT; + + mr->pages = pages; + mr->flags |= IO_REGION_F_USER_PROVIDED; + return 0; +} + +static int io_region_allocate_pages(struct io_ring_ctx *ctx, + struct io_mapped_region *mr, + struct io_uring_region_desc *reg, + unsigned long mmap_offset) +{ + gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN; + unsigned long size = mr->nr_pages << PAGE_SHIFT; + unsigned long nr_allocated; + struct page **pages; + void *p; + + pages = kvmalloc_array(mr->nr_pages, sizeof(*pages), gfp); + if (!pages) + return -ENOMEM; + + p = io_mem_alloc_compound(pages, mr->nr_pages, size, gfp); + if (!IS_ERR(p)) { + mr->flags |= IO_REGION_F_SINGLE_REF; + goto done; + } + + nr_allocated = alloc_pages_bulk_node(gfp, NUMA_NO_NODE, + mr->nr_pages, pages); + if (nr_allocated != mr->nr_pages) { + if (nr_allocated) + release_pages(pages, nr_allocated); + kvfree(pages); + return -ENOMEM; + } +done: + reg->mmap_offset = mmap_offset; + mr->pages = pages; + return 0; +} + +int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr, + struct io_uring_region_desc *reg, + unsigned long mmap_offset) +{ + int nr_pages, ret; + u64 end; + + if (WARN_ON_ONCE(mr->pages || mr->ptr || mr->nr_pages)) + return -EFAULT; + if (memchr_inv(®->__resv, 0, sizeof(reg->__resv))) + return -EINVAL; + if (reg->flags & ~IORING_MEM_REGION_TYPE_USER) + return -EINVAL; + /* user_addr should be set IFF it's a user memory backed region */ + if ((reg->flags & IORING_MEM_REGION_TYPE_USER) != !!reg->user_addr) + return -EFAULT; + if (!reg->size || reg->mmap_offset || reg->id) + return -EINVAL; + if ((reg->size >> PAGE_SHIFT) > INT_MAX) + return -E2BIG; + if ((reg->user_addr | reg->size) & ~PAGE_MASK) + return -EINVAL; + if (check_add_overflow(reg->user_addr, reg->size, &end)) + return -EOVERFLOW; + + nr_pages = reg->size >> PAGE_SHIFT; + if (ctx->user) { + ret = __io_account_mem(ctx->user, nr_pages); + if (ret) + return ret; + } + mr->nr_pages = nr_pages; + + if (reg->flags & 
IORING_MEM_REGION_TYPE_USER) + ret = io_region_pin_pages(ctx, mr, reg); + else + ret = io_region_allocate_pages(ctx, mr, reg, mmap_offset); + if (ret) + goto out_free; + + ret = io_region_init_ptr(mr); + if (ret) + goto out_free; + return 0; +out_free: + io_free_region(ctx, mr); + return ret; +} + +int io_create_region_mmap_safe(struct io_ring_ctx *ctx, struct io_mapped_region *mr, + struct io_uring_region_desc *reg, + unsigned long mmap_offset) +{ + struct io_mapped_region tmp_mr; + int ret; + + memcpy(&tmp_mr, mr, sizeof(tmp_mr)); + ret = io_create_region(ctx, &tmp_mr, reg, mmap_offset); + if (ret) + return ret; + + /* + * Once published mmap can find it without holding only the ->mmap_lock + * and not ->uring_lock. + */ + guard(mutex)(&ctx->mmap_lock); + memcpy(mr, &tmp_mr, sizeof(tmp_mr)); + return 0; +} + +static struct io_mapped_region *io_mmap_get_region(struct io_ring_ctx *ctx, + loff_t pgoff) { - struct io_ring_ctx *ctx = file->private_data; loff_t offset = pgoff << PAGE_SHIFT; + unsigned int id; - switch ((pgoff << PAGE_SHIFT) & IORING_OFF_MMAP_MASK) { + + switch (offset & IORING_OFF_MMAP_MASK) { case IORING_OFF_SQ_RING: case IORING_OFF_CQ_RING: - /* Don't allow mmap if the ring was setup without it */ - if (ctx->flags & IORING_SETUP_NO_MMAP) - return ERR_PTR(-EINVAL); - return ctx->rings; + return &ctx->ring_region; case IORING_OFF_SQES: - /* Don't allow mmap if the ring was setup without it */ - if (ctx->flags & IORING_SETUP_NO_MMAP) - return ERR_PTR(-EINVAL); - return ctx->sq_sqes; - case IORING_OFF_PBUF_RING: { - struct io_buffer_list *bl; - unsigned int bgid; - void *ptr; - - bgid = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; - bl = io_pbuf_get_bl(ctx, bgid); - if (IS_ERR(bl)) - return bl; - ptr = bl->buf_ring; - io_put_bl(ctx, bl); - return ptr; - } + return &ctx->sq_region; + case IORING_OFF_PBUF_RING: + id = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; + return io_pbuf_get_region(ctx, id); + case IORING_MAP_OFF_PARAM_REGION: + return &ctx->param_region; + case IORING_MAP_OFF_ZCRX_REGION: + id = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_ZCRX_SHIFT; + return io_zcrx_get_region(ctx, id); } + return NULL; +} + +static void *io_region_validate_mmap(struct io_ring_ctx *ctx, + struct io_mapped_region *mr) +{ + lockdep_assert_held(&ctx->mmap_lock); + + if (!io_region_is_set(mr)) + return ERR_PTR(-EINVAL); + if (mr->flags & IO_REGION_F_USER_PROVIDED) + return ERR_PTR(-EINVAL); - return ERR_PTR(-EINVAL); + return io_region_get_ptr(mr); } -int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma, - struct page **pages, int npages) +static void *io_uring_validate_mmap_request(struct file *file, loff_t pgoff, + size_t sz) { - unsigned long nr_pages = npages; + struct io_ring_ctx *ctx = file->private_data; + struct io_mapped_region *region; - vm_flags_set(vma, VM_DONTEXPAND); - return vm_insert_pages(vma, vma->vm_start, pages, &nr_pages); + region = io_mmap_get_region(ctx, pgoff); + if (!region) + return ERR_PTR(-EINVAL); + return io_region_validate_mmap(ctx, region); } #ifdef CONFIG_MMU +static int io_region_mmap(struct io_ring_ctx *ctx, + struct io_mapped_region *mr, + struct vm_area_struct *vma, + unsigned max_pages) +{ + unsigned long nr_pages = min(mr->nr_pages, max_pages); + + vm_flags_set(vma, VM_DONTEXPAND); + return vm_insert_pages(vma, vma->vm_start, mr->pages, &nr_pages); +} + __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) { struct io_ring_ctx *ctx = file->private_data; size_t sz = vma->vm_end - 
vma->vm_start; long offset = vma->vm_pgoff << PAGE_SHIFT; - unsigned int npages; + unsigned int page_limit = UINT_MAX; + struct io_mapped_region *region; void *ptr; + guard(mutex)(&ctx->mmap_lock); + ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz); if (IS_ERR(ptr)) return PTR_ERR(ptr); @@ -254,22 +336,19 @@ __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) switch (offset & IORING_OFF_MMAP_MASK) { case IORING_OFF_SQ_RING: case IORING_OFF_CQ_RING: - npages = min(ctx->n_ring_pages, (sz + PAGE_SIZE - 1) >> PAGE_SHIFT); - return io_uring_mmap_pages(ctx, vma, ctx->ring_pages, npages); - case IORING_OFF_SQES: - return io_uring_mmap_pages(ctx, vma, ctx->sqe_pages, - ctx->n_sqe_pages); - case IORING_OFF_PBUF_RING: - return io_pbuf_mmap(file, vma); + page_limit = (sz + PAGE_SIZE - 1) >> PAGE_SHIFT; + break; } - return -EINVAL; + region = io_mmap_get_region(ctx, vma->vm_pgoff); + return io_region_mmap(ctx, region, vma, page_limit); } unsigned long io_uring_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { + struct io_ring_ctx *ctx = filp->private_data; void *ptr; /* @@ -280,6 +359,8 @@ unsigned long io_uring_get_unmapped_area(struct file *filp, unsigned long addr, if (addr) return -EINVAL; + guard(mutex)(&ctx->mmap_lock); + ptr = io_uring_validate_mmap_request(filp, pgoff, len); if (IS_ERR(ptr)) return -ENOMEM; @@ -325,8 +406,11 @@ unsigned long io_uring_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { + struct io_ring_ctx *ctx = file->private_data; void *ptr; + guard(mutex)(&ctx->mmap_lock); + ptr = io_uring_validate_mmap_request(file, pgoff, len); if (IS_ERR(ptr)) return PTR_ERR(ptr); diff --git a/io_uring/memmap.h b/io_uring/memmap.h index 5cec5b7ac49a..08419684e4bc 100644 --- a/io_uring/memmap.h +++ b/io_uring/memmap.h @@ -1,18 +1,12 @@ #ifndef IO_URING_MEMMAP_H #define IO_URING_MEMMAP_H -struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); -void io_pages_free(struct page ***pages, int npages); -int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma, - struct page **pages, int npages); +#define IORING_MAP_OFF_PARAM_REGION 0x20000000ULL +#define IORING_MAP_OFF_ZCRX_REGION 0x30000000ULL -void *io_pages_map(struct page ***out_pages, unsigned short *npages, - size_t size); -void io_pages_unmap(void *ptr, struct page ***pages, unsigned short *npages, - bool put_pages); +#define IORING_OFF_ZCRX_SHIFT 16 -void *__io_uaddr_map(struct page ***pages, unsigned short *npages, - unsigned long uaddr, size_t size); +struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages); #ifndef CONFIG_MMU unsigned int io_uring_nommu_mmap_capabilities(struct file *file); @@ -22,4 +16,24 @@ unsigned long io_uring_get_unmapped_area(struct file *file, unsigned long addr, unsigned long flags); int io_uring_mmap(struct file *file, struct vm_area_struct *vma); +void io_free_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr); +int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr, + struct io_uring_region_desc *reg, + unsigned long mmap_offset); + +int io_create_region_mmap_safe(struct io_ring_ctx *ctx, + struct io_mapped_region *mr, + struct io_uring_region_desc *reg, + unsigned long mmap_offset); + +static inline void *io_region_get_ptr(struct io_mapped_region *mr) +{ + return mr->ptr; +} + +static inline bool io_region_is_set(struct io_mapped_region *mr) +{ 
+ return !!mr->nr_pages; +} + #endif diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c index 7fd9badcfaf8..71400d6cefc8 100644 --- a/io_uring/msg_ring.c +++ b/io_uring/msg_ring.c @@ -38,8 +38,8 @@ static void io_double_unlock_ctx(struct io_ring_ctx *octx) mutex_unlock(&octx->uring_lock); } -static int io_double_lock_ctx(struct io_ring_ctx *octx, - unsigned int issue_flags) +static int io_lock_external_ctx(struct io_ring_ctx *octx, + unsigned int issue_flags) { /* * To ensure proper ordering between the two ctxs, we can only @@ -71,7 +71,7 @@ static inline bool io_msg_need_remote(struct io_ring_ctx *target_ctx) return target_ctx->task_complete; } -static void io_msg_tw_complete(struct io_kiocb *req, struct io_tw_state *ts) +static void io_msg_tw_complete(struct io_kiocb *req, io_tw_token_t tw) { struct io_ring_ctx *ctx = req->ctx; @@ -89,17 +89,18 @@ static void io_msg_tw_complete(struct io_kiocb *req, struct io_tw_state *ts) static int io_msg_remote_post(struct io_ring_ctx *ctx, struct io_kiocb *req, int res, u32 cflags, u64 user_data) { - req->task = READ_ONCE(ctx->submitter_task); - if (!req->task) { + if (!READ_ONCE(ctx->submitter_task)) { kmem_cache_free(req_cachep, req); return -EOWNERDEAD; } + req->opcode = IORING_OP_NOP; req->cqe.user_data = user_data; io_req_set_res(req, res, cflags); percpu_ref_get(&ctx->refs); req->ctx = ctx; + req->tctx = NULL; req->io_task_work.func = io_msg_tw_complete; - io_req_task_work_add_remote(req, ctx, IOU_F_TWQ_LAZY_WAKE); + io_req_task_work_add_remote(req, IOU_F_TWQ_LAZY_WAKE); return 0; } @@ -116,14 +117,13 @@ static struct io_kiocb *io_msg_get_kiocb(struct io_ring_ctx *ctx) return kmem_cache_alloc(req_cachep, GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO); } -static int io_msg_data_remote(struct io_kiocb *req) +static int io_msg_data_remote(struct io_ring_ctx *target_ctx, + struct io_msg *msg) { - struct io_ring_ctx *target_ctx = req->file->private_data; - struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg); struct io_kiocb *target; u32 flags = 0; - target = io_msg_get_kiocb(req->ctx); + target = io_msg_get_kiocb(target_ctx); if (unlikely(!target)) return -ENOMEM; @@ -134,10 +134,9 @@ static int io_msg_data_remote(struct io_kiocb *req) msg->user_data); } -static int io_msg_ring_data(struct io_kiocb *req, unsigned int issue_flags) +static int __io_msg_ring_data(struct io_ring_ctx *target_ctx, + struct io_msg *msg, unsigned int issue_flags) { - struct io_ring_ctx *target_ctx = req->file->private_data; - struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg); u32 flags = 0; int ret; @@ -149,14 +148,14 @@ static int io_msg_ring_data(struct io_kiocb *req, unsigned int issue_flags) return -EBADFD; if (io_msg_need_remote(target_ctx)) - return io_msg_data_remote(req); + return io_msg_data_remote(target_ctx, msg); if (msg->flags & IORING_MSG_RING_FLAGS_PASS) flags = msg->cqe_flags; ret = -EOVERFLOW; if (target_ctx->flags & IORING_SETUP_IOPOLL) { - if (unlikely(io_double_lock_ctx(target_ctx, issue_flags))) + if (unlikely(io_lock_external_ctx(target_ctx, issue_flags))) return -EAGAIN; } if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, flags)) @@ -166,22 +165,32 @@ static int io_msg_ring_data(struct io_kiocb *req, unsigned int issue_flags) return ret; } -static struct file *io_msg_grab_file(struct io_kiocb *req, unsigned int issue_flags) +static int io_msg_ring_data(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_ring_ctx *target_ctx = req->file->private_data; + struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg); + + return 
__io_msg_ring_data(target_ctx, msg, issue_flags); +} + +static int io_msg_grab_file(struct io_kiocb *req, unsigned int issue_flags) { struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg); struct io_ring_ctx *ctx = req->ctx; - struct file *file = NULL; - int idx = msg->src_fd; + struct io_rsrc_node *node; + int ret = -EBADF; io_ring_submit_lock(ctx, issue_flags); - if (likely(idx < ctx->nr_user_files)) { - idx = array_index_nospec(idx, ctx->nr_user_files); - file = io_file_from_index(&ctx->file_table, idx); - if (file) - get_file(file); + node = io_rsrc_node_lookup(&ctx->file_table.data, msg->src_fd); + if (node) { + msg->src_file = io_slot_file(node); + if (msg->src_file) + get_file(msg->src_file); + req->flags |= REQ_F_NEED_CLEANUP; + ret = 0; } io_ring_submit_unlock(ctx, issue_flags); - return file; + return ret; } static int io_msg_install_complete(struct io_kiocb *req, unsigned int issue_flags) @@ -191,7 +200,7 @@ static int io_msg_install_complete(struct io_kiocb *req, unsigned int issue_flag struct file *src_file = msg->src_file; int ret; - if (unlikely(io_double_lock_ctx(target_ctx, issue_flags))) + if (unlikely(io_lock_external_ctx(target_ctx, issue_flags))) return -EAGAIN; ret = __io_fixed_fd_install(target_ctx, src_file, msg->dst_fd); @@ -250,7 +259,6 @@ static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags) struct io_ring_ctx *target_ctx = req->file->private_data; struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg); struct io_ring_ctx *ctx = req->ctx; - struct file *src_file = msg->src_file; if (msg->len) return -EINVAL; @@ -258,12 +266,10 @@ static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags) return -EINVAL; if (target_ctx->flags & IORING_SETUP_R_DISABLED) return -EBADFD; - if (!src_file) { - src_file = io_msg_grab_file(req, issue_flags); - if (!src_file) - return -EBADF; - msg->src_file = src_file; - req->flags |= REQ_F_NEED_CLEANUP; + if (!msg->src_file) { + int ret = io_msg_grab_file(req, issue_flags); + if (unlikely(ret)) + return ret; } if (io_msg_need_remote(target_ctx)) @@ -271,10 +277,8 @@ static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags) return io_msg_install_complete(req, issue_flags); } -int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +static int __io_msg_ring_prep(struct io_msg *msg, const struct io_uring_sqe *sqe) { - struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg); - if (unlikely(sqe->buf_index || sqe->personality)) return -EINVAL; @@ -291,6 +295,11 @@ int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } +int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + return __io_msg_ring_prep(io_kiocb_to_cmd(req, struct io_msg), sqe); +} + int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags) { struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg); @@ -319,12 +328,30 @@ done: req_set_fail(req); } io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } -void io_msg_cache_free(const void *entry) +int io_uring_sync_msg_ring(struct io_uring_sqe *sqe) { - struct io_kiocb *req = (struct io_kiocb *) entry; + struct io_msg io_msg = { }; + int ret; + + ret = __io_msg_ring_prep(&io_msg, sqe); + if (unlikely(ret)) + return ret; + + /* + * Only data sending supported, not IORING_MSG_SEND_FD as that one + * doesn't make sense without a source ring to send files from. 
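
io_uring_sync_msg_ring() lets a data-only message be pushed into a target ring with no source ring involved, reusing the same __io_msg_ring_prep()/__io_msg_ring_data() helpers as the regular IORING_OP_MSG_RING opcode. For the asynchronous opcode, a minimal application-side sketch, not part of this patch: io_uring_prep_msg_ring() is liburing's prep helper, and the function name, length 128 and user_data 0xcafe are arbitrary.

#include <liburing.h>

/* Sketch: post a CQE with user_data 0xcafe and res 128 onto another ring.
 * "ring" is the sending ring, target_fd the fd of the receiving ring;
 * the receiver sees the values in cqe->user_data and cqe->res. */
static int send_ring_msg(struct io_uring *ring, int target_fd)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

	if (!sqe)
		return -EBUSY;
	io_uring_prep_msg_ring(sqe, target_fd, 128, 0xcafe, 0);
	return io_uring_submit(ring);
}
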
+ */ + if (io_msg.cmd != IORING_MSG_DATA) + return -EINVAL; - kmem_cache_free(req_cachep, req); + CLASS(fd, f)(sqe->fd); + if (fd_empty(f)) + return -EBADF; + if (!io_is_uring_fops(fd_file(f))) + return -EBADFD; + return __io_msg_ring_data(fd_file(f)->private_data, + &io_msg, IO_URING_F_UNLOCKED); } diff --git a/io_uring/msg_ring.h b/io_uring/msg_ring.h index 3030f3942f0f..32236d2fb778 100644 --- a/io_uring/msg_ring.h +++ b/io_uring/msg_ring.h @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 +int io_uring_sync_msg_ring(struct io_uring_sqe *sqe); int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags); void io_msg_ring_cleanup(struct io_kiocb *req); -void io_msg_cache_free(const void *entry); diff --git a/io_uring/napi.c b/io_uring/napi.c index d0cf694d0172..4a10de03e426 100644 --- a/io_uring/napi.c +++ b/io_uring/napi.c @@ -38,67 +38,88 @@ static inline ktime_t net_to_ktime(unsigned long t) return ns_to_ktime(t << 10); } -void __io_napi_add(struct io_ring_ctx *ctx, struct socket *sock) +int __io_napi_add_id(struct io_ring_ctx *ctx, unsigned int napi_id) { struct hlist_head *hash_list; - unsigned int napi_id; - struct sock *sk; struct io_napi_entry *e; - sk = sock->sk; - if (!sk) - return; - - napi_id = READ_ONCE(sk->sk_napi_id); - /* Non-NAPI IDs can be rejected. */ - if (napi_id < MIN_NAPI_ID) - return; + if (!napi_id_valid(napi_id)) + return -EINVAL; hash_list = &ctx->napi_ht[hash_min(napi_id, HASH_BITS(ctx->napi_ht))]; - rcu_read_lock(); - e = io_napi_hash_find(hash_list, napi_id); - if (e) { - e->timeout = jiffies + NAPI_TIMEOUT; - rcu_read_unlock(); - return; + scoped_guard(rcu) { + e = io_napi_hash_find(hash_list, napi_id); + if (e) { + WRITE_ONCE(e->timeout, jiffies + NAPI_TIMEOUT); + return -EEXIST; + } } - rcu_read_unlock(); e = kmalloc(sizeof(*e), GFP_NOWAIT); if (!e) - return; + return -ENOMEM; e->napi_id = napi_id; e->timeout = jiffies + NAPI_TIMEOUT; + /* + * guard(spinlock) is not used to manually unlock it before calling + * kfree() + */ spin_lock(&ctx->napi_lock); if (unlikely(io_napi_hash_find(hash_list, napi_id))) { spin_unlock(&ctx->napi_lock); kfree(e); - return; + return -EEXIST; } hlist_add_tail_rcu(&e->node, hash_list); - list_add_tail(&e->list, &ctx->napi_list); + list_add_tail_rcu(&e->list, &ctx->napi_list); spin_unlock(&ctx->napi_lock); + return 0; +} + +static int __io_napi_del_id(struct io_ring_ctx *ctx, unsigned int napi_id) +{ + struct hlist_head *hash_list; + struct io_napi_entry *e; + + /* Non-NAPI IDs can be rejected. */ + if (!napi_id_valid(napi_id)) + return -EINVAL; + + hash_list = &ctx->napi_ht[hash_min(napi_id, HASH_BITS(ctx->napi_ht))]; + guard(spinlock)(&ctx->napi_lock); + e = io_napi_hash_find(hash_list, napi_id); + if (!e) + return -ENOENT; + + list_del_rcu(&e->list); + hash_del_rcu(&e->node); + kfree_rcu(e, rcu); + return 0; } static void __io_napi_remove_stale(struct io_ring_ctx *ctx) { struct io_napi_entry *e; - unsigned int i; - spin_lock(&ctx->napi_lock); - hash_for_each(ctx->napi_ht, i, e, node) { - if (time_after(jiffies, e->timeout)) { - list_del(&e->list); + guard(spinlock)(&ctx->napi_lock); + /* + * list_for_each_entry_safe() is not required as long as: + * 1. list_del_rcu() does not reset the deleted node next pointer + * 2. 
kfree_rcu() delays the memory freeing until the next quiescent + * state + */ + list_for_each_entry(e, &ctx->napi_list, list) { + if (time_after(jiffies, READ_ONCE(e->timeout))) { + list_del_rcu(&e->list); hash_del_rcu(&e->node); kfree_rcu(e, rcu); } } - spin_unlock(&ctx->napi_lock); } static inline void io_napi_remove_stale(struct io_ring_ctx *ctx, bool is_stale) @@ -136,45 +157,73 @@ static bool io_napi_busy_loop_should_end(void *data, return false; } -static bool __io_napi_do_busy_loop(struct io_ring_ctx *ctx, - void *loop_end_arg) +/* + * never report stale entries + */ +static bool static_tracking_do_busy_loop(struct io_ring_ctx *ctx, + bool (*loop_end)(void *, unsigned long), + void *loop_end_arg) +{ + struct io_napi_entry *e; + + list_for_each_entry_rcu(e, &ctx->napi_list, list) + napi_busy_loop_rcu(e->napi_id, loop_end, loop_end_arg, + ctx->napi_prefer_busy_poll, BUSY_POLL_BUDGET); + return false; +} + +static bool +dynamic_tracking_do_busy_loop(struct io_ring_ctx *ctx, + bool (*loop_end)(void *, unsigned long), + void *loop_end_arg) { struct io_napi_entry *e; - bool (*loop_end)(void *, unsigned long) = NULL; bool is_stale = false; - if (loop_end_arg) - loop_end = io_napi_busy_loop_should_end; - list_for_each_entry_rcu(e, &ctx->napi_list, list) { napi_busy_loop_rcu(e->napi_id, loop_end, loop_end_arg, ctx->napi_prefer_busy_poll, BUSY_POLL_BUDGET); - if (time_after(jiffies, e->timeout)) + if (time_after(jiffies, READ_ONCE(e->timeout))) is_stale = true; } return is_stale; } +static inline bool +__io_napi_do_busy_loop(struct io_ring_ctx *ctx, + bool (*loop_end)(void *, unsigned long), + void *loop_end_arg) +{ + if (READ_ONCE(ctx->napi_track_mode) == IO_URING_NAPI_TRACKING_STATIC) + return static_tracking_do_busy_loop(ctx, loop_end, loop_end_arg); + return dynamic_tracking_do_busy_loop(ctx, loop_end, loop_end_arg); +} + static void io_napi_blocking_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq) { unsigned long start_time = busy_loop_current_time(); + bool (*loop_end)(void *, unsigned long) = NULL; void *loop_end_arg = NULL; bool is_stale = false; /* Singular lists use a different napi loop end check function and are * only executed once. 
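
The two busy-loop helpers above are what the new napi_track_mode selects between: dynamic tracking adds and expires entries as requests touch sockets, while static tracking keeps application-pinned NAPI IDs and never reports them stale. Registration is handled by io_napi_register_napi() further down; the sketch below shows how the modes might be driven from userspace, assuming a liburing that provides io_uring_register_napi() and a UAPI struct io_uring_napi carrying the new opcode/op_param fields. The function name, 50us timeout and taking the ID from SO_INCOMING_NAPI_ID are illustrative.

#include <liburing.h>
#include <string.h>

/* Sketch: switch the ring to static NAPI tracking, then pin one NAPI ID. */
static int enable_static_napi(struct io_uring *ring, unsigned int napi_id)
{
	struct io_uring_napi napi = {
		.busy_poll_to		= 50,	/* poll timeout in usec, arbitrary */
		.prefer_busy_poll	= 1,
		.opcode			= IO_URING_NAPI_REGISTER_OP,
		.op_param		= IO_URING_NAPI_TRACKING_STATIC,
	};
	int ret;

	ret = io_uring_register_napi(ring, &napi);
	if (ret)
		return ret;

	memset(&napi, 0, sizeof(napi));
	napi.opcode = IO_URING_NAPI_STATIC_ADD_ID;
	napi.op_param = napi_id;	/* e.g. read via SO_INCOMING_NAPI_ID */
	return io_uring_register_napi(ring, &napi);
}
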
*/ - if (list_is_singular(&ctx->napi_list)) + if (list_is_singular(&ctx->napi_list)) { + loop_end = io_napi_busy_loop_should_end; loop_end_arg = iowq; + } - rcu_read_lock(); - do { - is_stale = __io_napi_do_busy_loop(ctx, loop_end_arg); - } while (!io_napi_busy_loop_should_end(iowq, start_time) && !loop_end_arg); - rcu_read_unlock(); + scoped_guard(rcu) { + do { + is_stale = __io_napi_do_busy_loop(ctx, loop_end, + loop_end_arg); + } while (!io_napi_busy_loop_should_end(iowq, start_time) && + !loop_end_arg); + } io_napi_remove_stale(ctx, is_stale); } @@ -193,6 +242,7 @@ void io_napi_init(struct io_ring_ctx *ctx) spin_lock_init(&ctx->napi_lock); ctx->napi_prefer_busy_poll = false; ctx->napi_busy_poll_dt = ns_to_ktime(sys_dt); + ctx->napi_track_mode = IO_URING_NAPI_TRACKING_INACTIVE; } /* @@ -204,14 +254,31 @@ void io_napi_init(struct io_ring_ctx *ctx) void io_napi_free(struct io_ring_ctx *ctx) { struct io_napi_entry *e; - unsigned int i; - spin_lock(&ctx->napi_lock); - hash_for_each(ctx->napi_ht, i, e, node) { + guard(spinlock)(&ctx->napi_lock); + list_for_each_entry(e, &ctx->napi_list, list) { hash_del_rcu(&e->node); kfree_rcu(e, rcu); } - spin_unlock(&ctx->napi_lock); + INIT_LIST_HEAD_RCU(&ctx->napi_list); +} + +static int io_napi_register_napi(struct io_ring_ctx *ctx, + struct io_uring_napi *napi) +{ + switch (napi->op_param) { + case IO_URING_NAPI_TRACKING_DYNAMIC: + case IO_URING_NAPI_TRACKING_STATIC: + break; + default: + return -EINVAL; + } + /* clean the napi list for new settings */ + io_napi_free(ctx); + WRITE_ONCE(ctx->napi_track_mode, napi->op_param); + WRITE_ONCE(ctx->napi_busy_poll_dt, napi->busy_poll_to * NSEC_PER_USEC); + WRITE_ONCE(ctx->napi_prefer_busy_poll, !!napi->prefer_busy_poll); + return 0; } /* @@ -225,7 +292,8 @@ int io_register_napi(struct io_ring_ctx *ctx, void __user *arg) { const struct io_uring_napi curr = { .busy_poll_to = ktime_to_us(ctx->napi_busy_poll_dt), - .prefer_busy_poll = ctx->napi_prefer_busy_poll + .prefer_busy_poll = ctx->napi_prefer_busy_poll, + .op_param = ctx->napi_track_mode }; struct io_uring_napi napi; @@ -233,16 +301,26 @@ int io_register_napi(struct io_ring_ctx *ctx, void __user *arg) return -EINVAL; if (copy_from_user(&napi, arg, sizeof(napi))) return -EFAULT; - if (napi.pad[0] || napi.pad[1] || napi.pad[2] || napi.resv) + if (napi.pad[0] || napi.pad[1] || napi.resv) return -EINVAL; if (copy_to_user(arg, &curr, sizeof(curr))) return -EFAULT; - WRITE_ONCE(ctx->napi_busy_poll_dt, napi.busy_poll_to * NSEC_PER_USEC); - WRITE_ONCE(ctx->napi_prefer_busy_poll, !!napi.prefer_busy_poll); - WRITE_ONCE(ctx->napi_enabled, true); - return 0; + switch (napi.opcode) { + case IO_URING_NAPI_REGISTER_OP: + return io_napi_register_napi(ctx, &napi); + case IO_URING_NAPI_STATIC_ADD_ID: + if (curr.op_param != IO_URING_NAPI_TRACKING_STATIC) + return -EINVAL; + return __io_napi_add_id(ctx, napi.op_param); + case IO_URING_NAPI_STATIC_DEL_ID: + if (curr.op_param != IO_URING_NAPI_TRACKING_STATIC) + return -EINVAL; + return __io_napi_del_id(ctx, napi.op_param); + default: + return -EINVAL; + } } /* @@ -265,7 +343,7 @@ int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg) WRITE_ONCE(ctx->napi_busy_poll_dt, 0); WRITE_ONCE(ctx->napi_prefer_busy_poll, false); - WRITE_ONCE(ctx->napi_enabled, false); + WRITE_ONCE(ctx->napi_track_mode, IO_URING_NAPI_TRACKING_INACTIVE); return 0; } @@ -307,9 +385,9 @@ int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx) if (list_empty_careful(&ctx->napi_list)) return 0; - rcu_read_lock(); - is_stale = __io_napi_do_busy_loop(ctx, 
NULL); - rcu_read_unlock(); + scoped_guard(rcu) { + is_stale = __io_napi_do_busy_loop(ctx, NULL, NULL); + } io_napi_remove_stale(ctx, is_stale); return 1; diff --git a/io_uring/napi.h b/io_uring/napi.h index fd275ef0456d..fa742f42e09b 100644 --- a/io_uring/napi.h +++ b/io_uring/napi.h @@ -15,7 +15,7 @@ void io_napi_free(struct io_ring_ctx *ctx); int io_register_napi(struct io_ring_ctx *ctx, void __user *arg); int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg); -void __io_napi_add(struct io_ring_ctx *ctx, struct socket *sock); +int __io_napi_add_id(struct io_ring_ctx *ctx, unsigned int napi_id); void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq); int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx); @@ -44,12 +44,12 @@ static inline void io_napi_add(struct io_kiocb *req) struct io_ring_ctx *ctx = req->ctx; struct socket *sock; - if (!READ_ONCE(ctx->napi_enabled)) + if (READ_ONCE(ctx->napi_track_mode) != IO_URING_NAPI_TRACKING_DYNAMIC) return; sock = sock_from_file(req->file); - if (sock) - __io_napi_add(ctx, sock); + if (sock && sock->sk) + __io_napi_add_id(ctx, READ_ONCE(sock->sk->sk_napi_id)); } #else diff --git a/io_uring/net.c b/io_uring/net.c index 18507658a921..43a43522f406 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -16,8 +16,8 @@ #include "net.h" #include "notif.h" #include "rsrc.h" +#include "zcrx.h" -#if defined(CONFIG_NET) struct io_shutdown { struct file *file; int how; @@ -74,14 +74,18 @@ struct io_sr_msg { unsigned nr_multishot_loops; u16 flags; /* initialised and used only by !msg send variants */ - u16 addr_len; u16 buf_group; - void __user *addr; + unsigned short retry_flags; void __user *msg_control; /* used only for send zerocopy */ struct io_kiocb *notif; }; +enum sr_retry_flags { + IO_SR_MSG_RETRY = 1, + IO_SR_MSG_PARTIAL_MAP = 2, +}; + /* * Number of times we'll try and do receives if there's more data. 
If we * exceed this limit, then add us to the back of the queue and retry from @@ -89,6 +93,19 @@ struct io_sr_msg { */ #define MULTISHOT_MAX_RETRY 32 +struct io_recvzc { + struct file *file; + unsigned msg_flags; + u16 flags; + u32 len; + struct io_zcrx_ifq *ifq; +}; + +static int io_sg_from_iter_iovec(struct sk_buff *skb, + struct iov_iter *from, size_t length); +static int io_sg_from_iter(struct sk_buff *skb, + struct iov_iter *from, size_t length); + int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_shutdown *shutdown = io_kiocb_to_cmd(req, struct io_shutdown); @@ -116,7 +133,7 @@ int io_shutdown(struct io_kiocb *req, unsigned int issue_flags) ret = __sys_shutdown_sock(sock, shutdown->how); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } static bool io_net_retry(struct socket *sock, int flags) @@ -128,17 +145,13 @@ static bool io_net_retry(struct socket *sock, int flags) static void io_netmsg_iovec_free(struct io_async_msghdr *kmsg) { - if (kmsg->free_iov) { - kfree(kmsg->free_iov); - kmsg->free_iov_nr = 0; - kmsg->free_iov = NULL; - } + if (kmsg->vec.iovec) + io_vec_free(&kmsg->vec); } static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags) { struct io_async_msghdr *hdr = req->async_data; - struct iovec *iov; /* can't recycle, ensure we free the iovec if we have one */ if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) { @@ -147,12 +160,13 @@ static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags) } /* Let normal cleanup path reap it if we fail adding to the cache */ - iov = hdr->free_iov; + io_alloc_cache_vec_kasan(&hdr->vec); + if (hdr->vec.nr > IO_VEC_CACHE_SOFT_CAP) + io_vec_free(&hdr->vec); + if (io_alloc_cache_put(&req->ctx->netmsg_cache, hdr)) { - if (iov) - kasan_mempool_poison_object(iov); req->async_data = NULL; - req->flags &= ~REQ_F_ASYNC_DATA; + req->flags &= ~(REQ_F_ASYNC_DATA|REQ_F_NEED_CLEANUP); } } @@ -161,39 +175,14 @@ static struct io_async_msghdr *io_msg_alloc_async(struct io_kiocb *req) struct io_ring_ctx *ctx = req->ctx; struct io_async_msghdr *hdr; - hdr = io_alloc_cache_get(&ctx->netmsg_cache); - if (hdr) { - if (hdr->free_iov) { - kasan_mempool_unpoison_object(hdr->free_iov, - hdr->free_iov_nr * sizeof(struct iovec)); - req->flags |= REQ_F_NEED_CLEANUP; - } - req->flags |= REQ_F_ASYNC_DATA; - req->async_data = hdr; - return hdr; - } - - if (!io_alloc_async_data(req)) { - hdr = req->async_data; - hdr->free_iov_nr = 0; - hdr->free_iov = NULL; - return hdr; - } - return NULL; -} + hdr = io_uring_alloc_async_data(&ctx->netmsg_cache, req); + if (!hdr) + return NULL; -/* assign new iovec to kmsg, if we need to */ -static int io_net_vec_assign(struct io_kiocb *req, struct io_async_msghdr *kmsg, - struct iovec *iov) -{ - if (iov) { + /* If the async data was cached, we might have an iov cached inside. 
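
The MULTISHOT_MAX_RETRY cap defined above still bounds how many back-to-back receives a multishot request may do before it is requeued, and buffer selection now carries the group in sr->buf_group and buf_sel_arg.buf_group rather than rewriting req->buf_index. As a rough application-side sketch, not part of this patch: io_uring_prep_recv_multishot() is liburing's helper, and the function name and buffer group 0 are arbitrary.

#include <liburing.h>

/* Sketch: multishot receive picking buffers from provided buffer group 0.
 * Each completion carries IORING_CQE_F_BUFFER with the chosen buffer id and
 * IORING_CQE_F_MORE while the request stays armed. */
static int arm_recv_multishot(struct io_uring *ring, int sockfd)
{
	struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

	if (!sqe)
		return -EBUSY;
	io_uring_prep_recv_multishot(sqe, sockfd, NULL, 0, 0);
	sqe->flags |= IOSQE_BUFFER_SELECT;
	sqe->buf_group = 0;
	return io_uring_submit(ring);
}
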
*/ + if (hdr->vec.iovec) req->flags |= REQ_F_NEED_CLEANUP; - kmsg->free_iov_nr = kmsg->msg.msg_iter.nr_segs; - if (kmsg->free_iov) - kfree(kmsg->free_iov); - kmsg->free_iov = iov; - } - return 0; + return hdr; } static inline void io_mshot_prep_retry(struct io_kiocb *req, @@ -203,150 +192,139 @@ static inline void io_mshot_prep_retry(struct io_kiocb *req, req->flags &= ~REQ_F_BL_EMPTY; sr->done_io = 0; + sr->retry_flags = 0; sr->len = 0; /* get from the provided buffer */ - req->buf_index = sr->buf_group; } -#ifdef CONFIG_COMPAT -static int io_compat_msg_copy_hdr(struct io_kiocb *req, - struct io_async_msghdr *iomsg, - struct compat_msghdr *msg, int ddir) +static int io_net_import_vec(struct io_kiocb *req, struct io_async_msghdr *iomsg, + const struct iovec __user *uiov, unsigned uvec_seg, + int ddir) { - struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - struct compat_iovec __user *uiov; struct iovec *iov; int ret, nr_segs; - if (iomsg->free_iov) { - nr_segs = iomsg->free_iov_nr; - iov = iomsg->free_iov; + if (iomsg->vec.iovec) { + nr_segs = iomsg->vec.nr; + iov = iomsg->vec.iovec; } else { - iov = &iomsg->fast_iov; nr_segs = 1; + iov = &iomsg->fast_iov; } - if (copy_from_user(msg, sr->umsg_compat, sizeof(*msg))) - return -EFAULT; - - uiov = compat_ptr(msg->msg_iov); - if (req->flags & REQ_F_BUFFER_SELECT) { - compat_ssize_t clen; - - if (msg->msg_iovlen == 0) { - sr->len = iov->iov_len = 0; - iov->iov_base = NULL; - } else if (msg->msg_iovlen > 1) { - return -EINVAL; - } else { - if (!access_ok(uiov, sizeof(*uiov))) - return -EFAULT; - if (__get_user(clen, &uiov->iov_len)) - return -EFAULT; - if (clen < 0) - return -EINVAL; - sr->len = clen; - } - - return 0; - } - - ret = __import_iovec(ddir, (struct iovec __user *)uiov, msg->msg_iovlen, - nr_segs, &iov, &iomsg->msg.msg_iter, true); + ret = __import_iovec(ddir, uiov, uvec_seg, nr_segs, &iov, + &iomsg->msg.msg_iter, io_is_compat(req->ctx)); if (unlikely(ret < 0)) return ret; - return io_net_vec_assign(req, iomsg, iov); + if (iov) { + req->flags |= REQ_F_NEED_CLEANUP; + io_vec_reset_iovec(&iomsg->vec, iov, iomsg->msg.msg_iter.nr_segs); + } + return 0; } -#endif -static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg, - struct user_msghdr *msg, int ddir) +static int io_compat_msg_copy_hdr(struct io_kiocb *req, + struct io_async_msghdr *iomsg, + struct compat_msghdr *msg, int ddir, + struct sockaddr __user **save_addr) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - struct iovec *iov; - int ret, nr_segs; - - if (iomsg->free_iov) { - nr_segs = iomsg->free_iov_nr; - iov = iomsg->free_iov; - } else { - iov = &iomsg->fast_iov; - nr_segs = 1; - } + struct compat_iovec __user *uiov; + int ret; - if (!user_access_begin(sr->umsg, sizeof(*sr->umsg))) + if (copy_from_user(msg, sr->umsg_compat, sizeof(*msg))) return -EFAULT; - ret = -EFAULT; - unsafe_get_user(msg->msg_name, &sr->umsg->msg_name, ua_end); - unsafe_get_user(msg->msg_namelen, &sr->umsg->msg_namelen, ua_end); - unsafe_get_user(msg->msg_iov, &sr->umsg->msg_iov, ua_end); - unsafe_get_user(msg->msg_iovlen, &sr->umsg->msg_iovlen, ua_end); - unsafe_get_user(msg->msg_control, &sr->umsg->msg_control, ua_end); - unsafe_get_user(msg->msg_controllen, &sr->umsg->msg_controllen, ua_end); - msg->msg_flags = 0; + ret = __get_compat_msghdr(&iomsg->msg, msg, save_addr); + if (ret) + return ret; + uiov = compat_ptr(msg->msg_iov); if (req->flags & REQ_F_BUFFER_SELECT) { if (msg->msg_iovlen == 0) { - sr->len = iov->iov_len = 0; - iov->iov_base = NULL; 
+ sr->len = 0; } else if (msg->msg_iovlen > 1) { - ret = -EINVAL; - goto ua_end; + return -EINVAL; } else { - /* we only need the length for provided buffers */ - if (!access_ok(&msg->msg_iov[0].iov_len, sizeof(__kernel_size_t))) - goto ua_end; - unsafe_get_user(iov->iov_len, &msg->msg_iov[0].iov_len, - ua_end); - sr->len = iov->iov_len; + struct compat_iovec tmp_iov; + + if (copy_from_user(&tmp_iov, uiov, sizeof(tmp_iov))) + return -EFAULT; + sr->len = tmp_iov.iov_len; } - ret = 0; -ua_end: - user_access_end(); - return ret; } + return 0; +} +static int io_copy_msghdr_from_user(struct user_msghdr *msg, + struct user_msghdr __user *umsg) +{ + if (!user_access_begin(umsg, sizeof(*umsg))) + return -EFAULT; + unsafe_get_user(msg->msg_name, &umsg->msg_name, ua_end); + unsafe_get_user(msg->msg_namelen, &umsg->msg_namelen, ua_end); + unsafe_get_user(msg->msg_iov, &umsg->msg_iov, ua_end); + unsafe_get_user(msg->msg_iovlen, &umsg->msg_iovlen, ua_end); + unsafe_get_user(msg->msg_control, &umsg->msg_control, ua_end); + unsafe_get_user(msg->msg_controllen, &umsg->msg_controllen, ua_end); user_access_end(); - ret = __import_iovec(ddir, msg->msg_iov, msg->msg_iovlen, nr_segs, - &iov, &iomsg->msg.msg_iter, false); - if (unlikely(ret < 0)) - return ret; - - return io_net_vec_assign(req, iomsg, iov); + return 0; +ua_end: + user_access_end(); + return -EFAULT; } -static int io_sendmsg_copy_hdr(struct io_kiocb *req, - struct io_async_msghdr *iomsg) +static int io_msg_copy_hdr(struct io_kiocb *req, struct io_async_msghdr *iomsg, + struct user_msghdr *msg, int ddir, + struct sockaddr __user **save_addr) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - struct user_msghdr msg; + struct user_msghdr __user *umsg = sr->umsg; int ret; iomsg->msg.msg_name = &iomsg->addr; iomsg->msg.msg_iter.nr_segs = 0; -#ifdef CONFIG_COMPAT - if (unlikely(req->ctx->compat)) { + if (io_is_compat(req->ctx)) { struct compat_msghdr cmsg; - ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ITER_SOURCE); - if (unlikely(ret)) + ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ddir, save_addr); + if (ret) return ret; - return __get_compat_msghdr(&iomsg->msg, &cmsg, NULL); + memset(msg, 0, sizeof(*msg)); + msg->msg_namelen = cmsg.msg_namelen; + msg->msg_controllen = cmsg.msg_controllen; + msg->msg_iov = compat_ptr(cmsg.msg_iov); + msg->msg_iovlen = cmsg.msg_iovlen; + return 0; } -#endif - ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_SOURCE); + ret = io_copy_msghdr_from_user(msg, umsg); if (unlikely(ret)) return ret; - ret = __copy_msghdr(&iomsg->msg, &msg, NULL); + msg->msg_flags = 0; - /* save msg_control as sys_sendmsg() overwrites it */ - sr->msg_control = iomsg->msg.msg_control_user; - return ret; + ret = __copy_msghdr(&iomsg->msg, msg, save_addr); + if (ret) + return ret; + + if (req->flags & REQ_F_BUFFER_SELECT) { + if (msg->msg_iovlen == 0) { + sr->len = 0; + } else if (msg->msg_iovlen > 1) { + return -EINVAL; + } else { + struct iovec __user *uiov = msg->msg_iov; + struct iovec tmp_iov; + + if (copy_from_user(&tmp_iov, uiov, sizeof(tmp_iov))) + return -EFAULT; + sr->len = tmp_iov.iov_len; + } + } + return 0; } void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req) @@ -356,48 +334,65 @@ void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req) io_netmsg_iovec_free(io); } -static int io_send_setup(struct io_kiocb *req) +static int io_send_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); struct io_async_msghdr *kmsg = req->async_data; + void __user 
*addr; + u16 addr_len; int ret; + sr->buf = u64_to_user_ptr(READ_ONCE(sqe->addr)); + + if (READ_ONCE(sqe->__pad3[0])) + return -EINVAL; + kmsg->msg.msg_name = NULL; kmsg->msg.msg_namelen = 0; kmsg->msg.msg_control = NULL; kmsg->msg.msg_controllen = 0; kmsg->msg.msg_ubuf = NULL; - if (sr->addr) { - ret = move_addr_to_kernel(sr->addr, sr->addr_len, &kmsg->addr); + addr = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + addr_len = READ_ONCE(sqe->addr_len); + if (addr) { + ret = move_addr_to_kernel(addr, addr_len, &kmsg->addr); if (unlikely(ret < 0)) return ret; kmsg->msg.msg_name = &kmsg->addr; - kmsg->msg.msg_namelen = sr->addr_len; + kmsg->msg.msg_namelen = addr_len; } - if (!io_do_buffer_select(req)) { - ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len, - &kmsg->msg.msg_iter); - if (unlikely(ret < 0)) - return ret; + if (sr->flags & IORING_RECVSEND_FIXED_BUF) { + req->flags |= REQ_F_IMPORT_BUFFER; + return 0; } - return 0; + if (req->flags & REQ_F_BUFFER_SELECT) + return 0; + return import_ubuf(ITER_SOURCE, sr->buf, sr->len, &kmsg->msg.msg_iter); } -static int io_sendmsg_prep_setup(struct io_kiocb *req, int is_msg) +static int io_sendmsg_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_async_msghdr *kmsg; + struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); + struct io_async_msghdr *kmsg = req->async_data; + struct user_msghdr msg; int ret; - kmsg = io_msg_alloc_async(req); - if (unlikely(!kmsg)) - return -ENOMEM; - if (!is_msg) - return io_send_setup(req); - ret = io_sendmsg_copy_hdr(req, kmsg); - if (!ret) - req->flags |= REQ_F_NEED_CLEANUP; - return ret; + sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); + ret = io_msg_copy_hdr(req, kmsg, &msg, ITER_SOURCE, NULL); + if (unlikely(ret)) + return ret; + /* save msg_control as sys_sendmsg() overwrites it */ + sr->msg_control = kmsg->msg.msg_control_user; + + if (sr->flags & IORING_RECVSEND_FIXED_BUF) { + kmsg->msg.msg_iter.nr_segs = msg.msg_iovlen; + return io_prep_reg_iovec(req, &kmsg->vec, msg.msg_iov, + msg.msg_iovlen); + } + if (req->flags & REQ_F_BUFFER_SELECT) + return 0; + return io_net_import_vec(req, kmsg, msg.msg_iov, msg.msg_iovlen, ITER_SOURCE); } #define SENDMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECVSEND_BUNDLE) @@ -407,17 +402,7 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); sr->done_io = 0; - - if (req->opcode == IORING_OP_SEND) { - if (READ_ONCE(sqe->__pad3[0])) - return -EINVAL; - sr->addr = u64_to_user_ptr(READ_ONCE(sqe->addr2)); - sr->addr_len = READ_ONCE(sqe->addr_len); - } else if (sqe->addr2 || sqe->file_index) { - return -EINVAL; - } - - sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr)); + sr->retry_flags = 0; sr->len = READ_ONCE(sqe->len); sr->flags = READ_ONCE(sqe->ioprio); if (sr->flags & ~SENDMSG_FLAGS) @@ -425,27 +410,31 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; if (sr->msg_flags & MSG_DONTWAIT) req->flags |= REQ_F_NOWAIT; + if (req->flags & REQ_F_BUFFER_SELECT) + sr->buf_group = req->buf_index; if (sr->flags & IORING_RECVSEND_BUNDLE) { if (req->opcode == IORING_OP_SENDMSG) return -EINVAL; - if (!(req->flags & REQ_F_BUFFER_SELECT)) - return -EINVAL; sr->msg_flags |= MSG_WAITALL; - sr->buf_group = req->buf_index; req->buf_list = NULL; + req->flags |= REQ_F_MULTISHOT; } -#ifdef CONFIG_COMPAT - if (req->ctx->compat) + if (io_is_compat(req->ctx)) sr->msg_flags |= MSG_CMSG_COMPAT; -#endif - return 
io_sendmsg_prep_setup(req, req->opcode == IORING_OP_SENDMSG); + + if (unlikely(!io_msg_alloc_async(req))) + return -ENOMEM; + if (req->opcode != IORING_OP_SENDMSG) + return io_send_setup(req, sqe); + if (unlikely(sqe->addr2 || sqe->file_index)) + return -EINVAL; + return io_sendmsg_setup(req, sqe); } static void io_req_msg_cleanup(struct io_kiocb *req, unsigned int issue_flags) { - req->flags &= ~REQ_F_NEED_CLEANUP; io_netmsg_recycle(req, issue_flags); } @@ -468,7 +457,7 @@ static int io_bundle_nbufs(struct io_async_msghdr *kmsg, int ret) if (iter_is_ubuf(&kmsg->msg.msg_iter)) return 1; - iov = kmsg->free_iov; + iov = kmsg->vec.iovec; if (!iov) iov = &kmsg->fast_iov; @@ -518,7 +507,7 @@ static inline bool io_send_finish(struct io_kiocb *req, int *ret, /* Otherwise stop bundle and use the current result. */ finish: io_req_set_res(req, *ret, cflags); - *ret = IOU_OK; + *ret = IOU_COMPLETE; return true; } @@ -569,7 +558,56 @@ int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) else if (sr->done_io) ret = sr->done_io; io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; +} + +static int io_send_select_buffer(struct io_kiocb *req, unsigned int issue_flags, + struct io_async_msghdr *kmsg) +{ + struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); + + int ret; + struct buf_sel_arg arg = { + .iovs = &kmsg->fast_iov, + .max_len = min_not_zero(sr->len, INT_MAX), + .nr_iovs = 1, + .buf_group = sr->buf_group, + }; + + if (kmsg->vec.iovec) { + arg.nr_iovs = kmsg->vec.nr; + arg.iovs = kmsg->vec.iovec; + arg.mode = KBUF_MODE_FREE; + } + + if (!(sr->flags & IORING_RECVSEND_BUNDLE)) + arg.nr_iovs = 1; + else + arg.mode |= KBUF_MODE_EXPAND; + + ret = io_buffers_select(req, &arg, issue_flags); + if (unlikely(ret < 0)) + return ret; + + if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->vec.iovec) { + kmsg->vec.nr = ret; + kmsg->vec.iovec = arg.iovs; + req->flags |= REQ_F_NEED_CLEANUP; + } + sr->len = arg.out_len; + + if (ret == 1) { + sr->buf = arg.iovs[0].iov_base; + ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len, + &kmsg->msg.msg_iter); + if (unlikely(ret)) + return ret; + } else { + iov_iter_init(&kmsg->msg.msg_iter, ITER_SOURCE, + arg.iovs, ret, arg.out_len); + } + + return 0; } int io_send(struct io_kiocb *req, unsigned int issue_flags) @@ -595,44 +633,9 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags) retry_bundle: if (io_do_buffer_select(req)) { - struct buf_sel_arg arg = { - .iovs = &kmsg->fast_iov, - .max_len = min_not_zero(sr->len, INT_MAX), - .nr_iovs = 1, - }; - - if (kmsg->free_iov) { - arg.nr_iovs = kmsg->free_iov_nr; - arg.iovs = kmsg->free_iov; - arg.mode = KBUF_MODE_FREE; - } - - if (!(sr->flags & IORING_RECVSEND_BUNDLE)) - arg.nr_iovs = 1; - else - arg.mode |= KBUF_MODE_EXPAND; - - ret = io_buffers_select(req, &arg, issue_flags); - if (unlikely(ret < 0)) + ret = io_send_select_buffer(req, issue_flags, kmsg); + if (ret) return ret; - - if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->free_iov) { - kmsg->free_iov_nr = ret; - kmsg->free_iov = arg.iovs; - req->flags |= REQ_F_NEED_CLEANUP; - } - sr->len = arg.out_len; - - if (ret == 1) { - sr->buf = arg.iovs[0].iov_base; - ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len, - &kmsg->msg.msg_iter); - if (unlikely(ret)) - return ret; - } else { - iov_iter_init(&kmsg->msg.msg_iter, ITER_SOURCE, - arg.iovs, ret, arg.out_len); - } } /* @@ -703,34 +706,16 @@ static int io_recvmsg_copy_hdr(struct io_kiocb *req, struct user_msghdr msg; int ret; - iomsg->msg.msg_name = &iomsg->addr; - 
iomsg->msg.msg_iter.nr_segs = 0; - -#ifdef CONFIG_COMPAT - if (unlikely(req->ctx->compat)) { - struct compat_msghdr cmsg; - - ret = io_compat_msg_copy_hdr(req, iomsg, &cmsg, ITER_DEST); - if (unlikely(ret)) - return ret; + ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_DEST, &iomsg->uaddr); + if (unlikely(ret)) + return ret; - ret = __get_compat_msghdr(&iomsg->msg, &cmsg, &iomsg->uaddr); + if (!(req->flags & REQ_F_BUFFER_SELECT)) { + ret = io_net_import_vec(req, iomsg, msg.msg_iov, msg.msg_iovlen, + ITER_DEST); if (unlikely(ret)) return ret; - - return io_recvmsg_mshot_prep(req, iomsg, cmsg.msg_namelen, - cmsg.msg_controllen); } -#endif - - ret = io_msg_copy_hdr(req, iomsg, &msg, ITER_DEST); - if (unlikely(ret)) - return ret; - - ret = __copy_msghdr(&iomsg->msg, &msg, &iomsg->uaddr); - if (unlikely(ret)) - return ret; - return io_recvmsg_mshot_prep(req, iomsg, msg.msg_namelen, msg.msg_controllen); } @@ -739,7 +724,6 @@ static int io_recvmsg_prep_setup(struct io_kiocb *req) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); struct io_async_msghdr *kmsg; - int ret; kmsg = io_msg_alloc_async(req); if (unlikely(!kmsg)) @@ -748,25 +732,20 @@ static int io_recvmsg_prep_setup(struct io_kiocb *req) if (req->opcode == IORING_OP_RECV) { kmsg->msg.msg_name = NULL; kmsg->msg.msg_namelen = 0; + kmsg->msg.msg_inq = 0; kmsg->msg.msg_control = NULL; kmsg->msg.msg_get_inq = 1; kmsg->msg.msg_controllen = 0; kmsg->msg.msg_iocb = NULL; kmsg->msg.msg_ubuf = NULL; - if (!io_do_buffer_select(req)) { - ret = import_ubuf(ITER_DEST, sr->buf, sr->len, - &kmsg->msg.msg_iter); - if (unlikely(ret)) - return ret; - } - return 0; + if (req->flags & REQ_F_BUFFER_SELECT) + return 0; + return import_ubuf(ITER_DEST, sr->buf, sr->len, + &kmsg->msg.msg_iter); } - ret = io_recvmsg_copy_hdr(req, kmsg); - if (!ret) - req->flags |= REQ_F_NEED_CLEANUP; - return ret; + return io_recvmsg_copy_hdr(req, kmsg); } #define RECVMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT | \ @@ -777,6 +756,7 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); sr->done_io = 0; + sr->retry_flags = 0; if (unlikely(sqe->file_index || sqe->addr2)) return -EINVAL; @@ -817,14 +797,16 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return -EINVAL; } -#ifdef CONFIG_COMPAT - if (req->ctx->compat) + if (io_is_compat(req->ctx)) sr->msg_flags |= MSG_CMSG_COMPAT; -#endif + sr->nr_multishot_loops = 0; return io_recvmsg_prep_setup(req); } +/* bits to clear in old and inherit in new cflags on bundle retry */ +#define CQE_F_MASK (IORING_CQE_F_SOCK_NONEMPTY|IORING_CQE_F_MORE) + /* * Finishes io_recv and io_recvmsg. 
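 * (Editor's note, not part of this patch: with the bundle-retry change in
 *  the hunk below, a bundled receive that consumed all of its selected
 *  buffers (the iterator is empty) while msg_inq still reports pending
 *  data is retried rather than completed; bytes received so far accumulate
 *  in sr->done_io, IO_SR_MSG_RETRY is set, and CQE_F_MASK keeps
 *  IORING_CQE_F_SOCK_NONEMPTY/IORING_CQE_F_MORE consistent across the
 *  appended result.)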
* @@ -842,11 +824,27 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, cflags |= IORING_CQE_F_SOCK_NONEMPTY; if (sr->flags & IORING_RECVSEND_BUNDLE) { - cflags |= io_put_kbufs(req, *ret, io_bundle_nbufs(kmsg, *ret), + size_t this_ret = *ret - sr->done_io; + + cflags |= io_put_kbufs(req, this_ret, io_bundle_nbufs(kmsg, this_ret), issue_flags); + if (sr->retry_flags & IO_SR_MSG_RETRY) + cflags = req->cqe.flags | (cflags & CQE_F_MASK); /* bundle with no more immediate buffers, we're done */ if (req->flags & REQ_F_BL_EMPTY) goto finish; + /* + * If more is available AND it was a full transfer, retry and + * append to this one + */ + if (!sr->retry_flags && kmsg->msg.msg_inq > 1 && this_ret > 0 && + !iov_iter_count(&kmsg->msg.msg_iter)) { + req->cqe.flags = cflags & ~CQE_F_MASK; + sr->len = kmsg->msg.msg_inq; + sr->done_io += this_ret; + sr->retry_flags |= IO_SR_MSG_RETRY; + return false; + } } else { cflags |= io_put_kbuf(req, *ret, issue_flags); } @@ -857,8 +855,7 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, */ if ((req->flags & REQ_F_APOLL_MULTISHOT) && !mshot_finished && io_req_post_cqe(req, *ret, cflags | IORING_CQE_F_MORE)) { - int mshot_retry_ret = IOU_ISSUE_SKIP_COMPLETE; - + *ret = IOU_RETRY; io_mshot_prep_retry(req, kmsg); /* Known not-empty or unknown state, retry */ if (cflags & IORING_CQE_F_SOCK_NONEMPTY || kmsg->msg.msg_inq < 0) { @@ -866,23 +863,16 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret, return false; /* mshot retries exceeded, force a requeue */ sr->nr_multishot_loops = 0; - mshot_retry_ret = IOU_REQUEUE; + if (issue_flags & IO_URING_F_MULTISHOT) + *ret = IOU_REQUEUE; } - if (issue_flags & IO_URING_F_MULTISHOT) - *ret = mshot_retry_ret; - else - *ret = -EAGAIN; return true; } /* Finish the request / stop multishot. 
*/ finish: io_req_set_res(req, *ret, cflags); - - if (issue_flags & IO_URING_F_MULTISHOT) - *ret = IOU_STOP_MULTISHOT; - else - *ret = IOU_OK; + *ret = IOU_COMPLETE; io_req_msg_cleanup(req, issue_flags); return true; } @@ -998,7 +988,7 @@ retry_multishot: void __user *buf; size_t len = sr->len; - buf = io_buffer_select(req, &len, issue_flags); + buf = io_buffer_select(req, &len, sr->buf_group, issue_flags); if (!buf) return -ENOBUFS; @@ -1029,16 +1019,15 @@ retry_multishot: if (ret < min_ret) { if (ret == -EAGAIN && force_nonblock) { - if (issue_flags & IO_URING_F_MULTISHOT) { + if (issue_flags & IO_URING_F_MULTISHOT) io_kbuf_recycle(req, issue_flags); - return IOU_ISSUE_SKIP_COMPLETE; - } - return -EAGAIN; + + return IOU_RETRY; } if (ret > 0 && io_net_retry(sock, flags)) { sr->done_io += ret; req->flags |= REQ_F_BL_NO_RECYCLE; - return -EAGAIN; + return IOU_RETRY; } if (ret == -ERESTARTSYS) ret = -EINTR; @@ -1077,21 +1066,30 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg .iovs = &kmsg->fast_iov, .nr_iovs = 1, .mode = KBUF_MODE_EXPAND, + .buf_group = sr->buf_group, }; - if (kmsg->free_iov) { - arg.nr_iovs = kmsg->free_iov_nr; - arg.iovs = kmsg->free_iov; + if (kmsg->vec.iovec) { + arg.nr_iovs = kmsg->vec.nr; + arg.iovs = kmsg->vec.iovec; arg.mode |= KBUF_MODE_FREE; } - if (kmsg->msg.msg_inq > 0) + if (kmsg->msg.msg_inq > 1) arg.max_len = min_not_zero(sr->len, kmsg->msg.msg_inq); ret = io_buffers_peek(req, &arg); if (unlikely(ret < 0)) return ret; + if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->vec.iovec) { + kmsg->vec.nr = ret; + kmsg->vec.iovec = arg.iovs; + req->flags |= REQ_F_NEED_CLEANUP; + } + if (arg.partial_map) + sr->retry_flags |= IO_SR_MSG_PARTIAL_MAP; + /* special case 1 vec, can be a fast path */ if (ret == 1) { sr->buf = arg.iovs[0].iov_base; @@ -1100,16 +1098,11 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg } iov_iter_init(&kmsg->msg.msg_iter, ITER_DEST, arg.iovs, ret, arg.out_len); - if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->free_iov) { - kmsg->free_iov_nr = ret; - kmsg->free_iov = arg.iovs; - req->flags |= REQ_F_NEED_CLEANUP; - } } else { void __user *buf; *len = sr->len; - buf = io_buffer_select(req, len, issue_flags); + buf = io_buffer_select(req, len, sr->buf_group, issue_flags); if (!buf) return -ENOBUFS; sr->buf = buf; @@ -1166,12 +1159,10 @@ retry_multishot: ret = sock_recvmsg(sock, &kmsg->msg, flags); if (ret < min_ret) { if (ret == -EAGAIN && force_nonblock) { - if (issue_flags & IO_URING_F_MULTISHOT) { + if (issue_flags & IO_URING_F_MULTISHOT) io_kbuf_recycle(req, issue_flags); - return IOU_ISSUE_SKIP_COMPLETE; - } - return -EAGAIN; + return IOU_RETRY; } if (ret > 0 && io_net_retry(sock, flags)) { sr->len -= ret; @@ -1202,6 +1193,71 @@ out_free: return ret; } +int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc); + unsigned ifq_idx; + + if (unlikely(sqe->addr2 || sqe->addr || sqe->addr3)) + return -EINVAL; + + ifq_idx = READ_ONCE(sqe->zcrx_ifq_idx); + zc->ifq = xa_load(&req->ctx->zcrx_ctxs, ifq_idx); + if (!zc->ifq) + return -EINVAL; + + zc->len = READ_ONCE(sqe->len); + zc->flags = READ_ONCE(sqe->ioprio); + zc->msg_flags = READ_ONCE(sqe->msg_flags); + if (zc->msg_flags) + return -EINVAL; + if (zc->flags & ~(IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT)) + return -EINVAL; + /* multishot required */ + if (!(zc->flags & IORING_RECV_MULTISHOT)) + return -EINVAL; + /* All data 
completions are posted as aux CQEs. */ + req->flags |= REQ_F_APOLL_MULTISHOT; + + return 0; +} + +int io_recvzc(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_recvzc *zc = io_kiocb_to_cmd(req, struct io_recvzc); + struct socket *sock; + unsigned int len; + int ret; + + if (!(req->flags & REQ_F_POLLED) && + (zc->flags & IORING_RECVSEND_POLL_FIRST)) + return -EAGAIN; + + sock = sock_from_file(req->file); + if (unlikely(!sock)) + return -ENOTSOCK; + + len = zc->len; + ret = io_zcrx_recv(req, zc->ifq, sock, zc->msg_flags | MSG_DONTWAIT, + issue_flags, &zc->len); + if (len && zc->len == 0) { + io_req_set_res(req, 0, 0); + + return IOU_COMPLETE; + } + if (unlikely(ret <= 0) && ret != -EAGAIN) { + if (ret == -ERESTARTSYS) + ret = -EINTR; + if (ret == IOU_REQUEUE) + return IOU_REQUEUE; + + req_set_fail(req); + io_req_set_res(req, ret, 0); + return IOU_COMPLETE; + } + return IOU_RETRY; +} + void io_send_zc_cleanup(struct io_kiocb *req) { struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg); @@ -1222,10 +1278,12 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg); struct io_ring_ctx *ctx = req->ctx; + struct io_async_msghdr *iomsg; struct io_kiocb *notif; + int ret; zc->done_io = 0; - req->flags |= REQ_F_POLL_NO_LAZY; + zc->retry_flags = 0; if (unlikely(READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3))) return -EINVAL; @@ -1239,7 +1297,7 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) notif->cqe.user_data = req->cqe.user_data; notif->cqe.res = 0; notif->cqe.flags = IORING_CQE_F_NOTIF; - req->flags |= REQ_F_NEED_CLEANUP; + req->flags |= REQ_F_NEED_CLEANUP | REQ_F_POLL_NO_LAZY; zc->flags = READ_ONCE(sqe->ioprio); if (unlikely(zc->flags & ~IO_ZC_FLAGS_COMMON)) { @@ -1254,39 +1312,35 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) } } - if (zc->flags & IORING_RECVSEND_FIXED_BUF) { - unsigned idx = READ_ONCE(sqe->buf_index); + zc->len = READ_ONCE(sqe->len); + zc->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL | MSG_ZEROCOPY; + req->buf_index = READ_ONCE(sqe->buf_index); + if (zc->msg_flags & MSG_DONTWAIT) + req->flags |= REQ_F_NOWAIT; + + if (io_is_compat(req->ctx)) + zc->msg_flags |= MSG_CMSG_COMPAT; - if (unlikely(idx >= ctx->nr_user_bufs)) - return -EFAULT; - idx = array_index_nospec(idx, ctx->nr_user_bufs); - req->imu = READ_ONCE(ctx->user_bufs[idx]); - io_req_set_rsrc_node(notif, ctx, 0); - } + iomsg = io_msg_alloc_async(req); + if (unlikely(!iomsg)) + return -ENOMEM; if (req->opcode == IORING_OP_SEND_ZC) { - if (READ_ONCE(sqe->__pad3[0])) - return -EINVAL; - zc->addr = u64_to_user_ptr(READ_ONCE(sqe->addr2)); - zc->addr_len = READ_ONCE(sqe->addr_len); + ret = io_send_setup(req, sqe); } else { if (unlikely(sqe->addr2 || sqe->file_index)) return -EINVAL; - if (unlikely(zc->flags & IORING_RECVSEND_FIXED_BUF)) - return -EINVAL; + ret = io_sendmsg_setup(req, sqe); } + if (unlikely(ret)) + return ret; - zc->buf = u64_to_user_ptr(READ_ONCE(sqe->addr)); - zc->len = READ_ONCE(sqe->len); - zc->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL | MSG_ZEROCOPY; - if (zc->msg_flags & MSG_DONTWAIT) - req->flags |= REQ_F_NOWAIT; - -#ifdef CONFIG_COMPAT - if (req->ctx->compat) - zc->msg_flags |= MSG_CMSG_COMPAT; -#endif - return io_sendmsg_prep_setup(req, req->opcode == IORING_OP_SENDMSG_ZC); + if (!(zc->flags & IORING_RECVSEND_FIXED_BUF)) { + iomsg->msg.sg_from_iter = io_sg_from_iter_iovec; + return io_notif_account_mem(zc->notif, 
iomsg->msg.msg_iter.count); + } + iomsg->msg.sg_from_iter = io_sg_from_iter; + return 0; } static int io_sg_from_iter_iovec(struct sk_buff *skb, @@ -1339,28 +1393,17 @@ static int io_sg_from_iter(struct sk_buff *skb, return ret; } -static int io_send_zc_import(struct io_kiocb *req, struct io_async_msghdr *kmsg) +static int io_send_zc_import(struct io_kiocb *req, unsigned int issue_flags) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); - int ret; + struct io_async_msghdr *kmsg = req->async_data; - if (sr->flags & IORING_RECVSEND_FIXED_BUF) { - ret = io_import_fixed(ITER_SOURCE, &kmsg->msg.msg_iter, req->imu, - (u64)(uintptr_t)sr->buf, sr->len); - if (unlikely(ret)) - return ret; - kmsg->msg.sg_from_iter = io_sg_from_iter; - } else { - ret = import_ubuf(ITER_SOURCE, sr->buf, sr->len, &kmsg->msg.msg_iter); - if (unlikely(ret)) - return ret; - ret = io_notif_account_mem(sr->notif, sr->len); - if (unlikely(ret)) - return ret; - kmsg->msg.sg_from_iter = io_sg_from_iter_iovec; - } + WARN_ON_ONCE(!(sr->flags & IORING_RECVSEND_FIXED_BUF)); - return ret; + sr->notif->buf_index = req->buf_index; + return io_import_reg_buf(sr->notif, &kmsg->msg.msg_iter, + (u64)(uintptr_t)sr->buf, sr->len, + ITER_SOURCE, issue_flags); } int io_send_zc(struct io_kiocb *req, unsigned int issue_flags) @@ -1381,8 +1424,9 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags) (zc->flags & IORING_RECVSEND_POLL_FIRST)) return -EAGAIN; - if (!zc->done_io) { - ret = io_send_zc_import(req, kmsg); + if (req->flags & REQ_F_IMPORT_BUFFER) { + req->flags &= ~REQ_F_IMPORT_BUFFER; + ret = io_send_zc_import(req, issue_flags); if (unlikely(ret)) return ret; } @@ -1425,10 +1469,11 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags) */ if (!(issue_flags & IO_URING_F_UNLOCKED)) { io_notif_flush(zc->notif); + zc->notif = NULL; io_req_msg_cleanup(req, 0); } io_req_set_res(req, ret, IORING_CQE_F_MORE); - return IOU_OK; + return IOU_COMPLETE; } int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags) @@ -1439,6 +1484,17 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags) unsigned flags; int ret, min_ret = 0; + if (req->flags & REQ_F_IMPORT_BUFFER) { + unsigned uvec_segs = kmsg->msg.msg_iter.nr_segs; + int ret; + + ret = io_import_reg_vec(ITER_SOURCE, &kmsg->msg.msg_iter, req, + &kmsg->vec, uvec_segs, issue_flags); + if (unlikely(ret)) + return ret; + req->flags &= ~REQ_F_IMPORT_BUFFER; + } + sock = sock_from_file(req->file); if (unlikely(!sock)) return -ENOTSOCK; @@ -1457,7 +1513,6 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags) kmsg->msg.msg_control_user = sr->msg_control; kmsg->msg.msg_ubuf = &io_notif_to_data(sr->notif)->uarg; - kmsg->msg.sg_from_iter = io_sg_from_iter_iovec; ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags); if (unlikely(ret < min_ret)) { @@ -1485,10 +1540,11 @@ int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags) */ if (!(issue_flags & IO_URING_F_UNLOCKED)) { io_notif_flush(sr->notif); + sr->notif = NULL; io_req_msg_cleanup(req, 0); } io_req_set_res(req, ret, IORING_CQE_F_MORE); - return IOU_OK; + return IOU_COMPLETE; } void io_sendrecv_fail(struct io_kiocb *req) @@ -1571,19 +1627,11 @@ retry: put_unused_fd(fd); ret = PTR_ERR(file); if (ret == -EAGAIN && force_nonblock && - !(accept->iou_flags & IORING_ACCEPT_DONTWAIT)) { - /* - * if it's multishot and polled, we don't need to - * return EAGAIN to arm the poll infra since it - * has already been done - */ - if (issue_flags & IO_URING_F_MULTISHOT) - return 
IOU_ISSUE_SKIP_COMPLETE; - return ret; - } + !(accept->iou_flags & IORING_ACCEPT_DONTWAIT)) + return IOU_RETRY; + if (ret == -ERESTARTSYS) ret = -EINTR; - req_set_fail(req); } else if (!fixed) { fd_install(fd, file); ret = fd; @@ -1596,23 +1644,17 @@ retry: if (!arg.is_empty) cflags |= IORING_CQE_F_SOCK_NONEMPTY; - if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { - io_req_set_res(req, ret, cflags); - return IOU_OK; - } - - if (ret < 0) - return ret; - if (io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) { + if (ret >= 0 && (req->flags & REQ_F_APOLL_MULTISHOT) && + io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) { if (cflags & IORING_CQE_F_SOCK_NONEMPTY || arg.is_empty == -1) goto retry; - if (issue_flags & IO_URING_F_MULTISHOT) - return IOU_ISSUE_SKIP_COMPLETE; - return -EAGAIN; + return IOU_RETRY; } io_req_set_res(req, ret, cflags); - return IOU_STOP_MULTISHOT; + if (ret < 0) + req_set_fail(req); + return IOU_COMPLETE; } int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -1666,7 +1708,7 @@ int io_socket(struct io_kiocb *req, unsigned int issue_flags) sock->file_slot); } io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -1696,6 +1738,11 @@ int io_connect(struct io_kiocb *req, unsigned int issue_flags) int ret; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; + if (unlikely(req->flags & REQ_F_FAIL)) { + ret = -ECONNRESET; + goto out; + } + file_flags = force_nonblock ? O_NONBLOCK : 0; ret = __sys_connect_file(req->file, &io->addr, connect->addr_len, @@ -1728,7 +1775,7 @@ out: req_set_fail(req); io_req_msg_cleanup(req, issue_flags); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } int io_bind_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -1799,11 +1846,6 @@ void io_netmsg_cache_free(const void *entry) { struct io_async_msghdr *kmsg = (struct io_async_msghdr *) entry; - if (kmsg->free_iov) { - kasan_mempool_unpoison_object(kmsg->free_iov, - kmsg->free_iov_nr * sizeof(struct iovec)); - io_netmsg_iovec_free(kmsg); - } + io_vec_free(&kmsg->vec); kfree(kmsg); } -#endif diff --git a/io_uring/net.h b/io_uring/net.h index 52bfee05f06a..43e5ce5416b7 100644 --- a/io_uring/net.h +++ b/io_uring/net.h @@ -2,19 +2,23 @@ #include <linux/net.h> #include <linux/uio.h> +#include <linux/io_uring_types.h> struct io_async_msghdr { #if defined(CONFIG_NET) - struct iovec fast_iov; - /* points to an allocated iov, if NULL we use fast_iov instead */ - struct iovec *free_iov; - int free_iov_nr; - int namelen; - __kernel_size_t controllen; - __kernel_size_t payloadlen; - struct sockaddr __user *uaddr; - struct msghdr msg; - struct sockaddr_storage addr; + struct iou_vec vec; + + struct_group(clear, + int namelen; + struct iovec fast_iov; + __kernel_size_t controllen; + __kernel_size_t payloadlen; + struct sockaddr __user *uaddr; + struct msghdr msg; + struct sockaddr_storage addr; + ); +#else + struct_group(clear); #endif }; diff --git a/io_uring/nop.c b/io_uring/nop.c index a5bcf3d6984f..6ac2de761fd3 100644 --- a/io_uring/nop.c +++ b/io_uring/nop.c @@ -8,36 +8,65 @@ #include <uapi/linux/io_uring.h> #include "io_uring.h" +#include "rsrc.h" #include "nop.h" struct io_nop { /* NOTE: kiocb has the file as the first member, so don't do it here */ struct file *file; int result; + int fd; + unsigned int flags; }; +#define NOP_FLAGS (IORING_NOP_INJECT_RESULT | IORING_NOP_FIXED_FILE | \ + IORING_NOP_FIXED_BUFFER | IORING_NOP_FILE) + int 
io_nop_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - unsigned int flags; struct io_nop *nop = io_kiocb_to_cmd(req, struct io_nop); - flags = READ_ONCE(sqe->nop_flags); - if (flags & ~IORING_NOP_INJECT_RESULT) + nop->flags = READ_ONCE(sqe->nop_flags); + if (nop->flags & ~NOP_FLAGS) return -EINVAL; - if (flags & IORING_NOP_INJECT_RESULT) + if (nop->flags & IORING_NOP_INJECT_RESULT) nop->result = READ_ONCE(sqe->len); else nop->result = 0; + if (nop->flags & IORING_NOP_FILE) + nop->fd = READ_ONCE(sqe->fd); + else + nop->fd = -1; + if (nop->flags & IORING_NOP_FIXED_BUFFER) + req->buf_index = READ_ONCE(sqe->buf_index); return 0; } int io_nop(struct io_kiocb *req, unsigned int issue_flags) { struct io_nop *nop = io_kiocb_to_cmd(req, struct io_nop); + int ret = nop->result; - if (nop->result < 0) + if (nop->flags & IORING_NOP_FILE) { + if (nop->flags & IORING_NOP_FIXED_FILE) { + req->file = io_file_get_fixed(req, nop->fd, issue_flags); + req->flags |= REQ_F_FIXED_FILE; + } else { + req->file = io_file_get_normal(req, nop->fd); + } + if (!req->file) { + ret = -EBADF; + goto done; + } + } + if (nop->flags & IORING_NOP_FIXED_BUFFER) { + if (!io_find_buf_node(req, issue_flags)) + ret = -EFAULT; + } +done: + if (ret < 0) req_set_fail(req); io_req_set_res(req, nop->result, 0); - return IOU_OK; + return IOU_COMPLETE; } diff --git a/io_uring/notif.c b/io_uring/notif.c index 28859ae3ee6e..9a6f6e92d742 100644 --- a/io_uring/notif.c +++ b/io_uring/notif.c @@ -11,7 +11,7 @@ static const struct ubuf_info_ops io_ubuf_ops; -static void io_notif_tw_complete(struct io_kiocb *notif, struct io_tw_state *ts) +static void io_notif_tw_complete(struct io_kiocb *notif, io_tw_token_t tw) { struct io_notif_data *nd = io_notif_to_data(notif); @@ -29,7 +29,7 @@ static void io_notif_tw_complete(struct io_kiocb *notif, struct io_tw_state *ts) } nd = nd->next; - io_req_task_complete(notif, ts); + io_req_task_complete(notif, tw); } while (nd); } @@ -89,7 +89,7 @@ static int io_link_skb(struct sk_buff *skb, struct ubuf_info *uarg) /* make sure all noifications can be finished in the same task_work */ if (unlikely(notif->ctx != prev_notif->ctx || - notif->task != prev_notif->task)) + notif->tctx != prev_notif->tctx)) return -EEXIST; nd->head = prev_nd->head; @@ -112,12 +112,14 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx) if (unlikely(!io_alloc_req(ctx, ¬if))) return NULL; + notif->ctx = ctx; notif->opcode = IORING_OP_NOP; notif->flags = 0; notif->file = NULL; - notif->task = current; + notif->tctx = current->io_uring; io_get_task_refs(1); - notif->rsrc_node = NULL; + notif->file_node = NULL; + notif->buf_node = NULL; nd = io_notif_to_data(notif); nd->zc_report = false; diff --git a/io_uring/opdef.c b/io_uring/opdef.c index a2be3bbca5ff..6de6229207a8 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -7,6 +7,7 @@ #include <linux/fs.h> #include <linux/file.h> #include <linux/io_uring.h> +#include <linux/io_uring/cmd.h> #include "io_uring.h" #include "opdef.h" @@ -36,6 +37,7 @@ #include "waitid.h" #include "futex.h" #include "truncate.h" +#include "zcrx.h" static int io_no_issue(struct io_kiocb *req, unsigned int issue_flags) { @@ -103,7 +105,7 @@ const struct io_issue_def io_issue_defs[] = { .iopoll_queue = 1, .async_size = sizeof(struct io_async_rw), .prep = io_prep_read_fixed, - .issue = io_read, + .issue = io_read_fixed, }, [IORING_OP_WRITE_FIXED] = { .needs_file = 1, @@ -117,7 +119,7 @@ const struct io_issue_def io_issue_defs[] = { .iopoll_queue = 1, .async_size = sizeof(struct io_async_rw), 
.prep = io_prep_write_fixed, - .issue = io_write, + .issue = io_write_fixed, }, [IORING_OP_POLL_ADD] = { .needs_file = 1, @@ -214,6 +216,7 @@ const struct io_issue_def io_issue_defs[] = { }, [IORING_OP_FALLOCATE] = { .needs_file = 1, + .hash_reg_file = 1, .prep = io_fallocate_prep, .issue = io_fallocate, }, @@ -331,13 +334,13 @@ const struct io_issue_def io_issue_defs[] = { .audit_skip = 1, .iopoll = 1, .prep = io_provide_buffers_prep, - .issue = io_provide_buffers, + .issue = io_manage_buffers_legacy, }, [IORING_OP_REMOVE_BUFFERS] = { .audit_skip = 1, .iopoll = 1, .prep = io_remove_buffers_prep, - .issue = io_remove_buffers, + .issue = io_manage_buffers_legacy, }, [IORING_OP_TEE] = { .needs_file = 1, @@ -414,7 +417,7 @@ const struct io_issue_def io_issue_defs[] = { .plug = 1, .iopoll = 1, .iopoll_queue = 1, - .async_size = 2 * sizeof(struct io_uring_sqe), + .async_size = sizeof(struct io_async_cmd), .prep = io_uring_cmd_prep, .issue = io_uring_cmd, }, @@ -515,6 +518,62 @@ const struct io_issue_def io_issue_defs[] = { .prep = io_eopnotsupp_prep, #endif }, + [IORING_OP_RECV_ZC] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollin = 1, + .ioprio = 1, +#if defined(CONFIG_NET) + .prep = io_recvzc_prep, + .issue = io_recvzc, +#else + .prep = io_eopnotsupp_prep, +#endif + }, + [IORING_OP_EPOLL_WAIT] = { + .needs_file = 1, + .audit_skip = 1, + .pollin = 1, +#if defined(CONFIG_EPOLL) + .prep = io_epoll_wait_prep, + .issue = io_epoll_wait, +#else + .prep = io_eopnotsupp_prep, +#endif + }, + [IORING_OP_READV_FIXED] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollin = 1, + .plug = 1, + .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, + .iopoll_queue = 1, + .vectored = 1, + .async_size = sizeof(struct io_async_rw), + .prep = io_prep_readv_fixed, + .issue = io_read, + }, + [IORING_OP_WRITEV_FIXED] = { + .needs_file = 1, + .hash_reg_file = 1, + .unbound_nonreg_file = 1, + .pollout = 1, + .plug = 1, + .audit_skip = 1, + .ioprio = 1, + .iopoll = 1, + .iopoll_queue = 1, + .vectored = 1, + .async_size = sizeof(struct io_async_rw), + .prep = io_prep_writev_fixed, + .issue = io_write, + }, + [IORING_OP_PIPE] = { + .prep = io_pipe_prep, + .issue = io_pipe, + }, }; const struct io_cold_def io_cold_defs[] = { @@ -641,6 +700,7 @@ const struct io_cold_def io_cold_defs[] = { }, [IORING_OP_SPLICE] = { .name = "SPLICE", + .cleanup = io_splice_cleanup, }, [IORING_OP_PROVIDE_BUFFERS] = { .name = "PROVIDE_BUFFERS", @@ -650,6 +710,7 @@ const struct io_cold_def io_cold_defs[] = { }, [IORING_OP_TEE] = { .name = "TEE", + .cleanup = io_splice_cleanup, }, [IORING_OP_SHUTDOWN] = { .name = "SHUTDOWN", @@ -699,6 +760,7 @@ const struct io_cold_def io_cold_defs[] = { }, [IORING_OP_URING_CMD] = { .name = "URING_CMD", + .cleanup = io_uring_cmd_cleanup, }, [IORING_OP_SEND_ZC] = { .name = "SEND_ZC", @@ -742,6 +804,25 @@ const struct io_cold_def io_cold_defs[] = { [IORING_OP_LISTEN] = { .name = "LISTEN", }, + [IORING_OP_RECV_ZC] = { + .name = "RECV_ZC", + }, + [IORING_OP_EPOLL_WAIT] = { + .name = "EPOLL_WAIT", + }, + [IORING_OP_READV_FIXED] = { + .name = "READV_FIXED", + .cleanup = io_readv_writev_cleanup, + .fail = io_rw_fail, + }, + [IORING_OP_WRITEV_FIXED] = { + .name = "WRITEV_FIXED", + .cleanup = io_readv_writev_cleanup, + .fail = io_rw_fail, + }, + [IORING_OP_PIPE] = { + .name = "PIPE", + }, }; const char *io_uring_get_opcode(u8 opcode) diff --git a/io_uring/opdef.h b/io_uring/opdef.h index 14456436ff74..719a52104abe 100644 --- a/io_uring/opdef.h +++ b/io_uring/opdef.h @@ -7,6 +7,12 @@ struct io_issue_def { 
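	/* (Editor's note, not part of this patch: the opdef.h hunk below only
	 *  regroups the per-opcode capability bits -- ioprio, iopoll and
	 *  buffer_select move up next to plug -- with no behavioural change
	 *  intended.) */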
unsigned needs_file : 1; /* should block plug */ unsigned plug : 1; + /* supports ioprio */ + unsigned ioprio : 1; + /* supports iopoll */ + unsigned iopoll : 1; + /* op supports buffer selection */ + unsigned buffer_select : 1; /* hash wq insertion if file is a regular file */ unsigned hash_reg_file : 1; /* unbound wq insertion if file is a non-regular file */ @@ -15,14 +21,8 @@ struct io_issue_def { unsigned pollin : 1; unsigned pollout : 1; unsigned poll_exclusive : 1; - /* op supports buffer selection */ - unsigned buffer_select : 1; /* skip auditing */ unsigned audit_skip : 1; - /* supports ioprio */ - unsigned ioprio : 1; - /* supports iopoll */ - unsigned iopoll : 1; /* have to be put into the iopoll list */ unsigned iopoll_queue : 1; /* vectored opcode, set if 1) vectored, and 2) handler needs to know */ diff --git a/io_uring/openclose.c b/io_uring/openclose.c index e3357dfa14ca..83e36ad4e31b 100644 --- a/io_uring/openclose.c +++ b/io_uring/openclose.c @@ -6,6 +6,8 @@ #include <linux/fdtable.h> #include <linux/fsnotify.h> #include <linux/namei.h> +#include <linux/pipe_fs_i.h> +#include <linux/watch_queue.h> #include <linux/io_uring.h> #include <uapi/linux/io_uring.h> @@ -169,7 +171,7 @@ err: if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } int io_openat(struct io_kiocb *req, unsigned int issue_flags) @@ -257,7 +259,7 @@ err: if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } int io_install_fixed_fd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -300,5 +302,136 @@ int io_install_fixed_fd(struct io_kiocb *req, unsigned int issue_flags) if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; +} + +struct io_pipe { + struct file *file; + int __user *fds; + int flags; + int file_slot; + unsigned long nofile; +}; + +int io_pipe_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_pipe *p = io_kiocb_to_cmd(req, struct io_pipe); + + if (sqe->fd || sqe->off || sqe->addr3) + return -EINVAL; + + p->fds = u64_to_user_ptr(READ_ONCE(sqe->addr)); + p->flags = READ_ONCE(sqe->pipe_flags); + if (p->flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT | O_NOTIFICATION_PIPE)) + return -EINVAL; + + p->file_slot = READ_ONCE(sqe->file_index); + p->nofile = rlimit(RLIMIT_NOFILE); + return 0; +} + +static int io_pipe_fixed(struct io_kiocb *req, struct file **files, + unsigned int issue_flags) +{ + struct io_pipe *p = io_kiocb_to_cmd(req, struct io_pipe); + struct io_ring_ctx *ctx = req->ctx; + int ret, fds[2] = { -1, -1 }; + int slot = p->file_slot; + + if (p->flags & O_CLOEXEC) + return -EINVAL; + + io_ring_submit_lock(ctx, issue_flags); + + ret = __io_fixed_fd_install(ctx, files[0], slot); + if (ret < 0) + goto err; + fds[0] = ret; + files[0] = NULL; + + /* + * If a specific slot is given, next one will be used for + * the write side. 
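 * (Editor's note -- illustrative only, not part of this patch. Going by
 *  io_pipe_prep() above, userspace would drive IORING_OP_PIPE roughly as:
 *
 *      int fds[2];
 *      struct io_uring_sqe *sqe = get_sqe();   // however an SQE is obtained
 *      memset(sqe, 0, sizeof(*sqe));
 *      sqe->opcode = IORING_OP_PIPE;
 *      sqe->addr = (unsigned long)fds;         // pipe fds are written back here
 *      sqe->pipe_flags = O_NONBLOCK;           // O_CLOEXEC/O_DIRECT also allowed
 *      sqe->file_index = 0;                    // non-zero selects fixed file slots
 *
 *  where a non-zero file_index installs the pipe ends into the fixed file
 *  table, with IORING_FILE_INDEX_ALLOC letting the kernel pick the slots,
 *  instead of returning normal fds.)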
+ */ + if (slot != IORING_FILE_INDEX_ALLOC) + slot++; + + ret = __io_fixed_fd_install(ctx, files[1], slot); + if (ret < 0) + goto err; + fds[1] = ret; + files[1] = NULL; + + io_ring_submit_unlock(ctx, issue_flags); + + if (!copy_to_user(p->fds, fds, sizeof(fds))) + return 0; + + ret = -EFAULT; + io_ring_submit_lock(ctx, issue_flags); +err: + if (fds[0] != -1) + io_fixed_fd_remove(ctx, fds[0]); + if (fds[1] != -1) + io_fixed_fd_remove(ctx, fds[1]); + io_ring_submit_unlock(ctx, issue_flags); + return ret; +} + +static int io_pipe_fd(struct io_kiocb *req, struct file **files) +{ + struct io_pipe *p = io_kiocb_to_cmd(req, struct io_pipe); + int ret, fds[2] = { -1, -1 }; + + ret = __get_unused_fd_flags(p->flags, p->nofile); + if (ret < 0) + goto err; + fds[0] = ret; + + ret = __get_unused_fd_flags(p->flags, p->nofile); + if (ret < 0) + goto err; + fds[1] = ret; + + if (!copy_to_user(p->fds, fds, sizeof(fds))) { + fd_install(fds[0], files[0]); + fd_install(fds[1], files[1]); + return 0; + } + ret = -EFAULT; +err: + if (fds[0] != -1) + put_unused_fd(fds[0]); + if (fds[1] != -1) + put_unused_fd(fds[1]); + return ret; +} + +int io_pipe(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_pipe *p = io_kiocb_to_cmd(req, struct io_pipe); + struct file *files[2]; + int ret; + + ret = create_pipe_files(files, p->flags); + if (ret) + return ret; + files[0]->f_mode |= FMODE_NOWAIT; + files[1]->f_mode |= FMODE_NOWAIT; + + if (!!p->file_slot) + ret = io_pipe_fixed(req, files, issue_flags); + else + ret = io_pipe_fd(req, files); + + io_req_set_res(req, ret, 0); + if (!ret) + return IOU_COMPLETE; + + req_set_fail(req); + if (files[0]) + fput(files[0]); + if (files[1]) + fput(files[1]); + return ret; } diff --git a/io_uring/openclose.h b/io_uring/openclose.h index 8a93c98ad0ad..4ca2a9935abc 100644 --- a/io_uring/openclose.h +++ b/io_uring/openclose.h @@ -13,5 +13,8 @@ int io_openat2(struct io_kiocb *req, unsigned int issue_flags); int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_close(struct io_kiocb *req, unsigned int issue_flags); +int io_pipe_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_pipe(struct io_kiocb *req, unsigned int issue_flags); + int io_install_fixed_fd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_install_fixed_fd(struct io_kiocb *req, unsigned int issue_flags); diff --git a/io_uring/poll.c b/io_uring/poll.c index 1f63b60e85e7..0526062e2f81 100644 --- a/io_uring/poll.c +++ b/io_uring/poll.c @@ -122,53 +122,12 @@ static void io_poll_req_insert(struct io_kiocb *req) { struct io_hash_table *table = &req->ctx->cancel_table; u32 index = hash_long(req->cqe.user_data, table->hash_bits); - struct io_hash_bucket *hb = &table->hbs[index]; - - spin_lock(&hb->lock); - hlist_add_head(&req->hash_node, &hb->list); - spin_unlock(&hb->lock); -} - -static void io_poll_req_delete(struct io_kiocb *req, struct io_ring_ctx *ctx) -{ - struct io_hash_table *table = &req->ctx->cancel_table; - u32 index = hash_long(req->cqe.user_data, table->hash_bits); - spinlock_t *lock = &table->hbs[index].lock; - - spin_lock(lock); - hash_del(&req->hash_node); - spin_unlock(lock); -} - -static void io_poll_req_insert_locked(struct io_kiocb *req) -{ - struct io_hash_table *table = &req->ctx->cancel_table_locked; - u32 index = hash_long(req->cqe.user_data, table->hash_bits); lockdep_assert_held(&req->ctx->uring_lock); hlist_add_head(&req->hash_node, &table->hbs[index].list); } -static void io_poll_tw_hash_eject(struct io_kiocb *req, struct io_tw_state 
*ts) -{ - struct io_ring_ctx *ctx = req->ctx; - - if (req->flags & REQ_F_HASH_LOCKED) { - /* - * ->cancel_table_locked is protected by ->uring_lock in - * contrast to per bucket spinlocks. Likely, tctx_task_work() - * already grabbed the mutex for us, but there is a chance it - * failed. - */ - io_tw_lock(ctx, ts); - hash_del(&req->hash_node); - req->flags &= ~REQ_F_HASH_LOCKED; - } else { - io_poll_req_delete(req, ctx); - } -} - static void io_init_poll_iocb(struct io_poll *poll, __poll_t events) { poll->head = NULL; @@ -261,12 +220,11 @@ static inline void io_poll_execute(struct io_kiocb *req, int res) * req->cqe.res. IOU_POLL_REMOVE_POLL_USE_RES indicates to remove multishot * poll and that the result is stored in req->cqe. */ -static int io_poll_check_events(struct io_kiocb *req, struct io_tw_state *ts) +static int io_poll_check_events(struct io_kiocb *req, io_tw_token_t tw) { int v; - /* req->task == current here, checking PF_EXITING is safe */ - if (unlikely(req->task->flags & PF_EXITING)) + if (unlikely(io_should_terminate_tw())) return -ECANCELED; do { @@ -315,6 +273,8 @@ static int io_poll_check_events(struct io_kiocb *req, struct io_tw_state *ts) return IOU_POLL_REISSUE; } } + if (unlikely(req->cqe.res & EPOLLERR)) + req_set_fail(req); if (req->apoll_events & EPOLLONESHOT) return IOU_POLL_DONE; @@ -328,12 +288,13 @@ static int io_poll_check_events(struct io_kiocb *req, struct io_tw_state *ts) return IOU_POLL_REMOVE_POLL_USE_RES; } } else { - int ret = io_poll_issue(req, ts); - if (ret == IOU_STOP_MULTISHOT) + int ret = io_poll_issue(req, tw); + + if (ret == IOU_COMPLETE) return IOU_POLL_REMOVE_POLL_USE_RES; else if (ret == IOU_REQUEUE) return IOU_POLL_REQUEUE; - if (ret < 0) + if (ret != IOU_RETRY && ret < 0) return ret; } @@ -351,19 +312,22 @@ static int io_poll_check_events(struct io_kiocb *req, struct io_tw_state *ts) return IOU_POLL_NO_ACTION; } -void io_poll_task_func(struct io_kiocb *req, struct io_tw_state *ts) +void io_poll_task_func(struct io_kiocb *req, io_tw_token_t tw) { int ret; - ret = io_poll_check_events(req, ts); + ret = io_poll_check_events(req, tw); if (ret == IOU_POLL_NO_ACTION) { + io_kbuf_recycle(req, 0); return; } else if (ret == IOU_POLL_REQUEUE) { + io_kbuf_recycle(req, 0); __io_poll_execute(req, 0); return; } io_poll_remove_entries(req); - io_poll_tw_hash_eject(req, ts); + /* task_work always has ->uring_lock held */ + hash_del(&req->hash_node); if (req->opcode == IORING_OP_POLL_ADD) { if (ret == IOU_POLL_DONE) { @@ -372,7 +336,7 @@ void io_poll_task_func(struct io_kiocb *req, struct io_tw_state *ts) poll = io_kiocb_to_cmd(req, struct io_poll); req->cqe.res = mangle_poll(req->cqe.res & poll->events); } else if (ret == IOU_POLL_REISSUE) { - io_req_task_submit(req, ts); + io_req_task_submit(req, tw); return; } else if (ret != IOU_POLL_REMOVE_POLL_USE_RES) { req->cqe.res = ret; @@ -380,14 +344,14 @@ void io_poll_task_func(struct io_kiocb *req, struct io_tw_state *ts) } io_req_set_res(req, req->cqe.res, 0); - io_req_task_complete(req, ts); + io_req_task_complete(req, tw); } else { - io_tw_lock(req->ctx, ts); + io_tw_lock(req->ctx, tw); if (ret == IOU_POLL_REMOVE_POLL_USE_RES) - io_req_task_complete(req, ts); + io_req_task_complete(req, tw); else if (ret == IOU_POLL_DONE || ret == IOU_POLL_REISSUE) - io_req_task_submit(req, ts); + io_req_task_submit(req, tw); else io_req_defer_failed(req, ret); } @@ -563,12 +527,13 @@ static bool io_poll_can_finish_inline(struct io_kiocb *req, return pt->owning || io_poll_get_ownership(req); } -static void 
io_poll_add_hash(struct io_kiocb *req) +static void io_poll_add_hash(struct io_kiocb *req, unsigned int issue_flags) { - if (req->flags & REQ_F_HASH_LOCKED) - io_poll_req_insert_locked(req); - else - io_poll_req_insert(req); + struct io_ring_ctx *ctx = req->ctx; + + io_ring_submit_lock(ctx, issue_flags); + io_poll_req_insert(req); + io_ring_submit_unlock(ctx, issue_flags); } /* @@ -605,11 +570,6 @@ static int __io_arm_poll_handler(struct io_kiocb *req, ipt->owning = issue_flags & IO_URING_F_UNLOCKED; atomic_set(&req->poll_refs, (int)ipt->owning); - /* io-wq doesn't hold uring_lock */ - if (issue_flags & IO_URING_F_UNLOCKED) - req->flags &= ~REQ_F_HASH_LOCKED; - - /* * Exclusive waits may only wake a limited amount of entries * rather than all of them, this may interfere with lazy @@ -638,7 +598,7 @@ static int __io_arm_poll_handler(struct io_kiocb *req, if (mask && ((poll->events & (EPOLLET|EPOLLONESHOT)) == (EPOLLET|EPOLLONESHOT))) { if (!io_poll_can_finish_inline(req, ipt)) { - io_poll_add_hash(req); + io_poll_add_hash(req, issue_flags); return 0; } io_poll_remove_entries(req); @@ -647,7 +607,7 @@ static int __io_arm_poll_handler(struct io_kiocb *req, return 1; } - io_poll_add_hash(req); + io_poll_add_hash(req, issue_flags); if (mask && (poll->events & EPOLLET) && io_poll_can_finish_inline(req, ipt)) { @@ -693,15 +653,12 @@ static struct async_poll *io_req_alloc_apoll(struct io_kiocb *req, if (req->flags & REQ_F_POLLED) { apoll = req->apoll; kfree(apoll->double_poll); - } else if (!(issue_flags & IO_URING_F_UNLOCKED)) { - apoll = io_alloc_cache_get(&ctx->apoll_cache); - if (!apoll) - goto alloc_apoll; - apoll->poll.retries = APOLL_MAX_RETRY; } else { -alloc_apoll: - apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC); - if (unlikely(!apoll)) + if (!(issue_flags & IO_URING_F_UNLOCKED)) + apoll = io_cache_alloc(&ctx->apoll_cache, GFP_ATOMIC); + else + apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC); + if (!apoll) return NULL; apoll->poll.retries = APOLL_MAX_RETRY; } @@ -720,12 +677,6 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) __poll_t mask = POLLPRI | POLLERR | EPOLLET; int ret; - /* - * apoll requests already grab the mutex to complete in the tw handler, - * so removal from the mutex-backed hash is free, use it by default. 
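 * (Editor's note, not part of this patch: this removed comment goes away
 *  together with REQ_F_HASH_LOCKED and the separate cancel_table_locked --
 *  after the poll.c changes in this diff the cancel hash is only touched
 *  under ->uring_lock, so the per-bucket spinlocks and the dual-table
 *  lookups below are dropped as well.)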
- */ - req->flags |= REQ_F_HASH_LOCKED; - if (!def->pollin && !def->pollout) return IO_APOLL_ABORTED; if (!io_file_can_poll(req)) @@ -761,58 +712,41 @@ int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags) return IO_APOLL_OK; } -static __cold bool io_poll_remove_all_table(struct task_struct *tsk, - struct io_hash_table *table, - bool cancel_all) +/* + * Returns true if we found and killed one or more poll requests + */ +__cold bool io_poll_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx, + bool cancel_all) { - unsigned nr_buckets = 1U << table->hash_bits; + unsigned nr_buckets = 1U << ctx->cancel_table.hash_bits; struct hlist_node *tmp; struct io_kiocb *req; bool found = false; int i; + lockdep_assert_held(&ctx->uring_lock); + for (i = 0; i < nr_buckets; i++) { - struct io_hash_bucket *hb = &table->hbs[i]; + struct io_hash_bucket *hb = &ctx->cancel_table.hbs[i]; - spin_lock(&hb->lock); hlist_for_each_entry_safe(req, tmp, &hb->list, hash_node) { - if (io_match_task_safe(req, tsk, cancel_all)) { + if (io_match_task_safe(req, tctx, cancel_all)) { hlist_del_init(&req->hash_node); io_poll_cancel_req(req); found = true; } } - spin_unlock(&hb->lock); } return found; } -/* - * Returns true if we found and killed one or more poll requests - */ -__cold bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, - bool cancel_all) - __must_hold(&ctx->uring_lock) -{ - bool ret; - - ret = io_poll_remove_all_table(tsk, &ctx->cancel_table, cancel_all); - ret |= io_poll_remove_all_table(tsk, &ctx->cancel_table_locked, cancel_all); - return ret; -} - static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only, - struct io_cancel_data *cd, - struct io_hash_table *table, - struct io_hash_bucket **out_bucket) + struct io_cancel_data *cd) { struct io_kiocb *req; - u32 index = hash_long(cd->data, table->hash_bits); - struct io_hash_bucket *hb = &table->hbs[index]; - - *out_bucket = NULL; + u32 index = hash_long(cd->data, ctx->cancel_table.hash_bits); + struct io_hash_bucket *hb = &ctx->cancel_table.hbs[index]; - spin_lock(&hb->lock); hlist_for_each_entry(req, &hb->list, hash_node) { if (cd->data != req->cqe.user_data) continue; @@ -822,35 +756,25 @@ static struct io_kiocb *io_poll_find(struct io_ring_ctx *ctx, bool poll_only, if (io_cancel_match_sequence(req, cd->seq)) continue; } - *out_bucket = hb; return req; } - spin_unlock(&hb->lock); return NULL; } static struct io_kiocb *io_poll_file_find(struct io_ring_ctx *ctx, - struct io_cancel_data *cd, - struct io_hash_table *table, - struct io_hash_bucket **out_bucket) + struct io_cancel_data *cd) { - unsigned nr_buckets = 1U << table->hash_bits; + unsigned nr_buckets = 1U << ctx->cancel_table.hash_bits; struct io_kiocb *req; int i; - *out_bucket = NULL; - for (i = 0; i < nr_buckets; i++) { - struct io_hash_bucket *hb = &table->hbs[i]; + struct io_hash_bucket *hb = &ctx->cancel_table.hbs[i]; - spin_lock(&hb->lock); hlist_for_each_entry(req, &hb->list, hash_node) { - if (io_cancel_req_match(req, cd)) { - *out_bucket = hb; + if (io_cancel_req_match(req, cd)) return req; - } } - spin_unlock(&hb->lock); } return NULL; } @@ -866,23 +790,21 @@ static int io_poll_disarm(struct io_kiocb *req) return 0; } -static int __io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, - struct io_hash_table *table) +static int __io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) { - struct io_hash_bucket *bucket; struct io_kiocb *req; if (cd->flags & (IORING_ASYNC_CANCEL_FD | IORING_ASYNC_CANCEL_OP 
| IORING_ASYNC_CANCEL_ANY)) - req = io_poll_file_find(ctx, cd, table, &bucket); + req = io_poll_file_find(ctx, cd); else - req = io_poll_find(ctx, false, cd, table, &bucket); + req = io_poll_find(ctx, false, cd); - if (req) + if (req) { io_poll_cancel_req(req); - if (bucket) - spin_unlock(&bucket->lock); - return req ? 0 : -ENOENT; + return 0; + } + return -ENOENT; } int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, @@ -890,12 +812,8 @@ int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, { int ret; - ret = __io_poll_cancel(ctx, cd, &ctx->cancel_table); - if (ret != -ENOENT) - return ret; - io_ring_submit_lock(ctx, issue_flags); - ret = __io_poll_cancel(ctx, cd, &ctx->cancel_table_locked); + ret = __io_poll_cancel(ctx, cd); io_ring_submit_unlock(ctx, issue_flags); return ret; } @@ -972,17 +890,10 @@ int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) ipt.pt._qproc = io_poll_queue_proc; - /* - * If sqpoll or single issuer, there is no contention for ->uring_lock - * and we'll end up holding it in tw handlers anyway. - */ - if (req->ctx->flags & (IORING_SETUP_SQPOLL|IORING_SETUP_SINGLE_ISSUER)) - req->flags |= REQ_F_HASH_LOCKED; - ret = __io_arm_poll_handler(req, poll, &ipt, poll->events, issue_flags); if (ret > 0) { io_req_set_res(req, ipt.result_mask, 0); - return IOU_OK; + return IOU_COMPLETE; } return ret ?: IOU_ISSUE_SKIP_COMPLETE; } @@ -992,32 +903,16 @@ int io_poll_remove(struct io_kiocb *req, unsigned int issue_flags) struct io_poll_update *poll_update = io_kiocb_to_cmd(req, struct io_poll_update); struct io_ring_ctx *ctx = req->ctx; struct io_cancel_data cd = { .ctx = ctx, .data = poll_update->old_user_data, }; - struct io_hash_bucket *bucket; struct io_kiocb *preq; int ret2, ret = 0; io_ring_submit_lock(ctx, issue_flags); - preq = io_poll_find(ctx, true, &cd, &ctx->cancel_table, &bucket); - ret2 = io_poll_disarm(preq); - if (bucket) - spin_unlock(&bucket->lock); - if (!ret2) - goto found; - if (ret2 != -ENOENT) { - ret = ret2; - goto out; - } - - preq = io_poll_find(ctx, true, &cd, &ctx->cancel_table_locked, &bucket); + preq = io_poll_find(ctx, true, &cd); ret2 = io_poll_disarm(preq); - if (bucket) - spin_unlock(&bucket->lock); if (ret2) { ret = ret2; goto out; } - -found: if (WARN_ON_ONCE(preq->opcode != IORING_OP_POLL_ADD)) { ret = -EFAULT; goto out; @@ -1053,5 +948,5 @@ out: } /* complete update request, we're done with it */ io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } diff --git a/io_uring/poll.h b/io_uring/poll.h index b0e3745f5a29..27e2db2ed4ae 100644 --- a/io_uring/poll.h +++ b/io_uring/poll.h @@ -1,5 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 +#include <linux/io_uring_types.h> + #define IO_POLL_ALLOC_CACHE_MAX 32 enum { @@ -40,7 +42,7 @@ struct io_cancel_data; int io_poll_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, unsigned issue_flags); int io_arm_poll_handler(struct io_kiocb *req, unsigned issue_flags); -bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, +bool io_poll_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx, bool cancel_all); -void io_poll_task_func(struct io_kiocb *req, struct io_tw_state *ts); +void io_poll_task_func(struct io_kiocb *req, io_tw_token_t tw); diff --git a/io_uring/refs.h b/io_uring/refs.h index 63982ead9f7d..0d928d87c4ed 100644 --- a/io_uring/refs.h +++ b/io_uring/refs.h @@ -17,6 +17,13 @@ static inline bool req_ref_inc_not_zero(struct io_kiocb *req) return atomic_inc_not_zero(&req->refs); } +static inline bool 
req_ref_put_and_test_atomic(struct io_kiocb *req) +{ + WARN_ON_ONCE(!(data_race(req->flags) & REQ_F_REFCOUNT)); + WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); + return atomic_dec_and_test(&req->refs); +} + static inline bool req_ref_put_and_test(struct io_kiocb *req) { if (likely(!(req->flags & REQ_F_REFCOUNT))) diff --git a/io_uring/register.c b/io_uring/register.c index eca26d4884d9..a59589249fce 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -28,6 +28,9 @@ #include "kbuf.h" #include "napi.h" #include "eventfd.h" +#include "msg_ring.h" +#include "memmap.h" +#include "zcrx.h" #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ IORING_REGISTER_LAST + IORING_OP_LAST) @@ -102,21 +105,13 @@ static int io_register_personality(struct io_ring_ctx *ctx) return id; } -static __cold int io_register_restrictions(struct io_ring_ctx *ctx, - void __user *arg, unsigned int nr_args) +static __cold int io_parse_restrictions(void __user *arg, unsigned int nr_args, + struct io_restriction *restrictions) { struct io_uring_restriction *res; size_t size; int i, ret; - /* Restrictions allowed only if rings started disabled */ - if (!(ctx->flags & IORING_SETUP_R_DISABLED)) - return -EBADFD; - - /* We allow only a single restrictions registration */ - if (ctx->restrictions.registered) - return -EBUSY; - if (!arg || nr_args > IORING_MAX_RESTRICTIONS) return -EINVAL; @@ -128,47 +123,57 @@ static __cold int io_register_restrictions(struct io_ring_ctx *ctx, if (IS_ERR(res)) return PTR_ERR(res); - ret = 0; + ret = -EINVAL; for (i = 0; i < nr_args; i++) { switch (res[i].opcode) { case IORING_RESTRICTION_REGISTER_OP: - if (res[i].register_op >= IORING_REGISTER_LAST) { - ret = -EINVAL; - goto out; - } - - __set_bit(res[i].register_op, - ctx->restrictions.register_op); + if (res[i].register_op >= IORING_REGISTER_LAST) + goto err; + __set_bit(res[i].register_op, restrictions->register_op); break; case IORING_RESTRICTION_SQE_OP: - if (res[i].sqe_op >= IORING_OP_LAST) { - ret = -EINVAL; - goto out; - } - - __set_bit(res[i].sqe_op, ctx->restrictions.sqe_op); + if (res[i].sqe_op >= IORING_OP_LAST) + goto err; + __set_bit(res[i].sqe_op, restrictions->sqe_op); break; case IORING_RESTRICTION_SQE_FLAGS_ALLOWED: - ctx->restrictions.sqe_flags_allowed = res[i].sqe_flags; + restrictions->sqe_flags_allowed = res[i].sqe_flags; break; case IORING_RESTRICTION_SQE_FLAGS_REQUIRED: - ctx->restrictions.sqe_flags_required = res[i].sqe_flags; + restrictions->sqe_flags_required = res[i].sqe_flags; break; default: - ret = -EINVAL; - goto out; + goto err; } } -out: + ret = 0; + +err: + kfree(res); + return ret; +} + +static __cold int io_register_restrictions(struct io_ring_ctx *ctx, + void __user *arg, unsigned int nr_args) +{ + int ret; + + /* Restrictions allowed only if rings started disabled */ + if (!(ctx->flags & IORING_SETUP_R_DISABLED)) + return -EBADFD; + + /* We allow only a single restrictions registration */ + if (ctx->restrictions.registered) + return -EBUSY; + + ret = io_parse_restrictions(arg, nr_args, &ctx->restrictions); /* Reset all restrictions if an error happened */ if (ret != 0) memset(&ctx->restrictions, 0, sizeof(ctx->restrictions)); else ctx->restrictions.registered = true; - - kfree(res); return ret; } @@ -268,6 +273,8 @@ static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx, if (ctx->flags & IORING_SETUP_SQPOLL) { sqd = ctx->sq_data; if (sqd) { + struct task_struct *tsk; + /* * Observe the correct sqd->lock -> ctx->uring_lock * ordering. 
Fine to drop uring_lock here, we hold @@ -277,8 +284,9 @@ static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx, mutex_unlock(&ctx->uring_lock); mutex_lock(&sqd->lock); mutex_lock(&ctx->uring_lock); - if (sqd->thread) - tctx = sqd->thread->io_uring; + tsk = sqpoll_task_locked(sqd); + if (tsk) + tctx = tsk->io_uring; } } else { tctx = current->io_uring; @@ -360,6 +368,267 @@ static int io_register_clock(struct io_ring_ctx *ctx, return 0; } +/* + * State to maintain until we can swap. Both new and old state, used for + * either mapping or freeing. + */ +struct io_ring_ctx_rings { + struct io_rings *rings; + struct io_uring_sqe *sq_sqes; + + struct io_mapped_region sq_region; + struct io_mapped_region ring_region; +}; + +static void io_register_free_rings(struct io_ring_ctx *ctx, + struct io_uring_params *p, + struct io_ring_ctx_rings *r) +{ + io_free_region(ctx, &r->sq_region); + io_free_region(ctx, &r->ring_region); +} + +#define swap_old(ctx, o, n, field) \ + do { \ + (o).field = (ctx)->field; \ + (ctx)->field = (n).field; \ + } while (0) + +#define RESIZE_FLAGS (IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP) +#define COPY_FLAGS (IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \ + IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP) + +static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) +{ + struct io_uring_region_desc rd; + struct io_ring_ctx_rings o = { }, n = { }, *to_free = NULL; + size_t size, sq_array_offset; + unsigned i, tail, old_head; + struct io_uring_params p; + int ret; + + /* for single issuer, must be owner resizing */ + if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && + current != ctx->submitter_task) + return -EEXIST; + /* limited to DEFER_TASKRUN for now */ + if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) + return -EINVAL; + if (copy_from_user(&p, arg, sizeof(p))) + return -EFAULT; + if (p.flags & ~RESIZE_FLAGS) + return -EINVAL; + + /* properties that are always inherited */ + p.flags |= (ctx->flags & COPY_FLAGS); + + ret = io_uring_fill_params(p.sq_entries, &p); + if (unlikely(ret)) + return ret; + + /* nothing to do, but copy params back */ + if (p.sq_entries == ctx->sq_entries && p.cq_entries == ctx->cq_entries) { + if (copy_to_user(arg, &p, sizeof(p))) + return -EFAULT; + return 0; + } + + size = rings_size(p.flags, p.sq_entries, p.cq_entries, + &sq_array_offset); + if (size == SIZE_MAX) + return -EOVERFLOW; + + memset(&rd, 0, sizeof(rd)); + rd.size = PAGE_ALIGN(size); + if (p.flags & IORING_SETUP_NO_MMAP) { + rd.user_addr = p.cq_off.user_addr; + rd.flags |= IORING_MEM_REGION_TYPE_USER; + } + ret = io_create_region_mmap_safe(ctx, &n.ring_region, &rd, IORING_OFF_CQ_RING); + if (ret) { + io_register_free_rings(ctx, &p, &n); + return ret; + } + n.rings = io_region_get_ptr(&n.ring_region); + + /* + * At this point n.rings is shared with userspace, just like o.rings + * is as well. While we don't expect userspace to modify it while + * a resize is in progress, and it's most likely that userspace will + * shoot itself in the foot if it does, we can't always assume good + * intent... Use read/write once helpers from here on to indicate the + * shared nature of it. 
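 * (Editor's note -- illustrative only, not part of this patch. Userspace
 *  reaches this resize path, limited here to IORING_SETUP_DEFER_TASKRUN
 *  rings, roughly as:
 *
 *      struct io_uring_params p = {
 *              .sq_entries = 256,
 *              .cq_entries = 512,
 *              .flags      = IORING_SETUP_CQSIZE,
 *      };
 *      io_uring_register(ring_fd, IORING_REGISTER_RESIZE_RINGS, &p, 1);
 *
 *  with the recomputed entries and offsets copied back into p on return.)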
+ */ + WRITE_ONCE(n.rings->sq_ring_mask, p.sq_entries - 1); + WRITE_ONCE(n.rings->cq_ring_mask, p.cq_entries - 1); + WRITE_ONCE(n.rings->sq_ring_entries, p.sq_entries); + WRITE_ONCE(n.rings->cq_ring_entries, p.cq_entries); + + if (copy_to_user(arg, &p, sizeof(p))) { + io_register_free_rings(ctx, &p, &n); + return -EFAULT; + } + + if (p.flags & IORING_SETUP_SQE128) + size = array_size(2 * sizeof(struct io_uring_sqe), p.sq_entries); + else + size = array_size(sizeof(struct io_uring_sqe), p.sq_entries); + if (size == SIZE_MAX) { + io_register_free_rings(ctx, &p, &n); + return -EOVERFLOW; + } + + memset(&rd, 0, sizeof(rd)); + rd.size = PAGE_ALIGN(size); + if (p.flags & IORING_SETUP_NO_MMAP) { + rd.user_addr = p.sq_off.user_addr; + rd.flags |= IORING_MEM_REGION_TYPE_USER; + } + ret = io_create_region_mmap_safe(ctx, &n.sq_region, &rd, IORING_OFF_SQES); + if (ret) { + io_register_free_rings(ctx, &p, &n); + return ret; + } + n.sq_sqes = io_region_get_ptr(&n.sq_region); + + /* + * If using SQPOLL, park the thread + */ + if (ctx->sq_data) { + mutex_unlock(&ctx->uring_lock); + io_sq_thread_park(ctx->sq_data); + mutex_lock(&ctx->uring_lock); + } + + /* + * We'll do the swap. Grab the ctx->mmap_lock, which will exclude + * any new mmap's on the ring fd. Clear out existing mappings to prevent + * mmap from seeing them, as we'll unmap them. Any attempt to mmap + * existing rings beyond this point will fail. Not that it could proceed + * at this point anyway, as the io_uring mmap side needs go grab the + * ctx->mmap_lock as well. Likewise, hold the completion lock over the + * duration of the actual swap. + */ + mutex_lock(&ctx->mmap_lock); + spin_lock(&ctx->completion_lock); + o.rings = ctx->rings; + ctx->rings = NULL; + o.sq_sqes = ctx->sq_sqes; + ctx->sq_sqes = NULL; + + /* + * Now copy SQ and CQ entries, if any. If either of the destination + * rings can't hold what is already there, then fail the operation. 
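 * (Editor's note, not part of this patch: a worked example of the copy
 *  below -- with old sq_entries == 8, new p.sq_entries == 4, head == 6 and
 *  tail == 9, the three pending SQEs at old slots 6, 7, 0 (i & 7) are
 *  copied to new slots 2, 3, 0 (i & 3); the resize only fails, with
 *  -EOVERFLOW, when tail - head exceeds the new ring size.)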
+ */ + tail = READ_ONCE(o.rings->sq.tail); + old_head = READ_ONCE(o.rings->sq.head); + if (tail - old_head > p.sq_entries) + goto overflow; + for (i = old_head; i < tail; i++) { + unsigned src_head = i & (ctx->sq_entries - 1); + unsigned dst_head = i & (p.sq_entries - 1); + + n.sq_sqes[dst_head] = o.sq_sqes[src_head]; + } + WRITE_ONCE(n.rings->sq.head, old_head); + WRITE_ONCE(n.rings->sq.tail, tail); + + tail = READ_ONCE(o.rings->cq.tail); + old_head = READ_ONCE(o.rings->cq.head); + if (tail - old_head > p.cq_entries) { +overflow: + /* restore old rings, and return -EOVERFLOW via cleanup path */ + ctx->rings = o.rings; + ctx->sq_sqes = o.sq_sqes; + to_free = &n; + ret = -EOVERFLOW; + goto out; + } + for (i = old_head; i < tail; i++) { + unsigned src_head = i & (ctx->cq_entries - 1); + unsigned dst_head = i & (p.cq_entries - 1); + + n.rings->cqes[dst_head] = o.rings->cqes[src_head]; + } + WRITE_ONCE(n.rings->cq.head, old_head); + WRITE_ONCE(n.rings->cq.tail, tail); + /* invalidate cached cqe refill */ + ctx->cqe_cached = ctx->cqe_sentinel = NULL; + + WRITE_ONCE(n.rings->sq_dropped, READ_ONCE(o.rings->sq_dropped)); + atomic_set(&n.rings->sq_flags, atomic_read(&o.rings->sq_flags)); + WRITE_ONCE(n.rings->cq_flags, READ_ONCE(o.rings->cq_flags)); + WRITE_ONCE(n.rings->cq_overflow, READ_ONCE(o.rings->cq_overflow)); + + /* all done, store old pointers and assign new ones */ + if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) + ctx->sq_array = (u32 *)((char *)n.rings + sq_array_offset); + + ctx->sq_entries = p.sq_entries; + ctx->cq_entries = p.cq_entries; + + ctx->rings = n.rings; + ctx->sq_sqes = n.sq_sqes; + swap_old(ctx, o, n, ring_region); + swap_old(ctx, o, n, sq_region); + to_free = &o; + ret = 0; +out: + spin_unlock(&ctx->completion_lock); + mutex_unlock(&ctx->mmap_lock); + io_register_free_rings(ctx, &p, to_free); + + if (ctx->sq_data) + io_sq_thread_unpark(ctx->sq_data); + + return ret; +} + +static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg) +{ + struct io_uring_mem_region_reg __user *reg_uptr = uarg; + struct io_uring_mem_region_reg reg; + struct io_uring_region_desc __user *rd_uptr; + struct io_uring_region_desc rd; + int ret; + + if (io_region_is_set(&ctx->param_region)) + return -EBUSY; + if (copy_from_user(®, reg_uptr, sizeof(reg))) + return -EFAULT; + rd_uptr = u64_to_user_ptr(reg.region_uptr); + if (copy_from_user(&rd, rd_uptr, sizeof(rd))) + return -EFAULT; + if (memchr_inv(®.__resv, 0, sizeof(reg.__resv))) + return -EINVAL; + if (reg.flags & ~IORING_MEM_REGION_REG_WAIT_ARG) + return -EINVAL; + + /* + * This ensures there are no waiters. Waiters are unlocked and it's + * hard to synchronise with them, especially if we need to initialise + * the region. 
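 * (Editor's note -- illustrative only, not part of this patch. A minimal
 *  registration of a user-provided region as the wait-argument area, which
 *  the check below only accepts while the ring is still
 *  IORING_SETUP_R_DISABLED:
 *
 *      struct io_uring_region_desc rd = {
 *              .user_addr = (unsigned long)buf,   // page-aligned user buffer
 *              .size      = 4096,
 *              .flags     = IORING_MEM_REGION_TYPE_USER,
 *      };
 *      struct io_uring_mem_region_reg reg = {
 *              .region_uptr = (unsigned long)&rd,
 *              .flags       = IORING_MEM_REGION_REG_WAIT_ARG,
 *      };
 *      io_uring_register(ring_fd, IORING_REGISTER_MEM_REGION, &reg, 1);
 *  )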
+ */ + if ((reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) && + !(ctx->flags & IORING_SETUP_R_DISABLED)) + return -EINVAL; + + ret = io_create_region_mmap_safe(ctx, &ctx->param_region, &rd, + IORING_MAP_OFF_PARAM_REGION); + if (ret) + return ret; + if (copy_to_user(rd_uptr, &rd, sizeof(rd))) { + io_free_region(ctx, &ctx->param_region); + return -EFAULT; + } + + if (reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) { + ctx->cq_wait_arg = io_region_get_ptr(&ctx->param_region); + ctx->cq_wait_size = rd.size; + } + return 0; +} + static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, void __user *arg, unsigned nr_args) __releases(ctx->uring_lock) @@ -548,6 +817,24 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_register_clone_buffers(ctx, arg); break; + case IORING_REGISTER_ZCRX_IFQ: + ret = -EINVAL; + if (!arg || nr_args != 1) + break; + ret = io_register_zcrx_ifq(ctx, arg); + break; + case IORING_REGISTER_RESIZE_RINGS: + ret = -EINVAL; + if (!arg || nr_args != 1) + break; + ret = io_register_resize_rings(ctx, arg); + break; + case IORING_REGISTER_MEM_REGION: + ret = -EINVAL; + if (!arg || nr_args != 1) + break; + ret = io_register_mem_region(ctx, arg); + break; default: ret = -EINVAL; break; @@ -576,6 +863,8 @@ struct file *io_uring_register_get_file(unsigned int fd, bool registered) return ERR_PTR(-EINVAL); fd = array_index_nospec(fd, IO_RINGFD_REG_MAX); file = tctx->registered_rings[fd]; + if (file) + get_file(file); } else { file = fget(fd); } @@ -588,6 +877,32 @@ struct file *io_uring_register_get_file(unsigned int fd, bool registered) return ERR_PTR(-EOPNOTSUPP); } +/* + * "blind" registration opcodes are ones where there's no ring given, and + * hence the source fd must be -1. + */ +static int io_uring_register_blind(unsigned int opcode, void __user *arg, + unsigned int nr_args) +{ + switch (opcode) { + case IORING_REGISTER_SEND_MSG_RING: { + struct io_uring_sqe sqe; + + if (!arg || nr_args != 1) + return -EINVAL; + if (copy_from_user(&sqe, arg, sizeof(sqe))) + return -EFAULT; + /* no flags supported */ + if (sqe.flags) + return -EINVAL; + if (sqe.opcode == IORING_OP_MSG_RING) + return io_uring_sync_msg_ring(&sqe); + } + } + + return -EINVAL; +} + SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode, void __user *, arg, unsigned int, nr_args) { @@ -602,6 +917,9 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode, if (opcode >= IORING_REGISTER_LAST) return -EINVAL; + if (fd == -1) + return io_uring_register_blind(opcode, arg, nr_args); + file = io_uring_register_get_file(fd, use_registered_ring); if (IS_ERR(file)) return PTR_ERR(file); @@ -609,9 +927,11 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode, mutex_lock(&ctx->uring_lock); ret = __io_uring_register(ctx, opcode, arg, nr_args); + + trace_io_uring_register(ctx, opcode, ctx->file_table.data.nr, + ctx->buf_table.nr, ret); mutex_unlock(&ctx->uring_lock); - trace_io_uring_register(ctx, opcode, ctx->nr_user_files, ctx->nr_user_bufs, ret); - if (!use_registered_ring) - fput(file); + + fput(file); return ret; } diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 6f3b6de230bd..f2b31fb68992 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -9,11 +9,11 @@ #include <linux/hugetlb.h> #include <linux/compat.h> #include <linux/io_uring.h> +#include <linux/io_uring/cmd.h> #include <uapi/linux/io_uring.h> #include "io_uring.h" -#include "alloc_cache.h" #include "openclose.h" #include "rsrc.h" #include "memmap.h" @@ 
-26,20 +26,14 @@ struct io_rsrc_update { u32 offset; }; -static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc); -static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, - struct io_mapped_ubuf **pimu, - struct page **last_hpage); +static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, + struct iovec *iov, struct page **last_hpage); /* only define max */ #define IORING_MAX_FIXED_FILES (1U << 20) #define IORING_MAX_REG_BUFFERS (1U << 14) -static const struct io_mapped_ubuf dummy_ubuf = { - /* set invalid range, so io_import_fixed() fails meeting it */ - .ubuf = -1UL, - .len = UINT_MAX, -}; +#define IO_CACHED_BVECS_SEGS 32 int __io_account_mem(struct user_struct *user, unsigned long nr_pages) { @@ -86,10 +80,21 @@ static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) return 0; } -static int io_buffer_validate(struct iovec *iov) +int io_validate_user_buf_range(u64 uaddr, u64 ulen) { - unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1); + unsigned long tmp, base = (unsigned long)uaddr; + unsigned long acct_len = (unsigned long)PAGE_ALIGN(ulen); + /* arbitrary limit, but we need something */ + if (ulen > SZ_1G || !ulen) + return -EFAULT; + if (check_add_overflow(base, acct_len, &tmp)) + return -EOVERFLOW; + return 0; +} + +static int io_buffer_validate(struct iovec *iov) +{ /* * Don't impose further limits on the size and buffer * constraints here, we'll -EINVAL later when IO is @@ -97,235 +102,120 @@ static int io_buffer_validate(struct iovec *iov) */ if (!iov->iov_base) return iov->iov_len ? -EFAULT : 0; - if (!iov->iov_len) - return -EFAULT; - - /* arbitrary limit, but we need something */ - if (iov->iov_len > SZ_1G) - return -EFAULT; - - if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp)) - return -EOVERFLOW; - return 0; + return io_validate_user_buf_range((unsigned long)iov->iov_base, + iov->iov_len); } -static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot) +static void io_release_ubuf(void *priv) { - struct io_mapped_ubuf *imu = *slot; + struct io_mapped_ubuf *imu = priv; unsigned int i; - *slot = NULL; - if (imu != &dummy_ubuf) { - if (!refcount_dec_and_test(&imu->refs)) - return; - for (i = 0; i < imu->nr_bvecs; i++) - unpin_user_page(imu->bvec[i].bv_page); - if (imu->acct_pages) - io_unaccount_mem(ctx, imu->acct_pages); - kvfree(imu); - } -} + for (i = 0; i < imu->nr_bvecs; i++) { + struct folio *folio = page_folio(imu->bvec[i].bv_page); -static void io_rsrc_put_work(struct io_rsrc_node *node) -{ - struct io_rsrc_put *prsrc = &node->item; - - if (prsrc->tag) - io_post_aux_cqe(node->ctx, prsrc->tag, 0, 0); - - switch (node->type) { - case IORING_RSRC_FILE: - fput(prsrc->file); - break; - case IORING_RSRC_BUFFER: - io_rsrc_buf_put(node->ctx, prsrc); - break; - default: - WARN_ON_ONCE(1); - break; + unpin_user_folio(folio, 1); } } -void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *node) +static struct io_mapped_ubuf *io_alloc_imu(struct io_ring_ctx *ctx, + int nr_bvecs) { - if (!io_alloc_cache_put(&ctx->rsrc_node_cache, node)) - kfree(node); + if (nr_bvecs <= IO_CACHED_BVECS_SEGS) + return io_cache_alloc(&ctx->imu_cache, GFP_KERNEL); + return kvmalloc(struct_size_t(struct io_mapped_ubuf, bvec, nr_bvecs), + GFP_KERNEL); } -void io_rsrc_node_ref_zero(struct io_rsrc_node *node) - __must_hold(&node->ctx->uring_lock) +static void io_free_imu(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu) { - struct io_ring_ctx *ctx = 
node->ctx; - - while (!list_empty(&ctx->rsrc_ref_list)) { - node = list_first_entry(&ctx->rsrc_ref_list, - struct io_rsrc_node, node); - /* recycle ref nodes in order */ - if (node->refs) - break; - list_del(&node->node); - - if (likely(!node->empty)) - io_rsrc_put_work(node); - io_rsrc_node_destroy(ctx, node); - } - if (list_empty(&ctx->rsrc_ref_list) && unlikely(ctx->rsrc_quiesce)) - wake_up_all(&ctx->rsrc_quiesce_wq); + if (imu->nr_bvecs <= IO_CACHED_BVECS_SEGS) + io_cache_free(&ctx->imu_cache, imu); + else + kvfree(imu); } -struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx) +static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf *imu) { - struct io_rsrc_node *ref_node; - - ref_node = io_alloc_cache_get(&ctx->rsrc_node_cache); - if (!ref_node) { - ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL); - if (!ref_node) - return NULL; - } + if (!refcount_dec_and_test(&imu->refs)) + return; - ref_node->ctx = ctx; - ref_node->empty = 0; - ref_node->refs = 1; - return ref_node; + if (imu->acct_pages) + io_unaccount_mem(ctx, imu->acct_pages); + imu->release(imu->priv); + io_free_imu(ctx, imu); } -__cold static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, - struct io_ring_ctx *ctx) +struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type) { - struct io_rsrc_node *backup; - DEFINE_WAIT(we); - int ret; - - /* As We may drop ->uring_lock, other task may have started quiesce */ - if (data->quiesce) - return -ENXIO; - - backup = io_rsrc_node_alloc(ctx); - if (!backup) - return -ENOMEM; - ctx->rsrc_node->empty = true; - ctx->rsrc_node->type = -1; - list_add_tail(&ctx->rsrc_node->node, &ctx->rsrc_ref_list); - io_put_rsrc_node(ctx, ctx->rsrc_node); - ctx->rsrc_node = backup; - - if (list_empty(&ctx->rsrc_ref_list)) - return 0; - - if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { - atomic_set(&ctx->cq_wait_nr, 1); - smp_mb(); + struct io_rsrc_node *node; + + node = io_cache_alloc(&ctx->node_cache, GFP_KERNEL); + if (node) { + node->type = type; + node->refs = 1; + node->tag = 0; + node->file_ptr = 0; } - - ctx->rsrc_quiesce++; - data->quiesce = true; - do { - prepare_to_wait(&ctx->rsrc_quiesce_wq, &we, TASK_INTERRUPTIBLE); - mutex_unlock(&ctx->uring_lock); - - ret = io_run_task_work_sig(ctx); - if (ret < 0) { - finish_wait(&ctx->rsrc_quiesce_wq, &we); - mutex_lock(&ctx->uring_lock); - if (list_empty(&ctx->rsrc_ref_list)) - ret = 0; - break; - } - - schedule(); - mutex_lock(&ctx->uring_lock); - ret = 0; - } while (!list_empty(&ctx->rsrc_ref_list)); - - finish_wait(&ctx->rsrc_quiesce_wq, &we); - data->quiesce = false; - ctx->rsrc_quiesce--; - - if (ctx->flags & IORING_SETUP_DEFER_TASKRUN) { - atomic_set(&ctx->cq_wait_nr, 0); - smp_mb(); - } - return ret; + return node; } -static void io_free_page_table(void **table, size_t size) +bool io_rsrc_cache_init(struct io_ring_ctx *ctx) { - unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE); - - for (i = 0; i < nr_tables; i++) - kfree(table[i]); - kfree(table); + const int imu_cache_size = struct_size_t(struct io_mapped_ubuf, bvec, + IO_CACHED_BVECS_SEGS); + const int node_size = sizeof(struct io_rsrc_node); + bool ret; + + ret = io_alloc_cache_init(&ctx->node_cache, IO_ALLOC_CACHE_MAX, + node_size, 0); + ret |= io_alloc_cache_init(&ctx->imu_cache, IO_ALLOC_CACHE_MAX, + imu_cache_size, 0); + return ret; } -static void io_rsrc_data_free(struct io_rsrc_data *data) +void io_rsrc_cache_free(struct io_ring_ctx *ctx) { - size_t size = data->nr * sizeof(data->tags[0][0]); - - if (data->tags) - 
io_free_page_table((void **)data->tags, size); - kfree(data); + io_alloc_cache_free(&ctx->node_cache, kfree); + io_alloc_cache_free(&ctx->imu_cache, kfree); } -static __cold void **io_alloc_page_table(size_t size) +static void io_clear_table_tags(struct io_rsrc_data *data) { - unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE); - size_t init_size = size; - void **table; - - table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT); - if (!table) - return NULL; + int i; - for (i = 0; i < nr_tables; i++) { - unsigned int this_size = min_t(size_t, size, PAGE_SIZE); + for (i = 0; i < data->nr; i++) { + struct io_rsrc_node *node = data->nodes[i]; - table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT); - if (!table[i]) { - io_free_page_table(table, init_size); - return NULL; - } - size -= this_size; + if (node) + node->tag = 0; } - return table; } -__cold static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, int type, - u64 __user *utags, - unsigned nr, struct io_rsrc_data **pdata) +__cold void io_rsrc_data_free(struct io_ring_ctx *ctx, + struct io_rsrc_data *data) { - struct io_rsrc_data *data; - int ret = 0; - unsigned i; - - data = kzalloc(sizeof(*data), GFP_KERNEL); - if (!data) - return -ENOMEM; - data->tags = (u64 **)io_alloc_page_table(nr * sizeof(data->tags[0][0])); - if (!data->tags) { - kfree(data); - return -ENOMEM; + if (!data->nr) + return; + while (data->nr--) { + if (data->nodes[data->nr]) + io_put_rsrc_node(ctx, data->nodes[data->nr]); } + kvfree(data->nodes); + data->nodes = NULL; + data->nr = 0; +} - data->nr = nr; - data->ctx = ctx; - data->rsrc_type = type; - if (utags) { - ret = -EFAULT; - for (i = 0; i < nr; i++) { - u64 *tag_slot = io_get_tag_slot(data, i); - - if (copy_from_user(tag_slot, &utags[i], - sizeof(*tag_slot))) - goto fail; - } +__cold int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr) +{ + data->nodes = kvmalloc_array(nr, sizeof(struct io_rsrc_node *), + GFP_KERNEL_ACCOUNT | __GFP_ZERO); + if (data->nodes) { + data->nr = nr; + return 0; } - *pdata = data; - return 0; -fail: - io_rsrc_data_free(data); - return ret; + return -ENOMEM; } static int __io_sqe_files_update(struct io_ring_ctx *ctx, @@ -334,14 +224,12 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, { u64 __user *tags = u64_to_user_ptr(up->tags); __s32 __user *fds = u64_to_user_ptr(up->data); - struct io_rsrc_data *data = ctx->file_data; - struct io_fixed_file *file_slot; int fd, i, err = 0; unsigned int done; - if (!ctx->file_data) + if (!ctx->file_table.data.nr) return -ENXIO; - if (up->offset + nr_args > ctx->nr_user_files) + if (up->offset + nr_args > ctx->file_table.data.nr) return -EINVAL; for (done = 0; done < nr_args; done++) { @@ -359,19 +247,13 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, if (fd == IORING_REGISTER_FILES_SKIP) continue; - i = array_index_nospec(up->offset + done, ctx->nr_user_files); - file_slot = io_fixed_file_slot(&ctx->file_table, i); - - if (file_slot->file_ptr) { - err = io_queue_rsrc_removal(data, i, - io_slot_file(file_slot)); - if (err) - break; - file_slot->file_ptr = 0; + i = up->offset + done; + if (io_reset_rsrc_node(ctx, &ctx->file_table.data, i)) io_file_bitmap_clear(&ctx->file_table, i); - } + if (fd != -1) { struct file *file = fget(fd); + struct io_rsrc_node *node; if (!file) { err = -EBADF; @@ -385,8 +267,16 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, err = -EBADF; break; } - *io_get_tag_slot(data, i) = tag; - io_fixed_file_set(file_slot, file); + node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE); + 
if (!node) { + err = -ENOMEM; + fput(file); + break; + } + ctx->file_table.data.nodes[i] = node; + if (tag) + node->tag = tag; + io_fixed_file_set(node, file); io_file_bitmap_set(&ctx->file_table, i); } } @@ -405,13 +295,13 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, __u32 done; int i, err; - if (!ctx->buf_data) + if (!ctx->buf_table.nr) return -ENXIO; - if (up->offset + nr_args > ctx->nr_user_bufs) + if (up->offset + nr_args > ctx->buf_table.nr) return -EINVAL; for (done = 0; done < nr_args; done++) { - struct io_mapped_ubuf *imu; + struct io_rsrc_node *node; u64 tag = 0; uvec = u64_to_user_ptr(user_data); @@ -427,27 +317,21 @@ static int __io_sqe_buffers_update(struct io_ring_ctx *ctx, err = io_buffer_validate(iov); if (err) break; - if (!iov->iov_base && tag) { - err = -EINVAL; + node = io_sqe_buffer_register(ctx, iov, &last_hpage); + if (IS_ERR(node)) { + err = PTR_ERR(node); break; } - err = io_sqe_buffer_register(ctx, iov, &imu, &last_hpage); - if (err) - break; - - i = array_index_nospec(up->offset + done, ctx->nr_user_bufs); - if (ctx->user_bufs[i] != &dummy_ubuf) { - err = io_queue_rsrc_removal(ctx->buf_data, i, - ctx->user_bufs[i]); - if (unlikely(err)) { - io_buffer_unmap(ctx, &imu); + if (tag) { + if (!node) { + err = -EINVAL; break; } - ctx->user_bufs[i] = (struct io_mapped_ubuf *)&dummy_ubuf; + node->tag = tag; } - - ctx->user_bufs[i] = imu; - *io_get_tag_slot(ctx->buf_data, i) = tag; + i = array_index_nospec(up->offset + done, ctx->buf_table.nr); + io_reset_rsrc_node(ctx, &ctx->buf_table, i); + ctx->buf_table.nodes[i] = node; if (ctx->compat) user_data += sizeof(struct compat_iovec); else @@ -563,7 +447,7 @@ static int io_files_update_with_index_alloc(struct io_kiocb *req, struct file *file; int ret, fd; - if (!req->ctx->file_data) + if (!req->ctx->file_table.data.nr) return -ENXIO; for (done = 0; done < up->nr_args; done++) { @@ -619,68 +503,37 @@ int io_files_update(struct io_kiocb *req, unsigned int issue_flags) if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } -int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, void *rsrc) +void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node) { - struct io_ring_ctx *ctx = data->ctx; - struct io_rsrc_node *node = ctx->rsrc_node; - u64 *tag_slot = io_get_tag_slot(data, idx); - - ctx->rsrc_node = io_rsrc_node_alloc(ctx); - if (unlikely(!ctx->rsrc_node)) { - ctx->rsrc_node = node; - return -ENOMEM; - } - - node->item.rsrc = rsrc; - node->type = data->rsrc_type; - node->item.tag = *tag_slot; - *tag_slot = 0; - list_add_tail(&node->node, &ctx->rsrc_ref_list); - io_put_rsrc_node(ctx, node); - return 0; -} + if (node->tag) + io_post_aux_cqe(ctx, node->tag, 0, 0); -void __io_sqe_files_unregister(struct io_ring_ctx *ctx) -{ - int i; - - for (i = 0; i < ctx->nr_user_files; i++) { - struct file *file = io_file_from_index(&ctx->file_table, i); - - if (!file) - continue; - io_file_bitmap_clear(&ctx->file_table, i); - fput(file); + switch (node->type) { + case IORING_RSRC_FILE: + fput(io_slot_file(node)); + break; + case IORING_RSRC_BUFFER: + io_buffer_unmap(ctx, node->buf); + break; + default: + WARN_ON_ONCE(1); + break; } - io_free_file_tables(&ctx->file_table); - io_file_table_set_alloc_range(ctx, 0, 0); - io_rsrc_data_free(ctx->file_data); - ctx->file_data = NULL; - ctx->nr_user_files = 0; + io_cache_free(&ctx->node_cache, node); } int io_sqe_files_unregister(struct io_ring_ctx *ctx) { - unsigned nr = ctx->nr_user_files; - int ret; - - if 
(!ctx->file_data) + if (!ctx->file_table.data.nr) return -ENXIO; - /* - * Quiesce may unlock ->uring_lock, and while it's not held - * prevent new requests using the table. - */ - ctx->nr_user_files = 0; - ret = io_rsrc_ref_quiesce(ctx->file_data, ctx); - ctx->nr_user_files = nr; - if (!ret) - __io_sqe_files_unregister(ctx); - return ret; + io_free_file_tables(ctx, &ctx->file_table); + io_file_table_set_alloc_range(ctx, 0, 0); + return 0; } int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, @@ -691,7 +544,7 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, int fd, ret; unsigned i; - if (ctx->file_data) + if (ctx->file_table.data.nr) return -EBUSY; if (!nr_args) return -EINVAL; @@ -699,28 +552,22 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, return -EMFILE; if (nr_args > rlimit(RLIMIT_NOFILE)) return -EMFILE; - ret = io_rsrc_data_alloc(ctx, IORING_RSRC_FILE, tags, nr_args, - &ctx->file_data); - if (ret) - return ret; - - if (!io_alloc_file_tables(&ctx->file_table, nr_args)) { - io_rsrc_data_free(ctx->file_data); - ctx->file_data = NULL; + if (!io_alloc_file_tables(ctx, &ctx->file_table, nr_args)) return -ENOMEM; - } - for (i = 0; i < nr_args; i++, ctx->nr_user_files++) { - struct io_fixed_file *file_slot; + for (i = 0; i < nr_args; i++) { + struct io_rsrc_node *node; + u64 tag = 0; - if (fds && copy_from_user(&fd, &fds[i], sizeof(fd))) { - ret = -EFAULT; + ret = -EFAULT; + if (tags && copy_from_user(&tag, &tags[i], sizeof(tag))) + goto fail; + if (fds && copy_from_user(&fd, &fds[i], sizeof(fd))) goto fail; - } /* allow sparse sets */ if (!fds || fd == -1) { ret = -EINVAL; - if (unlikely(*io_get_tag_slot(ctx->file_data, i))) + if (tag) goto fail; continue; } @@ -737,56 +584,34 @@ int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, fput(file); goto fail; } - file_slot = io_fixed_file_slot(&ctx->file_table, i); - io_fixed_file_set(file_slot, file); + ret = -ENOMEM; + node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE); + if (!node) { + fput(file); + goto fail; + } + if (tag) + node->tag = tag; + ctx->file_table.data.nodes[i] = node; + io_fixed_file_set(node, file); io_file_bitmap_set(&ctx->file_table, i); } /* default it to the whole table */ - io_file_table_set_alloc_range(ctx, 0, ctx->nr_user_files); + io_file_table_set_alloc_range(ctx, 0, ctx->file_table.data.nr); return 0; fail: - __io_sqe_files_unregister(ctx); + io_clear_table_tags(&ctx->file_table.data); + io_sqe_files_unregister(ctx); return ret; } -static void io_rsrc_buf_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc) -{ - io_buffer_unmap(ctx, &prsrc->buf); - prsrc->buf = NULL; -} - -void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx) -{ - unsigned int i; - - for (i = 0; i < ctx->nr_user_bufs; i++) - io_buffer_unmap(ctx, &ctx->user_bufs[i]); - kfree(ctx->user_bufs); - io_rsrc_data_free(ctx->buf_data); - ctx->user_bufs = NULL; - ctx->buf_data = NULL; - ctx->nr_user_bufs = 0; -} - int io_sqe_buffers_unregister(struct io_ring_ctx *ctx) { - unsigned nr = ctx->nr_user_bufs; - int ret; - - if (!ctx->buf_data) + if (!ctx->buf_table.nr) return -ENXIO; - - /* - * Quiesce may unlock ->uring_lock, and while it's not held - * prevent new requests using the table. 
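Aside (not from the patch): the quiesce path being removed here is unnecessary once every table slot points at a refcounted io_rsrc_node, because a slot can be cleared while in-flight requests still hold their own reference to the node. A userspace toy model of that lifecycle, with invented names:

#include <stdlib.h>
#include <stdio.h>

/* Toy model of per-resource node refcounting: the table slot holds one
 * reference and every user takes its own, so unregistering a slot never has
 * to wait for outstanding requests. Illustrative only.
 */
struct node {
        int refs;
        int fd;                         /* stand-in for file_ptr / buf */
};

static struct node *table[16];

static void node_put(struct node *n)
{
        if (!--n->refs) {
                printf("releasing resource fd=%d\n", n->fd);
                free(n);
        }
}

int main(void)
{
        struct node *n = malloc(sizeof(*n));

        n->refs = 1;                    /* table reference */
        n->fd = 42;
        table[3] = n;

        n->refs++;                      /* a request starts using slot 3 */
        table[3] = NULL;                /* slot unregistered: drop table ref */
        node_put(n);                    /* ...but the resource survives */

        node_put(n);                    /* request completes: now it is freed */
        return 0;
}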
- */ - ctx->nr_user_bufs = 0; - ret = io_rsrc_ref_quiesce(ctx->buf_data, ctx); - ctx->nr_user_bufs = nr; - if (!ret) - __io_sqe_buffers_unregister(ctx); - return ret; + io_rsrc_data_free(ctx, &ctx->buf_table); + return 0; } /* @@ -812,9 +637,13 @@ static bool headpage_already_acct(struct io_ring_ctx *ctx, struct page **pages, } /* check previously registered pages */ - for (i = 0; i < ctx->nr_user_bufs; i++) { - struct io_mapped_ubuf *imu = ctx->user_bufs[i]; + for (i = 0; i < ctx->buf_table.nr; i++) { + struct io_rsrc_node *node = ctx->buf_table.nodes[i]; + struct io_mapped_ubuf *imu; + if (!node) + continue; + imu = node->buf; for (j = 0; j < imu->nr_bvecs; j++) { if (!PageCompound(imu->bvec[j].bv_page)) continue; @@ -858,68 +687,60 @@ static int io_buffer_account_pin(struct io_ring_ctx *ctx, struct page **pages, return ret; } -static bool io_do_coalesce_buffer(struct page ***pages, int *nr_pages, - struct io_imu_folio_data *data, int nr_folios) +static bool io_coalesce_buffer(struct page ***pages, int *nr_pages, + struct io_imu_folio_data *data) { struct page **page_array = *pages, **new_array = NULL; - int nr_pages_left = *nr_pages, i, j; + unsigned nr_pages_left = *nr_pages; + unsigned nr_folios = data->nr_folios; + unsigned i, j; /* Store head pages only*/ - new_array = kvmalloc_array(nr_folios, sizeof(struct page *), - GFP_KERNEL); + new_array = kvmalloc_array(nr_folios, sizeof(struct page *), GFP_KERNEL); if (!new_array) return false; - new_array[0] = compound_head(page_array[0]); - /* - * The pages are bound to the folio, it doesn't - * actually unpin them but drops all but one reference, - * which is usually put down by io_buffer_unmap(). - * Note, needs a better helper. - */ - if (data->nr_pages_head > 1) - unpin_user_pages(&page_array[1], data->nr_pages_head - 1); - - j = data->nr_pages_head; - nr_pages_left -= data->nr_pages_head; - for (i = 1; i < nr_folios; i++) { - unsigned int nr_unpin; - - new_array[i] = page_array[j]; - nr_unpin = min_t(unsigned int, nr_pages_left - 1, - data->nr_pages_mid - 1); - if (nr_unpin) - unpin_user_pages(&page_array[j+1], nr_unpin); - j += data->nr_pages_mid; - nr_pages_left -= data->nr_pages_mid; + for (i = 0, j = 0; i < nr_folios; i++) { + struct page *p = compound_head(page_array[j]); + struct folio *folio = page_folio(p); + unsigned int nr; + + WARN_ON_ONCE(i > 0 && p != page_array[j]); + + nr = i ? data->nr_pages_mid : data->nr_pages_head; + nr = min(nr, nr_pages_left); + /* Drop all but one ref, the entire folio will remain pinned. */ + if (nr > 1) + unpin_user_folio(folio, nr - 1); + j += nr; + nr_pages_left -= nr; + new_array[i] = p; } + + WARN_ON_ONCE(j != *nr_pages); + kvfree(page_array); *pages = new_array; *nr_pages = nr_folios; return true; } -static bool io_try_coalesce_buffer(struct page ***pages, int *nr_pages, - struct io_imu_folio_data *data) +bool io_check_coalesce_buffer(struct page **page_array, int nr_pages, + struct io_imu_folio_data *data) { - struct page **page_array = *pages; struct folio *folio = page_folio(page_array[0]); unsigned int count = 1, nr_folios = 1; int i; - if (*nr_pages <= 1) - return false; - data->nr_pages_mid = folio_nr_pages(folio); - if (data->nr_pages_mid == 1) - return false; - data->folio_shift = folio_shift(folio); + data->first_folio_page_idx = folio_page_idx(folio, page_array[0]); + /* * Check if pages are contiguous inside a folio, and all folios have * the same page count except for the head and tail. 
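A worked example, not part of the patch: coalescing buys fewer bvecs plus a larger folio_shift, with the pages skipped at the head of the first folio folded into the starting offset, as io_sqe_buffer_register() does a little further down. Made-up numbers (4K pages, one 2MB folio):

#include <stdio.h>

#define PAGE_SHIFT      12
#define PAGE_SIZE       (1UL << PAGE_SHIFT)

/* A 2MB folio (folio_shift = 21) pinned for a buffer whose first byte sits
 * 0x800 bytes into the folio's third 4K page. Coalesced, the whole folio is
 * one bvec and the two skipped head pages become part of the start offset.
 */
int main(void)
{
        unsigned long iov_base = 0x7f0000002800UL;
        unsigned long first_folio_page_idx = 2;         /* from folio_page_idx() */

        unsigned long off = iov_base & (PAGE_SIZE - 1); /* 0x800 */
        off += first_folio_page_idx << PAGE_SHIFT;      /* 0x2800 into the folio */

        printf("single bvec, folio_shift=21, starting offset 0x%lx\n", off);
        return 0;
}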
*/ - for (i = 1; i < *nr_pages; i++) { + for (i = 1; i < nr_pages; i++) { if (page_folio(page_array[i]) == folio && page_array[i] == page_array[i-1] + 1) { count++; @@ -947,24 +768,29 @@ static bool io_try_coalesce_buffer(struct page ***pages, int *nr_pages, if (nr_folios == 1) data->nr_pages_head = count; - return io_do_coalesce_buffer(pages, nr_pages, data, nr_folios); + data->nr_folios = nr_folios; + return true; } -static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, - struct io_mapped_ubuf **pimu, - struct page **last_hpage) +static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx, + struct iovec *iov, + struct page **last_hpage) { struct io_mapped_ubuf *imu = NULL; struct page **pages = NULL; + struct io_rsrc_node *node; unsigned long off; size_t size; int ret, nr_pages, i; struct io_imu_folio_data data; - bool coalesced; + bool coalesced = false; - *pimu = (struct io_mapped_ubuf *)&dummy_ubuf; if (!iov->iov_base) - return 0; + return NULL; + + node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER); + if (!node) + return ERR_PTR(-ENOMEM); ret = -ENOMEM; pages = io_pin_pages((unsigned long) iov->iov_base, iov->iov_len, @@ -976,29 +802,38 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, } /* If it's huge page(s), try to coalesce them into fewer bvec entries */ - coalesced = io_try_coalesce_buffer(&pages, &nr_pages, &data); + if (nr_pages > 1 && io_check_coalesce_buffer(pages, nr_pages, &data)) { + if (data.nr_pages_mid != 1) + coalesced = io_coalesce_buffer(&pages, &nr_pages, &data); + } - imu = kvmalloc(struct_size(imu, bvec, nr_pages), GFP_KERNEL); + imu = io_alloc_imu(ctx, nr_pages); if (!imu) goto done; + imu->nr_bvecs = nr_pages; ret = io_buffer_account_pin(ctx, pages, nr_pages, imu, last_hpage); - if (ret) { - unpin_user_pages(pages, nr_pages); + if (ret) goto done; - } size = iov->iov_len; /* store original address for later verification */ imu->ubuf = (unsigned long) iov->iov_base; imu->len = iov->iov_len; - imu->nr_bvecs = nr_pages; imu->folio_shift = PAGE_SHIFT; + imu->release = io_release_ubuf; + imu->priv = imu; + imu->is_kbuf = false; + imu->dir = IO_IMU_DEST | IO_IMU_SOURCE; if (coalesced) imu->folio_shift = data.folio_shift; refcount_set(&imu->refs, 1); - off = (unsigned long) iov->iov_base & ((1UL << imu->folio_shift) - 1); - *pimu = imu; + + off = (unsigned long)iov->iov_base & ~PAGE_MASK; + if (coalesced) + off += data.first_folio_page_idx << PAGE_SHIFT; + + node->buf = imu; ret = 0; for (i = 0; i < nr_pages; i++) { @@ -1010,46 +845,46 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, size -= vec_len; } done: - if (ret) - kvfree(imu); + if (ret) { + if (imu) + io_free_imu(ctx, imu); + if (pages) { + for (i = 0; i < nr_pages; i++) + unpin_user_folio(page_folio(pages[i]), 1); + } + io_cache_free(&ctx->node_cache, node); + node = ERR_PTR(ret); + } kvfree(pages); - return ret; -} - -static int io_buffers_map_alloc(struct io_ring_ctx *ctx, unsigned int nr_args) -{ - ctx->user_bufs = kcalloc(nr_args, sizeof(*ctx->user_bufs), GFP_KERNEL); - return ctx->user_bufs ? 
0 : -ENOMEM; + return node; } int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, unsigned int nr_args, u64 __user *tags) { struct page *last_hpage = NULL; - struct io_rsrc_data *data; + struct io_rsrc_data data; struct iovec fast_iov, *iov = &fast_iov; const struct iovec __user *uvec; int i, ret; BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16)); - if (ctx->user_bufs) + if (ctx->buf_table.nr) return -EBUSY; if (!nr_args || nr_args > IORING_MAX_REG_BUFFERS) return -EINVAL; - ret = io_rsrc_data_alloc(ctx, IORING_RSRC_BUFFER, tags, nr_args, &data); + ret = io_rsrc_data_alloc(&data, nr_args); if (ret) return ret; - ret = io_buffers_map_alloc(ctx, nr_args); - if (ret) { - io_rsrc_data_free(data); - return ret; - } if (!arg) memset(iov, 0, sizeof(*iov)); - for (i = 0; i < nr_args; i++, ctx->nr_user_bufs++) { + for (i = 0; i < nr_args; i++) { + struct io_rsrc_node *node; + u64 tag = 0; + if (arg) { uvec = (struct iovec __user *) arg; iov = iovec_from_user(uvec, 1, 1, &fast_iov, ctx->compat); @@ -1066,142 +901,366 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, arg += sizeof(struct iovec); } - if (!iov->iov_base && *io_get_tag_slot(data, i)) { - ret = -EINVAL; - break; + if (tags) { + if (copy_from_user(&tag, &tags[i], sizeof(tag))) { + ret = -EFAULT; + break; + } } - ret = io_sqe_buffer_register(ctx, iov, &ctx->user_bufs[i], - &last_hpage); - if (ret) + node = io_sqe_buffer_register(ctx, iov, &last_hpage); + if (IS_ERR(node)) { + ret = PTR_ERR(node); break; + } + if (tag) { + if (!node) { + ret = -EINVAL; + break; + } + node->tag = tag; + } + data.nodes[i] = node; } - WARN_ON_ONCE(ctx->buf_data); + ctx->buf_table = data; + if (ret) { + io_clear_table_tags(&ctx->buf_table); + io_sqe_buffers_unregister(ctx); + } + return ret; +} - ctx->buf_data = data; - if (ret) - __io_sqe_buffers_unregister(ctx); +int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq, + void (*release)(void *), unsigned int index, + unsigned int issue_flags) +{ + struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx; + struct io_rsrc_data *data = &ctx->buf_table; + struct req_iterator rq_iter; + struct io_mapped_ubuf *imu; + struct io_rsrc_node *node; + struct bio_vec bv, *bvec; + u16 nr_bvecs; + int ret = 0; + + io_ring_submit_lock(ctx, issue_flags); + if (index >= data->nr) { + ret = -EINVAL; + goto unlock; + } + index = array_index_nospec(index, data->nr); + + if (data->nodes[index]) { + ret = -EBUSY; + goto unlock; + } + + node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER); + if (!node) { + ret = -ENOMEM; + goto unlock; + } + + nr_bvecs = blk_rq_nr_phys_segments(rq); + imu = io_alloc_imu(ctx, nr_bvecs); + if (!imu) { + kfree(node); + ret = -ENOMEM; + goto unlock; + } + + imu->ubuf = 0; + imu->len = blk_rq_bytes(rq); + imu->acct_pages = 0; + imu->folio_shift = PAGE_SHIFT; + imu->nr_bvecs = nr_bvecs; + refcount_set(&imu->refs, 1); + imu->release = release; + imu->priv = rq; + imu->is_kbuf = true; + imu->dir = 1 << rq_data_dir(rq); + + bvec = imu->bvec; + rq_for_each_bvec(bv, rq, rq_iter) + *bvec++ = bv; + + node->buf = imu; + data->nodes[index] = node; +unlock: + io_ring_submit_unlock(ctx, issue_flags); return ret; } +EXPORT_SYMBOL_GPL(io_buffer_register_bvec); -int io_import_fixed(int ddir, struct iov_iter *iter, - struct io_mapped_ubuf *imu, - u64 buf_addr, size_t len) +int io_buffer_unregister_bvec(struct io_uring_cmd *cmd, unsigned int index, + unsigned int issue_flags) +{ + struct io_ring_ctx *ctx = cmd_to_io_kiocb(cmd)->ctx; + struct io_rsrc_data *data = 
&ctx->buf_table; + struct io_rsrc_node *node; + int ret = 0; + + io_ring_submit_lock(ctx, issue_flags); + if (index >= data->nr) { + ret = -EINVAL; + goto unlock; + } + index = array_index_nospec(index, data->nr); + + node = data->nodes[index]; + if (!node) { + ret = -EINVAL; + goto unlock; + } + if (!node->buf->is_kbuf) { + ret = -EBUSY; + goto unlock; + } + + io_put_rsrc_node(ctx, node); + data->nodes[index] = NULL; +unlock: + io_ring_submit_unlock(ctx, issue_flags); + return ret; +} +EXPORT_SYMBOL_GPL(io_buffer_unregister_bvec); + +static int validate_fixed_range(u64 buf_addr, size_t len, + const struct io_mapped_ubuf *imu) { u64 buf_end; - size_t offset; - if (WARN_ON_ONCE(!imu)) - return -EFAULT; if (unlikely(check_add_overflow(buf_addr, (u64)len, &buf_end))) return -EFAULT; /* not inside the mapped region */ if (unlikely(buf_addr < imu->ubuf || buf_end > (imu->ubuf + imu->len))) return -EFAULT; + if (unlikely(len > MAX_RW_COUNT)) + return -EFAULT; + return 0; +} - /* - * Might not be a start of buffer, set size appropriately - * and advance us to the beginning. - */ - offset = buf_addr - imu->ubuf; - iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len); - - if (offset) { - /* - * Don't use iov_iter_advance() here, as it's really slow for - * using the latter parts of a big fixed buffer - it iterates - * over each segment manually. We can cheat a bit here, because - * we know that: - * - * 1) it's a BVEC iter, we set it up - * 2) all bvecs are the same in size, except potentially the - * first and last bvec - * - * So just find our index, and adjust the iterator afterwards. - * If the offset is within the first bvec (or the whole first - * bvec, just use iov_iter_advance(). This makes it easier - * since we can just skip the first segment, which may not - * be folio_size aligned. - */ - const struct bio_vec *bvec = imu->bvec; +static int io_import_kbuf(int ddir, struct iov_iter *iter, + struct io_mapped_ubuf *imu, size_t len, size_t offset) +{ + size_t count = len + offset; - if (offset < bvec->bv_len) { - iter->bvec = bvec; - iter->count -= offset; - iter->iov_offset = offset; - } else { - unsigned long seg_skip; + iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, count); + iov_iter_advance(iter, offset); - /* skip first vec */ - offset -= bvec->bv_len; - seg_skip = 1 + (offset >> imu->folio_shift); + if (count < imu->len) { + const struct bio_vec *bvec = iter->bvec; - iter->bvec = bvec + seg_skip; - iter->nr_segs -= seg_skip; - iter->count -= bvec->bv_len + offset; - iter->iov_offset = offset & ((1UL << imu->folio_shift) - 1); + while (len > bvec->bv_len) { + len -= bvec->bv_len; + bvec++; } + iter->nr_segs = 1 + bvec - iter->bvec; } + return 0; +} + +static int io_import_fixed(int ddir, struct iov_iter *iter, + struct io_mapped_ubuf *imu, + u64 buf_addr, size_t len) +{ + const struct bio_vec *bvec; + size_t folio_mask; + unsigned nr_segs; + size_t offset; + int ret; + ret = validate_fixed_range(buf_addr, len, imu); + if (unlikely(ret)) + return ret; + if (!(imu->dir & (1 << ddir))) + return -EFAULT; + + offset = buf_addr - imu->ubuf; + + if (imu->is_kbuf) + return io_import_kbuf(ddir, iter, imu, len, offset); + + /* + * Don't use iov_iter_advance() here, as it's really slow for + * using the latter parts of a big fixed buffer - it iterates + * over each segment manually. 
We can cheat a bit here for user + * registered nodes, because we know that: + * + * 1) it's a BVEC iter, we set it up + * 2) all bvecs are the same in size, except potentially the + * first and last bvec + */ + folio_mask = (1UL << imu->folio_shift) - 1; + bvec = imu->bvec; + if (offset >= bvec->bv_len) { + unsigned long seg_skip; + + /* skip first vec */ + offset -= bvec->bv_len; + seg_skip = 1 + (offset >> imu->folio_shift); + bvec += seg_skip; + offset &= folio_mask; + } + nr_segs = (offset + len + bvec->bv_offset + folio_mask) >> imu->folio_shift; + iov_iter_bvec(iter, ddir, bvec, nr_segs, len); + iter->iov_offset = offset; return 0; } -static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx) +inline struct io_rsrc_node *io_find_buf_node(struct io_kiocb *req, + unsigned issue_flags) { - struct io_mapped_ubuf **user_bufs; - struct io_rsrc_data *data; - int i, ret, nbufs; + struct io_ring_ctx *ctx = req->ctx; + struct io_rsrc_node *node; + + if (req->flags & REQ_F_BUF_NODE) + return req->buf_node; + req->flags |= REQ_F_BUF_NODE; + + io_ring_submit_lock(ctx, issue_flags); + node = io_rsrc_node_lookup(&ctx->buf_table, req->buf_index); + if (node) { + node->refs++; + req->buf_node = node; + io_ring_submit_unlock(ctx, issue_flags); + return node; + } + req->flags &= ~REQ_F_BUF_NODE; + io_ring_submit_unlock(ctx, issue_flags); + return NULL; +} + +int io_import_reg_buf(struct io_kiocb *req, struct iov_iter *iter, + u64 buf_addr, size_t len, int ddir, + unsigned issue_flags) +{ + struct io_rsrc_node *node; + + node = io_find_buf_node(req, issue_flags); + if (!node) + return -EFAULT; + return io_import_fixed(ddir, iter, node->buf, buf_addr, len); +} + +/* Lock two rings at once. The rings must be different! */ +static void lock_two_rings(struct io_ring_ctx *ctx1, struct io_ring_ctx *ctx2) +{ + if (ctx1 > ctx2) + swap(ctx1, ctx2); + mutex_lock(&ctx1->uring_lock); + mutex_lock_nested(&ctx2->uring_lock, SINGLE_DEPTH_NESTING); +} + +/* Both rings are locked by the caller. */ +static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx, + struct io_uring_clone_buffers *arg) +{ + struct io_rsrc_data data; + int i, ret, off, nr; + unsigned int nbufs; + + lockdep_assert_held(&ctx->uring_lock); + lockdep_assert_held(&src_ctx->uring_lock); /* - * Drop our own lock here. We'll setup the data we need and reference - * the source buffers, then re-grab, check, and assign at the end. + * Accounting state is shared between the two rings; that only works if + * both rings are accounted towards the same counters. 
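A worked example, not part of the patch: the fast-forward in io_import_fixed() above avoids walking each segment by exploiting that every middle bvec spans one full folio. Invented numbers below: 4K folios, a first bvec of 0xc00 bytes (buffer starts mid-folio), import offset 0x2d00 into the registered buffer.

#include <stdio.h>

/* Mirror of the seg_skip computation in io_import_fixed(), on plain numbers. */
int main(void)
{
        unsigned int folio_shift = 12;
        unsigned long folio_mask = (1UL << folio_shift) - 1;
        unsigned long first_bv_len = 0xc00;
        unsigned long offset = 0x2d00;
        unsigned long seg_skip = 0;

        if (offset >= first_bv_len) {
                offset -= first_bv_len;
                seg_skip = 1 + (offset >> folio_shift);
                offset &= folio_mask;
        }
        /* prints: start at bvec[3], 0x100 bytes in */
        printf("start at bvec[%lu], 0x%lx bytes in\n", seg_skip, offset);
        return 0;
}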
*/ - mutex_unlock(&ctx->uring_lock); + if (ctx->user != src_ctx->user || ctx->mm_account != src_ctx->mm_account) + return -EINVAL; - mutex_lock(&src_ctx->uring_lock); - ret = -ENXIO; - nbufs = src_ctx->nr_user_bufs; - if (!nbufs) - goto out_unlock; - ret = io_rsrc_data_alloc(ctx, IORING_RSRC_BUFFER, NULL, nbufs, &data); - if (ret) - goto out_unlock; + /* if offsets are given, must have nr specified too */ + if (!arg->nr && (arg->dst_off || arg->src_off)) + return -EINVAL; + /* not allowed unless REPLACE is set */ + if (ctx->buf_table.nr && !(arg->flags & IORING_REGISTER_DST_REPLACE)) + return -EBUSY; - ret = -ENOMEM; - user_bufs = kcalloc(nbufs, sizeof(*ctx->user_bufs), GFP_KERNEL); - if (!user_bufs) - goto out_free_data; + nbufs = src_ctx->buf_table.nr; + if (!arg->nr) + arg->nr = nbufs; + else if (arg->nr > nbufs) + return -EINVAL; + else if (arg->nr > IORING_MAX_REG_BUFFERS) + return -EINVAL; + if (check_add_overflow(arg->nr, arg->dst_off, &nbufs)) + return -EOVERFLOW; + if (nbufs > IORING_MAX_REG_BUFFERS) + return -EINVAL; - for (i = 0; i < nbufs; i++) { - struct io_mapped_ubuf *src = src_ctx->user_bufs[i]; + ret = io_rsrc_data_alloc(&data, max(nbufs, ctx->buf_table.nr)); + if (ret) + return ret; - if (src != &dummy_ubuf) - refcount_inc(&src->refs); - user_bufs[i] = src; + /* Fill entries in data from dst that won't overlap with src */ + for (i = 0; i < min(arg->dst_off, ctx->buf_table.nr); i++) { + struct io_rsrc_node *src_node = ctx->buf_table.nodes[i]; + + if (src_node) { + data.nodes[i] = src_node; + src_node->refs++; + } } - /* Have a ref on the bufs now, drop src lock and re-grab our own lock */ - mutex_unlock(&src_ctx->uring_lock); - mutex_lock(&ctx->uring_lock); - if (!ctx->user_bufs) { - ctx->user_bufs = user_bufs; - ctx->buf_data = data; - ctx->nr_user_bufs = nbufs; - return 0; + ret = -ENXIO; + nbufs = src_ctx->buf_table.nr; + if (!nbufs) + goto out_free; + ret = -EINVAL; + if (!arg->nr) + arg->nr = nbufs; + else if (arg->nr > nbufs) + goto out_free; + ret = -EOVERFLOW; + if (check_add_overflow(arg->nr, arg->src_off, &off)) + goto out_free; + if (off > nbufs) + goto out_free; + + off = arg->dst_off; + i = arg->src_off; + nr = arg->nr; + while (nr--) { + struct io_rsrc_node *dst_node, *src_node; + + src_node = io_rsrc_node_lookup(&src_ctx->buf_table, i); + if (!src_node) { + dst_node = NULL; + } else { + dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER); + if (!dst_node) { + ret = -ENOMEM; + goto out_free; + } + + refcount_inc(&src_node->buf->refs); + dst_node->buf = src_node->buf; + } + data.nodes[off++] = dst_node; + i++; } - /* someone raced setting up buffers, dump ours */ - for (i = 0; i < nbufs; i++) - io_buffer_unmap(ctx, &user_bufs[i]); - io_rsrc_data_free(data); - kfree(user_bufs); - return -EBUSY; -out_free_data: - io_rsrc_data_free(data); -out_unlock: - mutex_unlock(&src_ctx->uring_lock); - mutex_lock(&ctx->uring_lock); + /* + * If asked for replace, put the old table. data->nodes[] holds both + * old and new nodes at this point. + */ + if (arg->flags & IORING_REGISTER_DST_REPLACE) + io_rsrc_data_free(ctx, &ctx->buf_table); + + /* + * ctx->buf_table must be empty now - either the contents are being + * replaced and we just freed the table, or the contents are being + * copied to a ring that does not have buffers yet (checked at function + * entry). 
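From userspace, the src_off/dst_off/nr handling above corresponds to an IORING_REGISTER_CLONE_BUFFERS call along the lines of the sketch below (not part of the patch). The layout of struct io_uring_clone_buffers is assumed from the members the kernel reads here (src_fd, flags, src_off, dst_off, nr, pad); check the installed uapi header before relying on it.

#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/io_uring.h>

/* Clone 8 registered buffers from src_ring_fd's slots 0..7 into
 * dst_ring_fd's slots 8..15, allowing existing destination slots to be
 * replaced. Hedged sketch; assumes a header new enough to define
 * IORING_REGISTER_CLONE_BUFFERS and IORING_REGISTER_DST_REPLACE.
 */
static int clone_bufs(int dst_ring_fd, int src_ring_fd)
{
        struct io_uring_clone_buffers arg;

        memset(&arg, 0, sizeof(arg));           /* pad must be zero */
        arg.src_fd = src_ring_fd;
        arg.flags = IORING_REGISTER_DST_REPLACE;
        arg.src_off = 0;
        arg.dst_off = 8;
        arg.nr = 8;

        return syscall(__NR_io_uring_register, dst_ring_fd,
                       IORING_REGISTER_CLONE_BUFFERS, &arg, 1);
}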
+ */ + WARN_ON_ONCE(ctx->buf_table.nr); + ctx->buf_table = data; + return 0; + +out_free: + io_rsrc_data_free(ctx, &data); return ret; } @@ -1215,16 +1274,17 @@ out_unlock: int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg) { struct io_uring_clone_buffers buf; + struct io_ring_ctx *src_ctx; bool registered_src; struct file *file; int ret; - if (ctx->user_bufs || ctx->nr_user_bufs) - return -EBUSY; if (copy_from_user(&buf, arg, sizeof(buf))) return -EFAULT; - if (buf.flags & ~IORING_REGISTER_SRC_REGISTERED) + if (buf.flags & ~(IORING_REGISTER_SRC_REGISTERED|IORING_REGISTER_DST_REPLACE)) return -EINVAL; + if (!(buf.flags & IORING_REGISTER_DST_REPLACE) && ctx->buf_table.nr) + return -EBUSY; if (memchr_inv(buf.pad, 0, sizeof(buf.pad))) return -EINVAL; @@ -1232,8 +1292,270 @@ int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg) file = io_uring_register_get_file(buf.src_fd, registered_src); if (IS_ERR(file)) return PTR_ERR(file); - ret = io_clone_buffers(ctx, file->private_data); - if (!registered_src) - fput(file); + + src_ctx = file->private_data; + if (src_ctx != ctx) { + mutex_unlock(&ctx->uring_lock); + lock_two_rings(ctx, src_ctx); + } + + ret = io_clone_buffers(ctx, src_ctx, &buf); + + if (src_ctx != ctx) + mutex_unlock(&src_ctx->uring_lock); + + fput(file); return ret; } + +void io_vec_free(struct iou_vec *iv) +{ + if (!iv->iovec) + return; + kfree(iv->iovec); + iv->iovec = NULL; + iv->nr = 0; +} + +int io_vec_realloc(struct iou_vec *iv, unsigned nr_entries) +{ + gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; + struct iovec *iov; + + iov = kmalloc_array(nr_entries, sizeof(iov[0]), gfp); + if (!iov) + return -ENOMEM; + + io_vec_free(iv); + iv->iovec = iov; + iv->nr = nr_entries; + return 0; +} + +static int io_vec_fill_bvec(int ddir, struct iov_iter *iter, + struct io_mapped_ubuf *imu, + struct iovec *iovec, unsigned nr_iovs, + struct iou_vec *vec) +{ + unsigned long folio_size = 1 << imu->folio_shift; + unsigned long folio_mask = folio_size - 1; + struct bio_vec *res_bvec = vec->bvec; + size_t total_len = 0; + unsigned bvec_idx = 0; + unsigned iov_idx; + + for (iov_idx = 0; iov_idx < nr_iovs; iov_idx++) { + size_t iov_len = iovec[iov_idx].iov_len; + u64 buf_addr = (u64)(uintptr_t)iovec[iov_idx].iov_base; + struct bio_vec *src_bvec; + size_t offset; + int ret; + + ret = validate_fixed_range(buf_addr, iov_len, imu); + if (unlikely(ret)) + return ret; + + if (unlikely(!iov_len)) + return -EFAULT; + if (unlikely(check_add_overflow(total_len, iov_len, &total_len))) + return -EOVERFLOW; + + offset = buf_addr - imu->ubuf; + /* + * Only the first bvec can have non zero bv_offset, account it + * here and work with full folios below. 
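Aside, not part of the patch: the fill loop just below emits one bvec per folio touched by each iovec, i.e. ceil((offset_in_folio + len) / folio_size) segments, which io_estimate_bvec_size() upper-bounds with (len >> shift) + 2 so the bvec array can be sized before the exact offsets are walked. With made-up numbers:

#include <stdio.h>

/* Exact per-iovec segment count vs. the estimate used for sizing. */
int main(void)
{
        unsigned int shift = 12;                /* 4K folios */
        unsigned long folio = 1UL << shift;
        unsigned long start_off = 0xf00;        /* offset inside the first folio */
        unsigned long len = 0x1200;

        unsigned long exact = (start_off + len + folio - 1) >> shift;
        unsigned long bound = (len >> shift) + 2;

        /* prints: exact segments: 3, estimated upper bound: 3 */
        printf("exact segments: %lu, estimated upper bound: %lu\n", exact, bound);
        return 0;
}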
+ */ + offset += imu->bvec[0].bv_offset; + + src_bvec = imu->bvec + (offset >> imu->folio_shift); + offset &= folio_mask; + + for (; iov_len; offset = 0, bvec_idx++, src_bvec++) { + size_t seg_size = min_t(size_t, iov_len, + folio_size - offset); + + bvec_set_page(&res_bvec[bvec_idx], + src_bvec->bv_page, seg_size, offset); + iov_len -= seg_size; + } + } + if (total_len > MAX_RW_COUNT) + return -EINVAL; + + iov_iter_bvec(iter, ddir, res_bvec, bvec_idx, total_len); + return 0; +} + +static int io_estimate_bvec_size(struct iovec *iov, unsigned nr_iovs, + struct io_mapped_ubuf *imu) +{ + unsigned shift = imu->folio_shift; + size_t max_segs = 0; + unsigned i; + + for (i = 0; i < nr_iovs; i++) + max_segs += (iov[i].iov_len >> shift) + 2; + return max_segs; +} + +static int io_vec_fill_kern_bvec(int ddir, struct iov_iter *iter, + struct io_mapped_ubuf *imu, + struct iovec *iovec, unsigned nr_iovs, + struct iou_vec *vec) +{ + const struct bio_vec *src_bvec = imu->bvec; + struct bio_vec *res_bvec = vec->bvec; + unsigned res_idx = 0; + size_t total_len = 0; + unsigned iov_idx; + + for (iov_idx = 0; iov_idx < nr_iovs; iov_idx++) { + size_t offset = (size_t)(uintptr_t)iovec[iov_idx].iov_base; + size_t iov_len = iovec[iov_idx].iov_len; + struct bvec_iter bi = { + .bi_size = offset + iov_len, + }; + struct bio_vec bv; + + bvec_iter_advance(src_bvec, &bi, offset); + for_each_mp_bvec(bv, src_bvec, bi, bi) + res_bvec[res_idx++] = bv; + total_len += iov_len; + } + iov_iter_bvec(iter, ddir, res_bvec, res_idx, total_len); + return 0; +} + +static int iov_kern_bvec_size(const struct iovec *iov, + const struct io_mapped_ubuf *imu, + unsigned int *nr_seg) +{ + size_t offset = (size_t)(uintptr_t)iov->iov_base; + const struct bio_vec *bvec = imu->bvec; + int start = 0, i = 0; + size_t off = 0; + int ret; + + ret = validate_fixed_range(offset, iov->iov_len, imu); + if (unlikely(ret)) + return ret; + + for (i = 0; off < offset + iov->iov_len && i < imu->nr_bvecs; + off += bvec[i].bv_len, i++) { + if (offset >= off && offset < off + bvec[i].bv_len) + start = i; + } + *nr_seg = i - start; + return 0; +} + +static int io_kern_bvec_size(struct iovec *iov, unsigned nr_iovs, + struct io_mapped_ubuf *imu, unsigned *nr_segs) +{ + unsigned max_segs = 0; + size_t total_len = 0; + unsigned i; + int ret; + + *nr_segs = 0; + for (i = 0; i < nr_iovs; i++) { + if (unlikely(!iov[i].iov_len)) + return -EFAULT; + if (unlikely(check_add_overflow(total_len, iov[i].iov_len, + &total_len))) + return -EOVERFLOW; + ret = iov_kern_bvec_size(&iov[i], imu, &max_segs); + if (unlikely(ret)) + return ret; + *nr_segs += max_segs; + } + if (total_len > MAX_RW_COUNT) + return -EINVAL; + return 0; +} + +int io_import_reg_vec(int ddir, struct iov_iter *iter, + struct io_kiocb *req, struct iou_vec *vec, + unsigned nr_iovs, unsigned issue_flags) +{ + struct io_rsrc_node *node; + struct io_mapped_ubuf *imu; + unsigned iovec_off; + struct iovec *iov; + unsigned nr_segs; + + node = io_find_buf_node(req, issue_flags); + if (!node) + return -EFAULT; + imu = node->buf; + if (!(imu->dir & (1 << ddir))) + return -EFAULT; + + iovec_off = vec->nr - nr_iovs; + iov = vec->iovec + iovec_off; + + if (imu->is_kbuf) { + int ret = io_kern_bvec_size(iov, nr_iovs, imu, &nr_segs); + + if (unlikely(ret)) + return ret; + } else { + nr_segs = io_estimate_bvec_size(iov, nr_iovs, imu); + } + + if (sizeof(struct bio_vec) > sizeof(struct iovec)) { + size_t bvec_bytes; + + bvec_bytes = nr_segs * sizeof(struct bio_vec); + nr_segs = (bvec_bytes + sizeof(*iov) - 1) / sizeof(*iov); 
+ nr_segs += nr_iovs; + } + + if (nr_segs > vec->nr) { + struct iou_vec tmp_vec = {}; + int ret; + + ret = io_vec_realloc(&tmp_vec, nr_segs); + if (ret) + return ret; + + iovec_off = tmp_vec.nr - nr_iovs; + memcpy(tmp_vec.iovec + iovec_off, iov, sizeof(*iov) * nr_iovs); + io_vec_free(vec); + + *vec = tmp_vec; + iov = vec->iovec + iovec_off; + req->flags |= REQ_F_NEED_CLEANUP; + } + + if (imu->is_kbuf) + return io_vec_fill_kern_bvec(ddir, iter, imu, iov, nr_iovs, vec); + + return io_vec_fill_bvec(ddir, iter, imu, iov, nr_iovs, vec); +} + +int io_prep_reg_iovec(struct io_kiocb *req, struct iou_vec *iv, + const struct iovec __user *uvec, size_t uvec_segs) +{ + struct iovec *iov; + int iovec_off, ret; + void *res; + + if (uvec_segs > iv->nr) { + ret = io_vec_realloc(iv, uvec_segs); + if (ret) + return ret; + req->flags |= REQ_F_NEED_CLEANUP; + } + + /* pad iovec to the right */ + iovec_off = iv->nr - uvec_segs; + iov = iv->iovec + iovec_off; + res = iovec_from_user(uvec, uvec_segs, uvec_segs, iov, + io_is_compat(req->ctx)); + if (IS_ERR(res)) + return PTR_ERR(res); + + req->flags |= REQ_F_IMPORT_BUFFER; + return 0; +} diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h index 8ed588036210..25e7e998dcfd 100644 --- a/io_uring/rsrc.h +++ b/io_uring/rsrc.h @@ -2,42 +2,30 @@ #ifndef IOU_RSRC_H #define IOU_RSRC_H -#define IO_NODE_ALLOC_CACHE_MAX 32 +#include <linux/io_uring_types.h> +#include <linux/lockdep.h> -#define IO_RSRC_TAG_TABLE_SHIFT (PAGE_SHIFT - 3) -#define IO_RSRC_TAG_TABLE_MAX (1U << IO_RSRC_TAG_TABLE_SHIFT) -#define IO_RSRC_TAG_TABLE_MASK (IO_RSRC_TAG_TABLE_MAX - 1) +#define IO_VEC_CACHE_SOFT_CAP 256 enum { IORING_RSRC_FILE = 0, IORING_RSRC_BUFFER = 1, }; -struct io_rsrc_put { +struct io_rsrc_node { + unsigned char type; + int refs; + u64 tag; union { - void *rsrc; - struct file *file; + unsigned long file_ptr; struct io_mapped_ubuf *buf; }; }; -struct io_rsrc_data { - struct io_ring_ctx *ctx; - - u64 **tags; - unsigned int nr; - u16 rsrc_type; - bool quiesce; -}; - -struct io_rsrc_node { - struct io_ring_ctx *ctx; - int refs; - bool empty; - u16 type; - struct list_head node; - struct io_rsrc_put item; +enum { + IO_IMU_DEST = 1 << ITER_DEST, + IO_IMU_SOURCE = 1 << ITER_SOURCE, }; struct io_mapped_ubuf { @@ -47,6 +35,10 @@ struct io_mapped_ubuf { unsigned int folio_shift; refcount_t refs; unsigned long acct_pages; + void (*release)(void *); + void *priv; + bool is_kbuf; + u8 dir; struct bio_vec bvec[] __counted_by(nr_bvecs); }; @@ -56,23 +48,32 @@ struct io_imu_folio_data { /* For non-head/tail folios, has to be fully included */ unsigned int nr_pages_mid; unsigned int folio_shift; + unsigned int nr_folios; + unsigned long first_folio_page_idx; }; -void io_rsrc_node_ref_zero(struct io_rsrc_node *node); -void io_rsrc_node_destroy(struct io_ring_ctx *ctx, struct io_rsrc_node *ref_node); -struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx); -int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, void *rsrc); - -int io_import_fixed(int ddir, struct iov_iter *iter, - struct io_mapped_ubuf *imu, - u64 buf_addr, size_t len); +bool io_rsrc_cache_init(struct io_ring_ctx *ctx); +void io_rsrc_cache_free(struct io_ring_ctx *ctx); +struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx, int type); +void io_free_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node); +void io_rsrc_data_free(struct io_ring_ctx *ctx, struct io_rsrc_data *data); +int io_rsrc_data_alloc(struct io_rsrc_data *data, unsigned nr); + +struct io_rsrc_node *io_find_buf_node(struct io_kiocb 
*req, + unsigned issue_flags); +int io_import_reg_buf(struct io_kiocb *req, struct iov_iter *iter, + u64 buf_addr, size_t len, int ddir, + unsigned issue_flags); +int io_import_reg_vec(int ddir, struct iov_iter *iter, + struct io_kiocb *req, struct iou_vec *vec, + unsigned nr_iovs, unsigned issue_flags); +int io_prep_reg_iovec(struct io_kiocb *req, struct iou_vec *iv, + const struct iovec __user *uvec, size_t uvec_segs); int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg); -void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx); int io_sqe_buffers_unregister(struct io_ring_ctx *ctx); int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg, unsigned int nr_args, u64 __user *tags); -void __io_sqe_files_unregister(struct io_ring_ctx *ctx); int io_sqe_files_unregister(struct io_ring_ctx *ctx); int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args, u64 __user *tags); @@ -83,52 +84,36 @@ int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, unsigned size, unsigned type); int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, unsigned int size, unsigned int type); +int io_validate_user_buf_range(u64 uaddr, u64 ulen); -static inline void io_put_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node) -{ - lockdep_assert_held(&ctx->uring_lock); - - if (node && !--node->refs) - io_rsrc_node_ref_zero(node); -} +bool io_check_coalesce_buffer(struct page **page_array, int nr_pages, + struct io_imu_folio_data *data); -static inline void io_charge_rsrc_node(struct io_ring_ctx *ctx, - struct io_rsrc_node *node) +static inline struct io_rsrc_node *io_rsrc_node_lookup(struct io_rsrc_data *data, + int index) { - node->refs++; + if (index < data->nr) + return data->nodes[array_index_nospec(index, data->nr)]; + return NULL; } -static inline void __io_req_set_rsrc_node(struct io_kiocb *req, - struct io_ring_ctx *ctx) +static inline void io_put_rsrc_node(struct io_ring_ctx *ctx, struct io_rsrc_node *node) { lockdep_assert_held(&ctx->uring_lock); - req->rsrc_node = ctx->rsrc_node; - io_charge_rsrc_node(ctx, ctx->rsrc_node); -} - -static inline void io_req_set_rsrc_node(struct io_kiocb *req, - struct io_ring_ctx *ctx, - unsigned int issue_flags) -{ - if (!req->rsrc_node) { - io_ring_submit_lock(ctx, issue_flags); - __io_req_set_rsrc_node(req, ctx); - io_ring_submit_unlock(ctx, issue_flags); - } + if (!--node->refs) + io_free_rsrc_node(ctx, node); } -static inline u64 *io_get_tag_slot(struct io_rsrc_data *data, unsigned int idx) +static inline bool io_reset_rsrc_node(struct io_ring_ctx *ctx, + struct io_rsrc_data *data, int index) { - unsigned int off = idx & IO_RSRC_TAG_TABLE_MASK; - unsigned int table_idx = idx >> IO_RSRC_TAG_TABLE_SHIFT; + struct io_rsrc_node *node = data->nodes[index]; - return &data->tags[table_idx][off]; -} - -static inline int io_rsrc_init(struct io_ring_ctx *ctx) -{ - ctx->rsrc_node = io_rsrc_node_alloc(ctx); - return ctx->rsrc_node ? 
0 : -ENOMEM; + if (!node) + return false; + io_put_rsrc_node(ctx, node); + data->nodes[index] = NULL; + return true; } int io_files_update(struct io_kiocb *req, unsigned int issue_flags); @@ -142,4 +127,21 @@ static inline void __io_unaccount_mem(struct user_struct *user, atomic_long_sub(nr_pages, &user->locked_vm); } +void io_vec_free(struct iou_vec *iv); +int io_vec_realloc(struct iou_vec *iv, unsigned nr_entries); + +static inline void io_vec_reset_iovec(struct iou_vec *iv, + struct iovec *iovec, unsigned nr) +{ + io_vec_free(iv); + iv->iovec = iovec; + iv->nr = nr; +} + +static inline void io_alloc_cache_vec_kasan(struct iou_vec *iv) +{ + if (IS_ENABLED(CONFIG_KASAN)) + io_vec_free(iv); +} + #endif diff --git a/io_uring/rw.c b/io_uring/rw.c index 354c4e175654..710d8cd53ebb 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -23,6 +23,9 @@ #include "poll.h" #include "rw.h" +static void io_complete_rw(struct kiocb *kiocb, long res); +static void io_complete_rw_iopoll(struct kiocb *kiocb, long res); + struct io_rw { /* NOTE: kiocb has the file as the first member, so don't do it here */ struct kiocb kiocb; @@ -46,24 +49,16 @@ static bool io_file_supports_nowait(struct io_kiocb *req, __poll_t mask) return false; } -#ifdef CONFIG_COMPAT static int io_iov_compat_buffer_select_prep(struct io_rw *rw) { - struct compat_iovec __user *uiov; - compat_ssize_t clen; + struct compat_iovec __user *uiov = u64_to_user_ptr(rw->addr); + struct compat_iovec iov; - uiov = u64_to_user_ptr(rw->addr); - if (!access_ok(uiov, sizeof(*uiov))) - return -EFAULT; - if (__get_user(clen, &uiov->iov_len)) + if (copy_from_user(&iov, uiov, sizeof(iov))) return -EFAULT; - if (clen < 0) - return -EINVAL; - - rw->len = clen; + rw->len = iov.iov_len; return 0; } -#endif static int io_iov_buffer_select_prep(struct io_kiocb *req) { @@ -74,10 +69,8 @@ static int io_iov_buffer_select_prep(struct io_kiocb *req) if (rw->len != 1) return -EINVAL; -#ifdef CONFIG_COMPAT - if (req->ctx->compat) + if (io_is_compat(req->ctx)) return io_iov_compat_buffer_select_prep(rw); -#endif uiov = u64_to_user_ptr(rw->addr); if (copy_from_user(&iov, uiov, sizeof(*uiov))) @@ -86,59 +79,62 @@ static int io_iov_buffer_select_prep(struct io_kiocb *req) return 0; } -static int __io_import_iovec(int ddir, struct io_kiocb *req, - struct io_async_rw *io, - unsigned int issue_flags) +static int io_import_vec(int ddir, struct io_kiocb *req, + struct io_async_rw *io, + const struct iovec __user *uvec, + size_t uvec_segs) { - const struct io_issue_def *def = &io_issue_defs[req->opcode]; - struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); + int ret, nr_segs; struct iovec *iov; - void __user *buf; - int nr_segs, ret; - size_t sqe_len; - - buf = u64_to_user_ptr(rw->addr); - sqe_len = rw->len; - - if (!def->vectored || req->flags & REQ_F_BUFFER_SELECT) { - if (io_do_buffer_select(req)) { - buf = io_buffer_select(req, &sqe_len, issue_flags); - if (!buf) - return -ENOBUFS; - rw->addr = (unsigned long) buf; - rw->len = sqe_len; - } - return import_ubuf(ddir, buf, sqe_len, &io->iter); - } - - if (io->free_iovec) { - nr_segs = io->free_iov_nr; - iov = io->free_iovec; + if (io->vec.iovec) { + nr_segs = io->vec.nr; + iov = io->vec.iovec; } else { - iov = &io->fast_iov; nr_segs = 1; + iov = &io->fast_iov; } - ret = __import_iovec(ddir, buf, sqe_len, nr_segs, &iov, &io->iter, - req->ctx->compat); + + ret = __import_iovec(ddir, uvec, uvec_segs, nr_segs, &iov, &io->iter, + io_is_compat(req->ctx)); if (unlikely(ret < 0)) return ret; if (iov) { req->flags |= 
REQ_F_NEED_CLEANUP; - io->free_iov_nr = io->iter.nr_segs; - kfree(io->free_iovec); - io->free_iovec = iov; + io_vec_reset_iovec(&io->vec, iov, io->iter.nr_segs); } return 0; } -static inline int io_import_iovec(int rw, struct io_kiocb *req, - struct io_async_rw *io, - unsigned int issue_flags) +static int __io_import_rw_buffer(int ddir, struct io_kiocb *req, + struct io_async_rw *io, + unsigned int issue_flags) +{ + const struct io_issue_def *def = &io_issue_defs[req->opcode]; + struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); + void __user *buf = u64_to_user_ptr(rw->addr); + size_t sqe_len = rw->len; + + if (def->vectored && !(req->flags & REQ_F_BUFFER_SELECT)) + return io_import_vec(ddir, req, io, buf, sqe_len); + + if (io_do_buffer_select(req)) { + buf = io_buffer_select(req, &sqe_len, io->buf_group, issue_flags); + if (!buf) + return -ENOBUFS; + rw->addr = (unsigned long) buf; + rw->len = sqe_len; + } + return import_ubuf(ddir, buf, sqe_len, &io->iter); +} + +static inline int io_import_rw_buffer(int rw, struct io_kiocb *req, + struct io_async_rw *io, + unsigned int issue_flags) { int ret; - ret = __io_import_iovec(rw, req, io, issue_flags); + ret = __io_import_rw_buffer(rw, req, io, issue_flags); if (unlikely(ret < 0)) return ret; @@ -146,28 +142,18 @@ static inline int io_import_iovec(int rw, struct io_kiocb *req, return 0; } -static void io_rw_iovec_free(struct io_async_rw *rw) -{ - if (rw->free_iovec) { - kfree(rw->free_iovec); - rw->free_iov_nr = 0; - rw->free_iovec = NULL; - } -} - static void io_rw_recycle(struct io_kiocb *req, unsigned int issue_flags) { struct io_async_rw *rw = req->async_data; - struct iovec *iov; - if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) { - io_rw_iovec_free(rw); + if (unlikely(issue_flags & IO_URING_F_UNLOCKED)) return; - } - iov = rw->free_iovec; + + io_alloc_cache_vec_kasan(&rw->vec); + if (rw->vec.nr > IO_VEC_CACHE_SOFT_CAP) + io_vec_free(&rw->vec); + if (io_alloc_cache_put(&req->ctx->rw_cache, rw)) { - if (iov) - kasan_mempool_poison_object(iov); req->async_data = NULL; req->flags &= ~REQ_F_ASYNC_DATA; } @@ -202,7 +188,7 @@ static void io_req_rw_cleanup(struct io_kiocb *req, unsigned int issue_flags) * mean that the underlying data can be gone at any time. But that * should be fixed seperately, and then this check could be killed. 
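For orientation, an aside that is not part of the patch: the registered-buffer read path this file services is driven from userspace roughly as in the liburing sketch below (liburing API assumed available; error handling omitted).

#include <liburing.h>
#include <stdlib.h>
#include <sys/uio.h>
#include <fcntl.h>

/* Register one buffer, then read into it with IORING_OP_READ_FIXED using
 * buf_index to name the slot. Illustrative only.
 */
int main(void)
{
        struct io_uring ring;
        struct io_uring_sqe *sqe;
        struct io_uring_cqe *cqe;
        struct iovec iov = { .iov_base = malloc(4096), .iov_len = 4096 };
        int fd = open("/etc/hostname", O_RDONLY);

        io_uring_queue_init(8, &ring, 0);
        io_uring_register_buffers(&ring, &iov, 1);

        sqe = io_uring_get_sqe(&ring);
        io_uring_prep_read_fixed(sqe, fd, iov.iov_base, 4096, 0, /*buf_index=*/0);

        io_uring_submit(&ring);
        io_uring_wait_cqe(&ring, &cqe);
        io_uring_cqe_seen(&ring, cqe);

        io_uring_queue_exit(&ring);
        return 0;
}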
*/ - if (!(req->flags & REQ_F_REFCOUNT)) { + if (!(req->flags & (REQ_F_REISSUE | REQ_F_REFCOUNT))) { req->flags &= ~REQ_F_NEED_CLEANUP; io_rw_recycle(req, issue_flags); } @@ -213,60 +199,73 @@ static int io_rw_alloc_async(struct io_kiocb *req) struct io_ring_ctx *ctx = req->ctx; struct io_async_rw *rw; - rw = io_alloc_cache_get(&ctx->rw_cache); - if (rw) { - if (rw->free_iovec) { - kasan_mempool_unpoison_object(rw->free_iovec, - rw->free_iov_nr * sizeof(struct iovec)); - req->flags |= REQ_F_NEED_CLEANUP; - } - req->flags |= REQ_F_ASYNC_DATA; - req->async_data = rw; - goto done; - } + rw = io_uring_alloc_async_data(&ctx->rw_cache, req); + if (!rw) + return -ENOMEM; + if (rw->vec.iovec) + req->flags |= REQ_F_NEED_CLEANUP; + rw->bytes_done = 0; + return 0; +} - if (!io_alloc_async_data(req)) { - rw = req->async_data; - rw->free_iovec = NULL; - rw->free_iov_nr = 0; -done: - rw->bytes_done = 0; - return 0; - } +static inline void io_meta_save_state(struct io_async_rw *io) +{ + io->meta_state.seed = io->meta.seed; + iov_iter_save_state(&io->meta.iter, &io->meta_state.iter_meta); +} - return -ENOMEM; +static inline void io_meta_restore(struct io_async_rw *io, struct kiocb *kiocb) +{ + if (kiocb->ki_flags & IOCB_HAS_METADATA) { + io->meta.seed = io->meta_state.seed; + iov_iter_restore(&io->meta.iter, &io->meta_state.iter_meta); + } } -static int io_prep_rw_setup(struct io_kiocb *req, int ddir, bool do_import) +static int io_prep_rw_pi(struct io_kiocb *req, struct io_rw *rw, int ddir, + u64 attr_ptr, u64 attr_type_mask) { - struct io_async_rw *rw; + struct io_uring_attr_pi pi_attr; + struct io_async_rw *io; int ret; - if (io_rw_alloc_async(req)) - return -ENOMEM; + if (copy_from_user(&pi_attr, u64_to_user_ptr(attr_ptr), + sizeof(pi_attr))) + return -EFAULT; - if (!do_import || io_do_buffer_select(req)) - return 0; + if (pi_attr.rsvd) + return -EINVAL; - rw = req->async_data; - ret = io_import_iovec(ddir, req, rw, 0); + io = req->async_data; + io->meta.flags = pi_attr.flags; + io->meta.app_tag = pi_attr.app_tag; + io->meta.seed = pi_attr.seed; + ret = import_ubuf(ddir, u64_to_user_ptr(pi_attr.addr), + pi_attr.len, &io->meta.iter); if (unlikely(ret < 0)) return ret; - - iov_iter_save_state(&rw->iter, &rw->iter_state); - return 0; + req->flags |= REQ_F_HAS_METADATA; + io_meta_save_state(io); + return ret; } -static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, - int ddir, bool do_import) +static int __io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, + int ddir) { struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); + struct io_async_rw *io; unsigned ioprio; + u64 attr_type_mask; int ret; + if (io_rw_alloc_async(req)) + return -ENOMEM; + io = req->async_data; + rw->kiocb.ki_pos = READ_ONCE(sqe->off); /* used for fixed read/write too - just read unconditionally */ req->buf_index = READ_ONCE(sqe->buf_index); + io->buf_group = req->buf_index; ioprio = READ_ONCE(sqe->ioprio); if (ioprio) { @@ -279,33 +278,71 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, rw->kiocb.ki_ioprio = get_current_ioprio(); } rw->kiocb.dio_complete = NULL; + rw->kiocb.ki_flags = 0; + rw->kiocb.ki_write_stream = READ_ONCE(sqe->write_stream); + + if (req->ctx->flags & IORING_SETUP_IOPOLL) + rw->kiocb.ki_complete = io_complete_rw_iopoll; + else + rw->kiocb.ki_complete = io_complete_rw; rw->addr = READ_ONCE(sqe->addr); rw->len = READ_ONCE(sqe->len); rw->flags = READ_ONCE(sqe->rw_flags); - return io_prep_rw_setup(req, ddir, do_import); + + attr_type_mask = 
READ_ONCE(sqe->attr_type_mask); + if (attr_type_mask) { + u64 attr_ptr; + + /* only PI attribute is supported currently */ + if (attr_type_mask != IORING_RW_ATTR_FLAG_PI) + return -EINVAL; + + attr_ptr = READ_ONCE(sqe->attr_ptr); + return io_prep_rw_pi(req, rw, ddir, attr_ptr, attr_type_mask); + } + return 0; +} + +static int io_rw_do_import(struct io_kiocb *req, int ddir) +{ + if (io_do_buffer_select(req)) + return 0; + + return io_import_rw_buffer(ddir, req, req->async_data, 0); +} + +static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, + int ddir) +{ + int ret; + + ret = __io_prep_rw(req, sqe, ddir); + if (unlikely(ret)) + return ret; + + return io_rw_do_import(req, ddir); } int io_prep_read(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - return io_prep_rw(req, sqe, ITER_DEST, true); + return io_prep_rw(req, sqe, ITER_DEST); } int io_prep_write(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - return io_prep_rw(req, sqe, ITER_SOURCE, true); + return io_prep_rw(req, sqe, ITER_SOURCE); } static int io_prep_rwv(struct io_kiocb *req, const struct io_uring_sqe *sqe, int ddir) { - const bool do_import = !(req->flags & REQ_F_BUFFER_SELECT); int ret; - ret = io_prep_rw(req, sqe, ddir, do_import); + ret = io_prep_rw(req, sqe, ddir); if (unlikely(ret)) return ret; - if (do_import) + if (!(req->flags & REQ_F_BUFFER_SELECT)) return 0; /* @@ -325,39 +362,77 @@ int io_prep_writev(struct io_kiocb *req, const struct io_uring_sqe *sqe) return io_prep_rwv(req, sqe, ITER_SOURCE); } -static int io_prep_rw_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe, +static int io_init_rw_fixed(struct io_kiocb *req, unsigned int issue_flags, int ddir) { struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); - struct io_ring_ctx *ctx = req->ctx; - struct io_async_rw *io; - u16 index; + struct io_async_rw *io = req->async_data; int ret; - ret = io_prep_rw(req, sqe, ddir, false); - if (unlikely(ret)) - return ret; - - if (unlikely(req->buf_index >= ctx->nr_user_bufs)) - return -EFAULT; - index = array_index_nospec(req->buf_index, ctx->nr_user_bufs); - req->imu = ctx->user_bufs[index]; - io_req_set_rsrc_node(req, ctx, 0); + if (io->bytes_done) + return 0; - io = req->async_data; - ret = io_import_fixed(ddir, &io->iter, req->imu, rw->addr, rw->len); + ret = io_import_reg_buf(req, &io->iter, rw->addr, rw->len, ddir, + issue_flags); iov_iter_save_state(&io->iter, &io->iter_state); return ret; } int io_prep_read_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - return io_prep_rw_fixed(req, sqe, ITER_DEST); + return __io_prep_rw(req, sqe, ITER_DEST); } int io_prep_write_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - return io_prep_rw_fixed(req, sqe, ITER_SOURCE); + return __io_prep_rw(req, sqe, ITER_SOURCE); +} + +static int io_rw_import_reg_vec(struct io_kiocb *req, + struct io_async_rw *io, + int ddir, unsigned int issue_flags) +{ + struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); + unsigned uvec_segs = rw->len; + int ret; + + ret = io_import_reg_vec(ddir, &io->iter, req, &io->vec, + uvec_segs, issue_flags); + if (unlikely(ret)) + return ret; + iov_iter_save_state(&io->iter, &io->iter_state); + req->flags &= ~REQ_F_IMPORT_BUFFER; + return 0; +} + +static int io_rw_prep_reg_vec(struct io_kiocb *req) +{ + struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); + struct io_async_rw *io = req->async_data; + const struct iovec __user *uvec; + + uvec = u64_to_user_ptr(rw->addr); + return io_prep_reg_iovec(req, &io->vec, uvec, rw->len); +} + +int 
io_prep_readv_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + int ret; + + ret = __io_prep_rw(req, sqe, ITER_DEST); + if (unlikely(ret)) + return ret; + return io_rw_prep_reg_vec(req); +} + +int io_prep_writev_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + int ret; + + ret = __io_prep_rw(req, sqe, ITER_SOURCE); + if (unlikely(ret)) + return ret; + return io_rw_prep_reg_vec(req); } /* @@ -373,7 +448,7 @@ int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (!(req->flags & REQ_F_BUFFER_SELECT)) return -EINVAL; - ret = io_prep_rw(req, sqe, ITER_DEST, false); + ret = __io_prep_rw(req, sqe, ITER_DEST); if (unlikely(ret)) return ret; @@ -386,7 +461,8 @@ int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) void io_readv_writev_cleanup(struct io_kiocb *req) { - io_rw_iovec_free(req->async_data); + lockdep_assert_held(&req->ctx->uring_lock); + io_rw_recycle(req, 0); } static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req) @@ -406,17 +482,12 @@ static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req) return NULL; } -#ifdef CONFIG_BLOCK -static void io_resubmit_prep(struct io_kiocb *req) -{ - struct io_async_rw *io = req->async_data; - - iov_iter_restore(&io->iter, &io->iter_state); -} - static bool io_rw_should_reissue(struct io_kiocb *req) { +#ifdef CONFIG_BLOCK + struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); umode_t mode = file_inode(req->file)->i_mode; + struct io_async_rw *io = req->async_data; struct io_ring_ctx *ctx = req->ctx; if (!S_ISBLK(mode) && !S_ISREG(mode)) @@ -431,23 +502,14 @@ static bool io_rw_should_reissue(struct io_kiocb *req) */ if (percpu_ref_is_dying(&ctx->refs)) return false; - /* - * Play it safe and assume not safe to re-import and reissue if we're - * not in the original thread group (or in task context). - */ - if (!same_thread_group(req->task, current) || !in_task()) - return false; + + io_meta_restore(io, &rw->kiocb); + iov_iter_restore(&io->iter, &io->iter_state); return true; -} #else -static void io_resubmit_prep(struct io_kiocb *req) -{ -} -static bool io_rw_should_reissue(struct io_kiocb *req) -{ return false; -} #endif +} static void io_req_end_write(struct io_kiocb *req) { @@ -474,22 +536,16 @@ static void io_req_io_end(struct io_kiocb *req) } } -static bool __io_complete_rw_common(struct io_kiocb *req, long res) +static void __io_complete_rw_common(struct io_kiocb *req, long res) { - if (unlikely(res != req->cqe.res)) { - if (res == -EAGAIN && io_rw_should_reissue(req)) { - /* - * Reissue will start accounting again, finish the - * current cycle. 
- */ - io_req_io_end(req); - req->flags |= REQ_F_REISSUE | REQ_F_BL_NO_RECYCLE; - return true; - } + if (res == req->cqe.res) + return; + if (res == -EAGAIN && io_rw_should_reissue(req)) { + req->flags |= REQ_F_REISSUE | REQ_F_BL_NO_RECYCLE; + } else { req_set_fail(req); req->cqe.res = res; } - return false; } static inline int io_fixup_rw_res(struct io_kiocb *req, long res) @@ -506,7 +562,7 @@ static inline int io_fixup_rw_res(struct io_kiocb *req, long res) return res; } -void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts) +void io_req_rw_complete(struct io_kiocb *req, io_tw_token_t tw) { struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); struct kiocb *kiocb = &rw->kiocb; @@ -523,7 +579,7 @@ void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts) req->cqe.flags |= io_put_kbuf(req, req->cqe.res, 0); io_req_rw_cleanup(req, 0); - io_req_task_complete(req, ts); + io_req_task_complete(req, tw); } static void io_complete_rw(struct kiocb *kiocb, long res) @@ -532,8 +588,7 @@ static void io_complete_rw(struct kiocb *kiocb, long res) struct io_kiocb *req = cmd_to_io_kiocb(rw); if (!kiocb->dio_complete || !(kiocb->ki_flags & IOCB_DIO_CALLER_COMP)) { - if (__io_complete_rw_common(req, res)) - return; + __io_complete_rw_common(req, res); io_req_set_res(req, io_fixup_rw_res(req, res), 0); } req->io_task_work.func = io_req_rw_complete; @@ -548,19 +603,20 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res) if (kiocb->ki_flags & IOCB_WRITE) io_req_end_write(req); if (unlikely(res != req->cqe.res)) { - if (res == -EAGAIN && io_rw_should_reissue(req)) { + if (res == -EAGAIN && io_rw_should_reissue(req)) req->flags |= REQ_F_REISSUE | REQ_F_BL_NO_RECYCLE; - return; - } - req->cqe.res = res; + else + req->cqe.res = res; } /* order with io_iopoll_complete() checking ->iopoll_completed */ smp_store_release(&req->iopoll_completed, 1); } -static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) +static inline void io_rw_done(struct io_kiocb *req, ssize_t ret) { + struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); + /* IO was queued async, completion will happen later */ if (ret == -EIOCBQUEUED) return; @@ -582,8 +638,10 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) } } - INDIRECT_CALL_2(kiocb->ki_complete, io_complete_rw_iopoll, - io_complete_rw, kiocb, ret); + if (req->ctx->flags & IORING_SETUP_IOPOLL) + io_complete_rw_iopoll(&rw->kiocb, ret); + else + io_complete_rw(&rw->kiocb, ret); } static int kiocb_done(struct io_kiocb *req, ssize_t ret, @@ -594,27 +652,20 @@ static int kiocb_done(struct io_kiocb *req, ssize_t ret, if (ret >= 0 && req->flags & REQ_F_CUR_POS) req->file->f_pos = rw->kiocb.ki_pos; - if (ret >= 0 && (rw->kiocb.ki_complete == io_complete_rw)) { - if (!__io_complete_rw_common(req, ret)) { - /* - * Safe to call io_end from here as we're inline - * from the submission path. - */ - io_req_io_end(req); - io_req_set_res(req, final_ret, - io_put_kbuf(req, ret, issue_flags)); - io_req_rw_cleanup(req, issue_flags); - return IOU_OK; - } + if (ret >= 0 && !(req->ctx->flags & IORING_SETUP_IOPOLL)) { + __io_complete_rw_common(req, ret); + /* + * Safe to call io_end from here as we're inline + * from the submission path. 
+ */ + io_req_io_end(req); + io_req_set_res(req, final_ret, io_put_kbuf(req, ret, issue_flags)); + io_req_rw_cleanup(req, issue_flags); + return IOU_COMPLETE; } else { - io_rw_done(&rw->kiocb, ret); + io_rw_done(req, ret); } - if (req->flags & REQ_F_REISSUE) { - req->flags &= ~REQ_F_REISSUE; - io_resubmit_prep(req); - return -EAGAIN; - } return IOU_ISSUE_SKIP_COMPLETE; } @@ -629,6 +680,7 @@ static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb) */ static ssize_t loop_rw_iter(int ddir, struct io_rw *rw, struct iov_iter *iter) { + struct io_kiocb *req = cmd_to_io_kiocb(rw); struct kiocb *kiocb = &rw->kiocb; struct file *file = kiocb->ki_filp; ssize_t ret = 0; @@ -644,6 +696,8 @@ static ssize_t loop_rw_iter(int ddir, struct io_rw *rw, struct iov_iter *iter) if ((kiocb->ki_flags & IOCB_NOWAIT) && !(kiocb->ki_filp->f_flags & O_NONBLOCK)) return -EAGAIN; + if ((req->flags & REQ_F_BUF_NODE) && req->buf_node->buf->is_kbuf) + return -EFAULT; ppos = io_kiocb_ppos(kiocb); @@ -737,8 +791,11 @@ static bool io_rw_should_retry(struct io_kiocb *req) struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); struct kiocb *kiocb = &rw->kiocb; - /* never retry for NOWAIT, we just complete with -EAGAIN */ - if (req->flags & REQ_F_NOWAIT) + /* + * Never retry for NOWAIT or a request with metadata, we just complete + * with -EAGAIN. + */ + if (req->flags & (REQ_F_NOWAIT | REQ_F_HAS_METADATA)) return false; /* Only for buffered IO */ @@ -813,15 +870,30 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type) if (ctx->flags & IORING_SETUP_IOPOLL) { if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll) return -EOPNOTSUPP; - kiocb->private = NULL; kiocb->ki_flags |= IOCB_HIPRI; - kiocb->ki_complete = io_complete_rw_iopoll; req->iopoll_completed = 0; + if (ctx->flags & IORING_SETUP_HYBRID_IOPOLL) { + /* make sure every req only blocks once*/ + req->flags &= ~REQ_F_IOPOLL_STATE; + req->iopoll_start = ktime_get_ns(); + } } else { if (kiocb->ki_flags & IOCB_HIPRI) return -EINVAL; - kiocb->ki_complete = io_complete_rw; + } + + if (req->flags & REQ_F_HAS_METADATA) { + struct io_async_rw *io = req->async_data; + + /* + * We have a union of meta fields with wpq used for buffered-io + * in io_async_rw, so fail it here. + */ + if (!(req->file->f_flags & O_DIRECT)) + return -EOPNOTSUPP; + kiocb->ki_flags |= IOCB_HAS_METADATA; + kiocb->private = &io->meta; } return 0; @@ -836,8 +908,12 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) ssize_t ret; loff_t *ppos; - if (io_do_buffer_select(req)) { - ret = io_import_iovec(ITER_DEST, req, io, issue_flags); + if (req->flags & REQ_F_IMPORT_BUFFER) { + ret = io_rw_import_reg_vec(req, io, ITER_DEST, issue_flags); + if (unlikely(ret)) + return ret; + } else if (io_do_buffer_select(req)) { + ret = io_import_rw_buffer(ITER_DEST, req, io, issue_flags); if (unlikely(ret < 0)) return ret; } @@ -872,8 +948,7 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) if (ret == -EOPNOTSUPP && force_nonblock) ret = -EAGAIN; - if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) { - req->flags &= ~REQ_F_REISSUE; + if (ret == -EAGAIN) { /* If we can poll, just do that. 
*/ if (io_file_can_poll(req)) return -EAGAIN; @@ -887,7 +962,8 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) } else if (ret == -EIOCBQUEUED) { return IOU_ISSUE_SKIP_COMPLETE; } else if (ret == req->cqe.res || ret <= 0 || !force_nonblock || - (req->flags & REQ_F_NOWAIT) || !need_complete_io(req)) { + (req->flags & REQ_F_NOWAIT) || !need_complete_io(req) || + (issue_flags & IO_URING_F_MULTISHOT)) { /* read all, failed, already did sync or don't want to retry */ goto done; } @@ -898,6 +974,7 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags) * manually if we need to. */ iov_iter_restore(&io->iter, &io->iter_state); + io_meta_restore(io, kiocb); do { /* @@ -959,6 +1036,8 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags) if (!io_file_can_poll(req)) return -EBADFD; + /* make it sync, multishot doesn't support async execution */ + rw->kiocb.ki_complete = NULL; ret = __io_read(req, issue_flags); /* @@ -972,13 +1051,13 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags) */ if (io_kbuf_recycle(req, issue_flags)) rw->len = 0; - if (issue_flags & IO_URING_F_MULTISHOT) - return IOU_ISSUE_SKIP_COMPLETE; - return -EAGAIN; + return IOU_RETRY; } else if (ret <= 0) { io_kbuf_recycle(req, issue_flags); if (ret < 0) req_set_fail(req); + } else if (!(req->flags & REQ_F_APOLL_MULTISHOT)) { + cflags = io_put_kbuf(req, ret, issue_flags); } else { /* * Any successful return value will keep the multishot read @@ -990,16 +1069,15 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags) rw->len = 0; /* similarly to above, reset len to 0 */ if (io_req_post_cqe(req, ret, cflags | IORING_CQE_F_MORE)) { - if (issue_flags & IO_URING_F_MULTISHOT) { + if (issue_flags & IO_URING_F_MULTISHOT) /* * Force retry, as we might have more data to * be read and otherwise it won't get retried * until (if ever) another poll is triggered. 
*/ io_poll_multishot_retry(req); - return IOU_ISSUE_SKIP_COMPLETE; - } - return -EAGAIN; + + return IOU_RETRY; } } @@ -1009,9 +1087,26 @@ int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags) */ io_req_set_res(req, ret, cflags); io_req_rw_cleanup(req, issue_flags); - if (issue_flags & IO_URING_F_MULTISHOT) - return IOU_STOP_MULTISHOT; - return IOU_OK; + return IOU_COMPLETE; +} + +static bool io_kiocb_start_write(struct io_kiocb *req, struct kiocb *kiocb) +{ + struct inode *inode; + bool ret; + + if (!(req->flags & REQ_F_ISREG)) + return true; + if (!(kiocb->ki_flags & IOCB_NOWAIT)) { + kiocb_start_write(kiocb); + return true; + } + + inode = file_inode(kiocb->ki_filp); + ret = sb_start_write_trylock(inode->i_sb); + if (ret) + __sb_writers_release(inode->i_sb, SB_FREEZE_WRITE); + return ret; } int io_write(struct io_kiocb *req, unsigned int issue_flags) @@ -1023,6 +1118,12 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) ssize_t ret, ret2; loff_t *ppos; + if (req->flags & REQ_F_IMPORT_BUFFER) { + ret = io_rw_import_reg_vec(req, io, ITER_SOURCE, issue_flags); + if (unlikely(ret)) + return ret; + } + ret = io_rw_init_file(req, FMODE_WRITE, WRITE); if (unlikely(ret)) return ret; @@ -1051,8 +1152,8 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) if (unlikely(ret)) return ret; - if (req->flags & REQ_F_ISREG) - kiocb_start_write(kiocb); + if (unlikely(!io_kiocb_start_write(req, kiocb))) + return -EAGAIN; kiocb->ki_flags |= IOCB_WRITE; if (likely(req->file->f_op->write_iter)) @@ -1062,11 +1163,6 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) else ret2 = -EINVAL; - if (req->flags & REQ_F_REISSUE) { - req->flags &= ~REQ_F_REISSUE; - ret2 = -EAGAIN; - } - /* * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just * retry them without IOCB_NOWAIT. 
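
[Illustrative aside, not part of the patch] The rw.c hunks above rework how registered ("fixed") buffers are handled: __io_prep_rw() now only records buf_index at prep time, while io_init_rw_fixed()/io_import_reg_buf() resolve the registered buffer at issue time, with io_read_fixed()/io_write_fixed() added in the next hunk as the issue entry points. As a rough sketch of the userspace side that exercises this path, the following uses liburing purely for brevity; liburing itself, the "testfile" name, and the 4 KiB buffer size are assumptions for the example and are not defined anywhere in this patch.

	/* sketch: submit one IORING_OP_WRITE_FIXED against a registered buffer */
	#include <liburing.h>
	#include <sys/uio.h>
	#include <fcntl.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	int main(void)
	{
		struct io_uring ring;
		struct io_uring_sqe *sqe;
		struct io_uring_cqe *cqe;
		struct iovec iov;
		int fd;

		if (io_uring_queue_init(8, &ring, 0) < 0)
			return 1;

		fd = open("testfile", O_WRONLY | O_CREAT | O_TRUNC, 0644);
		if (fd < 0)
			return 1;

		/* Register one 4 KiB buffer; buf_index 0 refers to it in the SQE. */
		iov.iov_len = 4096;
		iov.iov_base = malloc(iov.iov_len);
		if (!iov.iov_base)
			return 1;
		memset(iov.iov_base, 0xaa, iov.iov_len);
		if (io_uring_register_buffers(&ring, &iov, 1) < 0)
			return 1;

		/* addr/len must lie within the registered buffer named by buf_index */
		sqe = io_uring_get_sqe(&ring);
		io_uring_prep_write_fixed(sqe, fd, iov.iov_base, iov.iov_len, 0, 0);

		io_uring_submit(&ring);
		if (io_uring_wait_cqe(&ring, &cqe) == 0) {
			printf("write_fixed res=%d\n", cqe->res);
			io_uring_cqe_seen(&ring, cqe);
		}
		io_uring_queue_exit(&ring);
		return 0;
	}

The point the sketch shows is only that the buffer is registered once and the SQE then refers to it by buf_index; on the kernel side that index is read in __io_prep_rw() and resolved when io_read_fixed()/io_write_fixed() call io_init_rw_fixed() at issue time.
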
@@ -1102,12 +1198,35 @@ done: } else { ret_eagain: iov_iter_restore(&io->iter, &io->iter_state); + io_meta_restore(io, kiocb); if (kiocb->ki_flags & IOCB_WRITE) io_req_end_write(req); return -EAGAIN; } } +int io_read_fixed(struct io_kiocb *req, unsigned int issue_flags) +{ + int ret; + + ret = io_init_rw_fixed(req, issue_flags, ITER_DEST); + if (unlikely(ret)) + return ret; + + return io_read(req, issue_flags); +} + +int io_write_fixed(struct io_kiocb *req, unsigned int issue_flags) +{ + int ret; + + ret = io_init_rw_fixed(req, issue_flags, ITER_SOURCE); + if (unlikely(ret)) + return ret; + + return io_write(req, issue_flags); +} + void io_rw_fail(struct io_kiocb *req) { int res; @@ -1116,6 +1235,78 @@ void io_rw_fail(struct io_kiocb *req) io_req_set_res(req, res, req->cqe.flags); } +static int io_uring_classic_poll(struct io_kiocb *req, struct io_comp_batch *iob, + unsigned int poll_flags) +{ + struct file *file = req->file; + + if (req->opcode == IORING_OP_URING_CMD) { + struct io_uring_cmd *ioucmd; + + ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); + return file->f_op->uring_cmd_iopoll(ioucmd, iob, poll_flags); + } else { + struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); + + return file->f_op->iopoll(&rw->kiocb, iob, poll_flags); + } +} + +static u64 io_hybrid_iopoll_delay(struct io_ring_ctx *ctx, struct io_kiocb *req) +{ + struct hrtimer_sleeper timer; + enum hrtimer_mode mode; + ktime_t kt; + u64 sleep_time; + + if (req->flags & REQ_F_IOPOLL_STATE) + return 0; + + if (ctx->hybrid_poll_time == LLONG_MAX) + return 0; + + /* Using half the running time to do schedule */ + sleep_time = ctx->hybrid_poll_time / 2; + + kt = ktime_set(0, sleep_time); + req->flags |= REQ_F_IOPOLL_STATE; + + mode = HRTIMER_MODE_REL; + hrtimer_setup_sleeper_on_stack(&timer, CLOCK_MONOTONIC, mode); + hrtimer_set_expires(&timer.timer, kt); + set_current_state(TASK_INTERRUPTIBLE); + hrtimer_sleeper_start_expires(&timer, mode); + + if (timer.task) + io_schedule(); + + hrtimer_cancel(&timer.timer); + __set_current_state(TASK_RUNNING); + destroy_hrtimer_on_stack(&timer.timer); + return sleep_time; +} + +static int io_uring_hybrid_poll(struct io_kiocb *req, + struct io_comp_batch *iob, unsigned int poll_flags) +{ + struct io_ring_ctx *ctx = req->ctx; + u64 runtime, sleep_time; + int ret; + + sleep_time = io_hybrid_iopoll_delay(ctx, req); + ret = io_uring_classic_poll(req, iob, poll_flags); + runtime = ktime_get_ns() - req->iopoll_start - sleep_time; + + /* + * Use minimum sleep time if we're polling devices with different + * latencies. We could get more completions from the faster ones. 
+ */ + if (ctx->hybrid_poll_time > runtime) + ctx->hybrid_poll_time = runtime; + + return ret; +} + int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) { struct io_wq_work_node *pos, *start, *prev; @@ -1132,7 +1323,6 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) wq_list_for_each(pos, start, &ctx->iopoll_list) { struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list); - struct file *file = req->file; int ret; /* @@ -1143,29 +1333,23 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) if (READ_ONCE(req->iopoll_completed)) break; - if (req->opcode == IORING_OP_URING_CMD) { - struct io_uring_cmd *ioucmd; - - ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); - ret = file->f_op->uring_cmd_iopoll(ioucmd, &iob, - poll_flags); - } else { - struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); + if (ctx->flags & IORING_SETUP_HYBRID_IOPOLL) + ret = io_uring_hybrid_poll(req, &iob, poll_flags); + else + ret = io_uring_classic_poll(req, &iob, poll_flags); - ret = file->f_op->iopoll(&rw->kiocb, &iob, poll_flags); - } if (unlikely(ret < 0)) return ret; else if (ret) poll_flags |= BLK_POLL_ONESHOT; /* iopoll may have completed current req */ - if (!rq_list_empty(iob.req_list) || + if (!rq_list_empty(&iob.req_list) || READ_ONCE(req->iopoll_completed)) break; } - if (!rq_list_empty(iob.req_list)) + if (!rq_list_empty(&iob.req_list)) iob.complete(&iob); else if (!pos) return 0; @@ -1199,10 +1383,6 @@ void io_rw_cache_free(const void *entry) { struct io_async_rw *rw = (struct io_async_rw *) entry; - if (rw->free_iovec) { - kasan_mempool_unpoison_object(rw->free_iovec, - rw->free_iov_nr * sizeof(struct iovec)); - io_rw_iovec_free(rw); - } + io_vec_free(&rw->vec); kfree(rw); } diff --git a/io_uring/rw.h b/io_uring/rw.h index 3f432dc75441..129a53fe5482 100644 --- a/io_uring/rw.h +++ b/io_uring/rw.h @@ -1,28 +1,52 @@ // SPDX-License-Identifier: GPL-2.0 +#include <linux/io_uring_types.h> #include <linux/pagemap.h> +struct io_meta_state { + u32 seed; + struct iov_iter_state iter_meta; +}; + struct io_async_rw { + struct iou_vec vec; size_t bytes_done; - struct iov_iter iter; - struct iov_iter_state iter_state; - struct iovec fast_iov; - struct iovec *free_iovec; - int free_iov_nr; - struct wait_page_queue wpq; + + struct_group(clear, + struct iov_iter iter; + struct iov_iter_state iter_state; + struct iovec fast_iov; + unsigned buf_group; + + /* + * wpq is for buffered io, while meta fields are used with + * direct io + */ + union { + struct wait_page_queue wpq; + struct { + struct uio_meta meta; + struct io_meta_state meta_state; + }; + }; + ); }; int io_prep_read_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_prep_write_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_prep_readv_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe); +int io_prep_writev_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_prep_readv(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_prep_writev(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_prep_read(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_prep_write(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_read(struct io_kiocb *req, unsigned int issue_flags); int io_write(struct io_kiocb *req, unsigned int issue_flags); +int io_read_fixed(struct io_kiocb *req, unsigned int issue_flags); +int io_write_fixed(struct io_kiocb *req, unsigned int issue_flags); void io_readv_writev_cleanup(struct io_kiocb *req); void 
io_rw_fail(struct io_kiocb *req); -void io_req_rw_complete(struct io_kiocb *req, struct io_tw_state *ts); +void io_req_rw_complete(struct io_kiocb *req, io_tw_token_t tw); int io_read_mshot_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_read_mshot(struct io_kiocb *req, unsigned int issue_flags); void io_rw_cache_free(const void *entry); diff --git a/io_uring/splice.c b/io_uring/splice.c index 3b659cd23e9d..35ce4e60b495 100644 --- a/io_uring/splice.c +++ b/io_uring/splice.c @@ -21,6 +21,7 @@ struct io_splice { u64 len; int splice_fd_in; unsigned int flags; + struct io_rsrc_node *rsrc_node; }; static int __io_splice_prep(struct io_kiocb *req, @@ -34,6 +35,7 @@ static int __io_splice_prep(struct io_kiocb *req, if (unlikely(sp->flags & ~valid_flags)) return -EINVAL; sp->splice_fd_in = READ_ONCE(sqe->splice_fd_in); + sp->rsrc_node = NULL; req->flags |= REQ_F_FORCE_ASYNC; return 0; } @@ -45,6 +47,37 @@ int io_tee_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return __io_splice_prep(req, sqe); } +void io_splice_cleanup(struct io_kiocb *req) +{ + struct io_splice *sp = io_kiocb_to_cmd(req, struct io_splice); + + if (sp->rsrc_node) + io_put_rsrc_node(req->ctx, sp->rsrc_node); +} + +static struct file *io_splice_get_file(struct io_kiocb *req, + unsigned int issue_flags) +{ + struct io_splice *sp = io_kiocb_to_cmd(req, struct io_splice); + struct io_ring_ctx *ctx = req->ctx; + struct io_rsrc_node *node; + struct file *file = NULL; + + if (!(sp->flags & SPLICE_F_FD_IN_FIXED)) + return io_file_get_normal(req, sp->splice_fd_in); + + io_ring_submit_lock(ctx, issue_flags); + node = io_rsrc_node_lookup(&ctx->file_table.data, sp->splice_fd_in); + if (node) { + node->refs++; + sp->rsrc_node = node; + file = io_slot_file(node); + req->flags |= REQ_F_NEED_CLEANUP; + } + io_ring_submit_unlock(ctx, issue_flags); + return file; +} + int io_tee(struct io_kiocb *req, unsigned int issue_flags) { struct io_splice *sp = io_kiocb_to_cmd(req, struct io_splice); @@ -55,10 +88,7 @@ int io_tee(struct io_kiocb *req, unsigned int issue_flags) WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); - if (sp->flags & SPLICE_F_FD_IN_FIXED) - in = io_file_get_fixed(req, sp->splice_fd_in, issue_flags); - else - in = io_file_get_normal(req, sp->splice_fd_in); + in = io_splice_get_file(req, issue_flags); if (!in) { ret = -EBADF; goto done; @@ -73,7 +103,7 @@ done: if (ret != sp->len) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -96,10 +126,7 @@ int io_splice(struct io_kiocb *req, unsigned int issue_flags) WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); - if (sp->flags & SPLICE_F_FD_IN_FIXED) - in = io_file_get_fixed(req, sp->splice_fd_in, issue_flags); - else - in = io_file_get_normal(req, sp->splice_fd_in); + in = io_splice_get_file(req, issue_flags); if (!in) { ret = -EBADF; goto done; @@ -117,5 +144,5 @@ done: if (ret != sp->len) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } diff --git a/io_uring/splice.h b/io_uring/splice.h index 542f94168ad3..b9b2848327fb 100644 --- a/io_uring/splice.h +++ b/io_uring/splice.h @@ -3,5 +3,6 @@ int io_tee_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_tee(struct io_kiocb *req, unsigned int issue_flags); +void io_splice_cleanup(struct io_kiocb *req); int io_splice_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_splice(struct io_kiocb *req, unsigned int issue_flags); diff 
--git a/io_uring/sqpoll.c b/io_uring/sqpoll.c index a26593979887..a3f11349ce06 100644 --- a/io_uring/sqpoll.c +++ b/io_uring/sqpoll.c @@ -16,11 +16,12 @@ #include <uapi/linux/io_uring.h> #include "io_uring.h" +#include "tctx.h" #include "napi.h" #include "sqpoll.h" #define IORING_SQPOLL_CAP_ENTRIES_VALUE 8 -#define IORING_TW_CAP_ENTRIES_VALUE 8 +#define IORING_TW_CAP_ENTRIES_VALUE 32 enum { IO_SQ_THREAD_SHOULD_STOP = 0, @@ -30,7 +31,7 @@ enum { void io_sq_thread_unpark(struct io_sq_data *sqd) __releases(&sqd->lock) { - WARN_ON_ONCE(sqd->thread == current); + WARN_ON_ONCE(sqpoll_task_locked(sqd) == current); /* * Do the dance but not conditional clear_bit() because it'd race with @@ -40,29 +41,38 @@ void io_sq_thread_unpark(struct io_sq_data *sqd) if (atomic_dec_return(&sqd->park_pending)) set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); mutex_unlock(&sqd->lock); + wake_up(&sqd->wait); } void io_sq_thread_park(struct io_sq_data *sqd) __acquires(&sqd->lock) { - WARN_ON_ONCE(data_race(sqd->thread) == current); + struct task_struct *tsk; atomic_inc(&sqd->park_pending); set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); mutex_lock(&sqd->lock); - if (sqd->thread) - wake_up_process(sqd->thread); + + tsk = sqpoll_task_locked(sqd); + if (tsk) { + WARN_ON_ONCE(tsk == current); + wake_up_process(tsk); + } } void io_sq_thread_stop(struct io_sq_data *sqd) { - WARN_ON_ONCE(sqd->thread == current); + struct task_struct *tsk; + WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state)); set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); mutex_lock(&sqd->lock); - if (sqd->thread) - wake_up_process(sqd->thread); + tsk = sqpoll_task_locked(sqd); + if (tsk) { + WARN_ON_ONCE(tsk == current); + wake_up_process(tsk); + } mutex_unlock(&sqd->lock); wait_for_completion(&sqd->exited); } @@ -106,29 +116,21 @@ static struct io_sq_data *io_attach_sq_data(struct io_uring_params *p) { struct io_ring_ctx *ctx_attach; struct io_sq_data *sqd; - struct fd f; + CLASS(fd, f)(p->wq_fd); - f = fdget(p->wq_fd); - if (!fd_file(f)) + if (fd_empty(f)) return ERR_PTR(-ENXIO); - if (!io_is_uring_fops(fd_file(f))) { - fdput(f); + if (!io_is_uring_fops(fd_file(f))) return ERR_PTR(-EINVAL); - } ctx_attach = fd_file(f)->private_data; sqd = ctx_attach->sq_data; - if (!sqd) { - fdput(f); + if (!sqd) return ERR_PTR(-EINVAL); - } - if (sqd->task_tgid != current->tgid) { - fdput(f); + if (sqd->task_tgid != current->tgid) return ERR_PTR(-EPERM); - } refcount_inc(&sqd->refs); - fdput(f); return sqd; } @@ -215,7 +217,7 @@ static bool io_sqd_handle_event(struct io_sq_data *sqd) mutex_unlock(&sqd->lock); if (signal_pending(current)) did_sig = get_signal(&ksig); - cond_resched(); + wait_event(sqd->wait, !atomic_read(&sqd->park_pending)); mutex_lock(&sqd->lock); sqd->sq_cpu = raw_smp_processor_id(); } @@ -271,12 +273,17 @@ static int io_sq_thread(void *data) struct io_ring_ctx *ctx; struct rusage start; unsigned long timeout = 0; - char buf[TASK_COMM_LEN]; + char buf[TASK_COMM_LEN] = {}; DEFINE_WAIT(wait); /* offload context creation failed, just exit */ - if (!current->io_uring) + if (!current->io_uring) { + mutex_lock(&sqd->lock); + rcu_assign_pointer(sqd->thread, NULL); + put_task_struct(current); + mutex_unlock(&sqd->lock); goto err_out; + } snprintf(buf, sizeof(buf), "iou-sqp-%d", sqd->task_pid); set_task_comm(current, buf); @@ -382,7 +389,8 @@ static int io_sq_thread(void *data) io_sq_tw(&retry_list, UINT_MAX); io_uring_cancel_generic(true, sqd); - sqd->thread = NULL; + rcu_assign_pointer(sqd->thread, NULL); + put_task_struct(current); 
list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) atomic_or(IORING_SQ_NEED_WAKEUP, &ctx->rings->sq_flags); io_run_task_work(); @@ -417,16 +425,11 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx, /* Retain compatibility with failing for an invalid attach attempt */ if ((ctx->flags & (IORING_SETUP_ATTACH_WQ | IORING_SETUP_SQPOLL)) == IORING_SETUP_ATTACH_WQ) { - struct fd f; - - f = fdget(p->wq_fd); - if (!fd_file(f)) + CLASS(fd, f)(p->wq_fd); + if (fd_empty(f)) return -ENXIO; - if (!io_is_uring_fops(fd_file(f))) { - fdput(f); + if (!io_is_uring_fops(fd_file(f))) return -EINVAL; - } - fdput(f); } if (ctx->flags & IORING_SETUP_SQPOLL) { struct task_struct *tsk; @@ -491,7 +494,11 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx, goto err_sqpoll; } - sqd->thread = tsk; + mutex_lock(&sqd->lock); + rcu_assign_pointer(sqd->thread, tsk); + mutex_unlock(&sqd->lock); + + get_task_struct(tsk); ret = io_uring_alloc_task_context(tsk, ctx); wake_up_new_task(tsk); if (ret) @@ -501,7 +508,6 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx, ret = -EINVAL; goto err; } - return 0; err_sqpoll: complete(&ctx->sq_data->exited); @@ -517,10 +523,13 @@ __cold int io_sqpoll_wq_cpu_affinity(struct io_ring_ctx *ctx, int ret = -EINVAL; if (sqd) { + struct task_struct *tsk; + io_sq_thread_park(sqd); /* Don't set affinity for a dying thread */ - if (sqd->thread) - ret = io_wq_cpu_affinity(sqd->thread->io_uring, mask); + tsk = sqpoll_task_locked(sqd); + if (tsk) + ret = io_wq_cpu_affinity(tsk->io_uring, mask); io_sq_thread_unpark(sqd); } diff --git a/io_uring/sqpoll.h b/io_uring/sqpoll.h index 4171666b1cf4..b83dcdec9765 100644 --- a/io_uring/sqpoll.h +++ b/io_uring/sqpoll.h @@ -8,7 +8,7 @@ struct io_sq_data { /* ctx's that are using this sqd */ struct list_head ctx_list; - struct task_struct *thread; + struct task_struct __rcu *thread; struct wait_queue_head wait; unsigned sq_thread_idle; @@ -29,3 +29,9 @@ void io_sq_thread_unpark(struct io_sq_data *sqd); void io_put_sq_data(struct io_sq_data *sqd); void io_sqpoll_wait_sq(struct io_ring_ctx *ctx); int io_sqpoll_wq_cpu_affinity(struct io_ring_ctx *ctx, cpumask_var_t mask); + +static inline struct task_struct *sqpoll_task_locked(struct io_sq_data *sqd) +{ + return rcu_dereference_protected(sqd->thread, + lockdep_is_held(&sqd->lock)); +} diff --git a/io_uring/statx.c b/io_uring/statx.c index f7f9b202eec0..5111e9befbfe 100644 --- a/io_uring/statx.c +++ b/io_uring/statx.c @@ -36,8 +36,7 @@ int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) sx->buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2)); sx->flags = READ_ONCE(sqe->statx_flags); - sx->filename = getname_flags(path, - getname_statx_lookup_flags(sx->flags)); + sx->filename = getname_uflags(path, sx->flags); if (IS_ERR(sx->filename)) { int ret = PTR_ERR(sx->filename); @@ -60,7 +59,7 @@ int io_statx(struct io_kiocb *req, unsigned int issue_flags) ret = do_statx(sx->dfd, sx->filename, sx->flags, sx->mask, sx->buffer); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } void io_statx_cleanup(struct io_kiocb *req) diff --git a/io_uring/sync.c b/io_uring/sync.c index 255f68c37e55..cea2d381ffd2 100644 --- a/io_uring/sync.c +++ b/io_uring/sync.c @@ -47,7 +47,7 @@ int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags) ret = sync_file_range(req->file, sync->off, sync->len, sync->flags); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -79,7 +79,7 @@ int 
io_fsync(struct io_kiocb *req, unsigned int issue_flags) ret = vfs_fsync_range(req->file, sync->off, end > 0 ? end : LLONG_MAX, sync->flags & IORING_FSYNC_DATASYNC); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } int io_fallocate_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -108,5 +108,5 @@ int io_fallocate(struct io_kiocb *req, unsigned int issue_flags) if (ret >= 0) fsnotify_modify(req->file); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } diff --git a/io_uring/tctx.c b/io_uring/tctx.c index c043fe93a3f2..5b66755579c0 100644 --- a/io_uring/tctx.c +++ b/io_uring/tctx.c @@ -35,8 +35,6 @@ static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx, data.hash = hash; data.task = task; - data.free_work = io_wq_free_work; - data.do_work = io_wq_submit_work; /* Do QD, or 4 * CPUS, whatever is smallest */ concurrency = min(ctx->sq_entries, 4 * num_online_cpus()); @@ -47,8 +45,19 @@ static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx, void __io_uring_free(struct task_struct *tsk) { struct io_uring_task *tctx = tsk->io_uring; + struct io_tctx_node *node; + unsigned long index; - WARN_ON_ONCE(!xa_empty(&tctx->xa)); + /* + * Fault injection forcing allocation errors in the xa_store() path + * can lead to xa_empty() returning false, even though no actual + * node is stored in the xarray. Until that gets sorted out, attempt + * an iteration here and warn if any entries are found. + */ + xa_for_each(&tctx->xa, index, node) { + WARN_ON_ONCE(1); + break; + } WARN_ON_ONCE(tctx->io_wq); WARN_ON_ONCE(tctx->cached_refs); @@ -81,6 +90,7 @@ __cold int io_uring_alloc_task_context(struct task_struct *task, return ret; } + tctx->task = task; xa_init(&tctx->xa); init_waitqueue_head(&tctx->wait); atomic_set(&tctx->in_cancel, 0); diff --git a/io_uring/timeout.c b/io_uring/timeout.c index 9973876d91b0..7f13bfa9f2b6 100644 --- a/io_uring/timeout.c +++ b/io_uring/timeout.c @@ -35,6 +35,9 @@ struct io_timeout_rem { bool ltimeout; }; +static struct io_kiocb *__io_disarm_linked_timeout(struct io_kiocb *req, + struct io_kiocb *link); + static inline bool io_is_timeout_noseq(struct io_kiocb *req) { struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); @@ -65,7 +68,7 @@ static inline bool io_timeout_finish(struct io_timeout *timeout, static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer); -static void io_timeout_complete(struct io_kiocb *req, struct io_tw_state *ts) +static void io_timeout_complete(struct io_kiocb *req, io_tw_token_t tw) { struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); struct io_timeout_data *data = req->async_data; @@ -74,19 +77,38 @@ static void io_timeout_complete(struct io_kiocb *req, struct io_tw_state *ts) if (!io_timeout_finish(timeout, data)) { if (io_req_post_cqe(req, -ETIME, IORING_CQE_F_MORE)) { /* re-arm timer */ - spin_lock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); list_add(&timeout->list, ctx->timeout_list.prev); - data->timer.function = io_timeout_fn; hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); - spin_unlock_irq(&ctx->timeout_lock); + raw_spin_unlock_irq(&ctx->timeout_lock); return; } } - io_req_task_complete(req, ts); + io_req_task_complete(req, tw); } -static bool io_kill_timeout(struct io_kiocb *req, int status) +static __cold bool io_flush_killed_timeouts(struct list_head *list, int err) +{ + if (list_empty(list)) + return false; + + while (!list_empty(list)) { + struct io_timeout *timeout; + struct io_kiocb 
*req; + + timeout = list_first_entry(list, struct io_timeout, list); + list_del_init(&timeout->list); + req = cmd_to_io_kiocb(timeout); + if (err) + req_set_fail(req); + io_req_queue_tw_complete(req, err); + } + + return true; +} + +static void io_kill_timeout(struct io_kiocb *req, struct list_head *list) __must_hold(&req->ctx->timeout_lock) { struct io_timeout_data *io = req->async_data; @@ -94,23 +116,19 @@ static bool io_kill_timeout(struct io_kiocb *req, int status) if (hrtimer_try_to_cancel(&io->timer) != -1) { struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); - if (status) - req_set_fail(req); atomic_set(&req->ctx->cq_timeouts, atomic_read(&req->ctx->cq_timeouts) + 1); - list_del_init(&timeout->list); - io_req_queue_tw_complete(req, status); - return true; + list_move_tail(&timeout->list, list); } - return false; } __cold void io_flush_timeouts(struct io_ring_ctx *ctx) { - u32 seq; struct io_timeout *timeout, *tmp; + LIST_HEAD(list); + u32 seq; - spin_lock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); list_for_each_entry_safe(timeout, tmp, &ctx->timeout_list, list) { @@ -132,15 +150,16 @@ __cold void io_flush_timeouts(struct io_ring_ctx *ctx) if (events_got < events_needed) break; - io_kill_timeout(req, 0); + io_kill_timeout(req, &list); } ctx->cq_last_tm_flush = seq; - spin_unlock_irq(&ctx->timeout_lock); + raw_spin_unlock_irq(&ctx->timeout_lock); + io_flush_killed_timeouts(&list, 0); } -static void io_req_tw_fail_links(struct io_kiocb *link, struct io_tw_state *ts) +static void io_req_tw_fail_links(struct io_kiocb *link, io_tw_token_t tw) { - io_tw_lock(link->ctx, ts); + io_tw_lock(link->ctx, tw); while (link) { struct io_kiocb *nxt = link->link; long res = -ECANCELED; @@ -149,7 +168,7 @@ static void io_req_tw_fail_links(struct io_kiocb *link, struct io_tw_state *ts) res = link->cqe.res; link->link = NULL; io_req_set_res(link, res, 0); - io_req_task_complete(link, ts); + io_req_task_complete(link, tw); link = nxt; } } @@ -201,9 +220,11 @@ void io_disarm_next(struct io_kiocb *req) } else if (req->flags & REQ_F_LINK_TIMEOUT) { struct io_ring_ctx *ctx = req->ctx; - spin_lock_irq(&ctx->timeout_lock); - link = io_disarm_linked_timeout(req); - spin_unlock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); + if (req->link && req->link->opcode == IORING_OP_LINK_TIMEOUT) + link = __io_disarm_linked_timeout(req, req->link); + + raw_spin_unlock_irq(&ctx->timeout_lock); if (link) io_req_queue_tw_complete(link, -ECANCELED); } @@ -212,8 +233,8 @@ void io_disarm_next(struct io_kiocb *req) io_fail_links(req); } -struct io_kiocb *__io_disarm_linked_timeout(struct io_kiocb *req, - struct io_kiocb *link) +static struct io_kiocb *__io_disarm_linked_timeout(struct io_kiocb *req, + struct io_kiocb *link) __must_hold(&req->ctx->completion_lock) __must_hold(&req->ctx->timeout_lock) { @@ -239,11 +260,11 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) struct io_ring_ctx *ctx = req->ctx; unsigned long flags; - spin_lock_irqsave(&ctx->timeout_lock, flags); + raw_spin_lock_irqsave(&ctx->timeout_lock, flags); list_del_init(&timeout->list); atomic_set(&req->ctx->cq_timeouts, atomic_read(&req->ctx->cq_timeouts) + 1); - spin_unlock_irqrestore(&ctx->timeout_lock, flags); + raw_spin_unlock_irqrestore(&ctx->timeout_lock, flags); if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS)) req_set_fail(req); @@ -286,9 +307,9 @@ int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data 
*cd) { struct io_kiocb *req; - spin_lock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); req = io_timeout_extract(ctx, cd); - spin_unlock_irq(&ctx->timeout_lock); + raw_spin_unlock_irq(&ctx->timeout_lock); if (IS_ERR(req)) return PTR_ERR(req); @@ -296,27 +317,29 @@ int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd) return 0; } -static void io_req_task_link_timeout(struct io_kiocb *req, struct io_tw_state *ts) +static void io_req_task_link_timeout(struct io_kiocb *req, io_tw_token_t tw) { struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); struct io_kiocb *prev = timeout->prev; - int ret = -ENOENT; + int ret; if (prev) { - if (!(req->task->flags & PF_EXITING)) { + if (!io_should_terminate_tw()) { struct io_cancel_data cd = { .ctx = req->ctx, .data = prev->cqe.user_data, }; - ret = io_try_cancel(req->task->io_uring, &cd, 0); + ret = io_try_cancel(req->tctx, &cd, 0); + } else { + ret = -ECANCELED; } io_req_set_res(req, ret ?: -ETIME, 0); - io_req_task_complete(req, ts); + io_req_task_complete(req, tw); io_put_req(prev); } else { io_req_set_res(req, -ETIME, 0); - io_req_task_complete(req, ts); + io_req_task_complete(req, tw); } } @@ -329,7 +352,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) struct io_ring_ctx *ctx = req->ctx; unsigned long flags; - spin_lock_irqsave(&ctx->timeout_lock, flags); + raw_spin_lock_irqsave(&ctx->timeout_lock, flags); prev = timeout->head; timeout->head = NULL; @@ -344,7 +367,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) } list_del(&timeout->list); timeout->prev = prev; - spin_unlock_irqrestore(&ctx->timeout_lock, flags); + raw_spin_unlock_irqrestore(&ctx->timeout_lock, flags); req->io_task_work.func = io_req_task_link_timeout; io_req_task_work_add(req); @@ -389,8 +412,7 @@ static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, io = req->async_data; if (hrtimer_try_to_cancel(&io->timer) == -1) return -EALREADY; - hrtimer_init(&io->timer, io_timeout_get_clock(io), mode); - io->timer.function = io_link_timeout_fn; + hrtimer_setup(&io->timer, io_link_timeout_fn, io_timeout_get_clock(io), mode); hrtimer_start(&io->timer, timespec64_to_ktime(*ts), mode); return 0; } @@ -409,10 +431,11 @@ static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, timeout->off = 0; /* noseq */ data = req->async_data; + data->ts = *ts; + list_add_tail(&timeout->list, &ctx->timeout_list); - hrtimer_init(&data->timer, io_timeout_get_clock(data), mode); - data->timer.function = io_timeout_fn; - hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode); + hrtimer_setup(&data->timer, io_timeout_fn, io_timeout_get_clock(data), mode); + hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), mode); return 0; } @@ -471,18 +494,18 @@ int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags) } else { enum hrtimer_mode mode = io_translate_timeout_mode(tr->flags); - spin_lock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); if (tr->ltimeout) ret = io_linked_timeout_update(ctx, tr->addr, &tr->ts, mode); else ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode); - spin_unlock_irq(&ctx->timeout_lock); + raw_spin_unlock_irq(&ctx->timeout_lock); } if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } static int __io_timeout_prep(struct io_kiocb *req, @@ -524,10 +547,9 @@ static int __io_timeout_prep(struct io_kiocb *req, if (WARN_ON_ONCE(req_has_async_data(req))) return 
-EFAULT; - if (io_alloc_async_data(req)) + data = io_uring_alloc_async_data(NULL, req); + if (!data) return -ENOMEM; - - data = req->async_data; data->req = req; data->flags = flags; @@ -538,7 +560,6 @@ static int __io_timeout_prep(struct io_kiocb *req, return -EINVAL; data->mode = io_translate_timeout_mode(flags); - hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode); if (is_timeout_link) { struct io_submit_link *link = &req->ctx->submit_state.link; @@ -549,6 +570,10 @@ static int __io_timeout_prep(struct io_kiocb *req, return -EINVAL; timeout->head = link->last; link->last->flags |= REQ_F_ARM_LTIMEOUT; + hrtimer_setup(&data->timer, io_link_timeout_fn, io_timeout_get_clock(data), + data->mode); + } else { + hrtimer_setup(&data->timer, io_timeout_fn, io_timeout_get_clock(data), data->mode); } return 0; } @@ -571,7 +596,7 @@ int io_timeout(struct io_kiocb *req, unsigned int issue_flags) struct list_head *entry; u32 tail, off = timeout->off; - spin_lock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); /* * sqe->off holds how many events that need to occur for this @@ -608,9 +633,8 @@ int io_timeout(struct io_kiocb *req, unsigned int issue_flags) } add: list_add(&timeout->list, entry); - data->timer.function = io_timeout_fn; hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); - spin_unlock_irq(&ctx->timeout_lock); + raw_spin_unlock_irq(&ctx->timeout_lock); return IOU_ISSUE_SKIP_COMPLETE; } @@ -619,7 +643,7 @@ void io_queue_linked_timeout(struct io_kiocb *req) struct io_timeout *timeout = io_kiocb_to_cmd(req, struct io_timeout); struct io_ring_ctx *ctx = req->ctx; - spin_lock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); /* * If the back reference is NULL, then our linked request finished * before we got a chance to setup the timer @@ -627,23 +651,22 @@ void io_queue_linked_timeout(struct io_kiocb *req) if (timeout->head) { struct io_timeout_data *data = req->async_data; - data->timer.function = io_link_timeout_fn; hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); list_add_tail(&timeout->list, &ctx->ltimeout_list); } - spin_unlock_irq(&ctx->timeout_lock); + raw_spin_unlock_irq(&ctx->timeout_lock); /* drop submission reference */ io_put_req(req); } -static bool io_match_task(struct io_kiocb *head, struct task_struct *task, +static bool io_match_task(struct io_kiocb *head, struct io_uring_task *tctx, bool cancel_all) __must_hold(&head->ctx->timeout_lock) { struct io_kiocb *req; - if (task && head->task != task) + if (tctx && head->tctx != tctx) return false; if (cancel_all) return true; @@ -656,26 +679,26 @@ static bool io_match_task(struct io_kiocb *head, struct task_struct *task, } /* Returns true if we found and killed one or more timeouts */ -__cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, +__cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct io_uring_task *tctx, bool cancel_all) { struct io_timeout *timeout, *tmp; - int canceled = 0; + LIST_HEAD(list); /* * completion_lock is needed for io_match_task(). Take it before * timeout_lockfirst to keep locking ordering. 
*/ spin_lock(&ctx->completion_lock); - spin_lock_irq(&ctx->timeout_lock); + raw_spin_lock_irq(&ctx->timeout_lock); list_for_each_entry_safe(timeout, tmp, &ctx->timeout_list, list) { struct io_kiocb *req = cmd_to_io_kiocb(timeout); - if (io_match_task(req, tsk, cancel_all) && - io_kill_timeout(req, -ECANCELED)) - canceled++; + if (io_match_task(req, tctx, cancel_all)) + io_kill_timeout(req, &list); } - spin_unlock_irq(&ctx->timeout_lock); + raw_spin_unlock_irq(&ctx->timeout_lock); spin_unlock(&ctx->completion_lock); - return canceled != 0; + + return io_flush_killed_timeouts(&list, -ECANCELED); } diff --git a/io_uring/timeout.h b/io_uring/timeout.h index a6939f18313e..2b7c9ad72992 100644 --- a/io_uring/timeout.h +++ b/io_uring/timeout.h @@ -8,23 +8,10 @@ struct io_timeout_data { u32 flags; }; -struct io_kiocb *__io_disarm_linked_timeout(struct io_kiocb *req, - struct io_kiocb *link); - -static inline struct io_kiocb *io_disarm_linked_timeout(struct io_kiocb *req) -{ - struct io_kiocb *link = req->link; - - if (link && link->opcode == IORING_OP_LINK_TIMEOUT) - return __io_disarm_linked_timeout(req, link); - - return NULL; -} - __cold void io_flush_timeouts(struct io_ring_ctx *ctx); struct io_cancel_data; int io_timeout_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd); -__cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, +__cold bool io_kill_timeouts(struct io_ring_ctx *ctx, struct io_uring_task *tctx, bool cancel_all); void io_queue_linked_timeout(struct io_kiocb *req); void io_disarm_next(struct io_kiocb *req); diff --git a/io_uring/truncate.c b/io_uring/truncate.c index 62ee73d34d72..487baf23b44e 100644 --- a/io_uring/truncate.c +++ b/io_uring/truncate.c @@ -44,5 +44,5 @@ int io_ftruncate(struct io_kiocb *req, unsigned int issue_flags) ret = do_ftruncate(req->file, ft->len, 1); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } diff --git a/io_uring/uring_cmd.c b/io_uring/uring_cmd.c index 39c3c816ec78..929cad6ee326 100644 --- a/io_uring/uring_cmd.c +++ b/io_uring/uring_cmd.c @@ -3,51 +3,56 @@ #include <linux/errno.h> #include <linux/file.h> #include <linux/io_uring/cmd.h> -#include <linux/io_uring/net.h> #include <linux/security.h> #include <linux/nospec.h> -#include <net/sock.h> #include <uapi/linux/io_uring.h> -#include <asm/ioctls.h> #include "io_uring.h" #include "alloc_cache.h" #include "rsrc.h" #include "uring_cmd.h" -static struct uring_cache *io_uring_async_get(struct io_kiocb *req) +void io_cmd_cache_free(const void *entry) { - struct io_ring_ctx *ctx = req->ctx; - struct uring_cache *cache; + struct io_async_cmd *ac = (struct io_async_cmd *)entry; - cache = io_alloc_cache_get(&ctx->uring_cache); - if (cache) { - req->flags |= REQ_F_ASYNC_DATA; - req->async_data = cache; - return cache; - } - if (!io_alloc_async_data(req)) - return req->async_data; - return NULL; + io_vec_free(&ac->vec); + kfree(ac); } static void io_req_uring_cleanup(struct io_kiocb *req, unsigned int issue_flags) { struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); - struct uring_cache *cache = req->async_data; + struct io_async_cmd *ac = req->async_data; + struct io_uring_cmd_data *cache = &ac->data; + + if (cache->op_data) { + kfree(cache->op_data); + cache->op_data = NULL; + } if (issue_flags & IO_URING_F_UNLOCKED) return; - if (io_alloc_cache_put(&req->ctx->uring_cache, cache)) { + + io_alloc_cache_vec_kasan(&ac->vec); + if (ac->vec.nr > IO_VEC_CACHE_SOFT_CAP) + io_vec_free(&ac->vec); + + if 
(io_alloc_cache_put(&req->ctx->cmd_cache, cache)) { ioucmd->sqe = NULL; req->async_data = NULL; - req->flags &= ~REQ_F_ASYNC_DATA; + req->flags &= ~(REQ_F_ASYNC_DATA|REQ_F_NEED_CLEANUP); } } +void io_uring_cmd_cleanup(struct io_kiocb *req) +{ + io_req_uring_cleanup(req, 0); +} + bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx, - struct task_struct *task, bool cancel_all) + struct io_uring_task *tctx, bool cancel_all) { struct hlist_node *tmp; struct io_kiocb *req; @@ -61,13 +66,10 @@ bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx, struct io_uring_cmd); struct file *file = req->file; - if (!cancel_all && req->task != task) + if (!cancel_all && req->tctx != tctx) continue; if (cmd->flags & IORING_URING_CMD_CANCELABLE) { - /* ->sqe isn't available if no async data */ - if (!req_has_async_data(req)) - cmd->sqe = NULL; file->f_op->uring_cmd(cmd, IO_URING_F_CANCEL | IO_URING_F_COMPLETE_DEFER); ret = true; @@ -116,12 +118,16 @@ void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, } EXPORT_SYMBOL_GPL(io_uring_cmd_mark_cancelable); -static void io_uring_cmd_work(struct io_kiocb *req, struct io_tw_state *ts) +static void io_uring_cmd_work(struct io_kiocb *req, io_tw_token_t tw) { struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); + unsigned int flags = IO_URING_F_COMPLETE_DEFER; + + if (io_should_terminate_tw()) + flags |= IO_URING_F_TASK_DEAD; /* task_work executor checks the deffered list completion */ - ioucmd->task_work_cb(ioucmd, IO_URING_F_COMPLETE_DEFER); + ioucmd->task_work_cb(ioucmd, flags); } void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, @@ -147,7 +153,7 @@ static inline void io_req_set_cqe32_extra(struct io_kiocb *req, * Called by consumers of io_uring_cmd, if they originally returned * -EIOCBQUEUED upon receiving the command. */ -void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2, +void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, u64 res2, unsigned issue_flags) { struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); @@ -179,20 +185,25 @@ static int io_uring_cmd_prep_setup(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_uring_cmd *ioucmd = io_kiocb_to_cmd(req, struct io_uring_cmd); - struct uring_cache *cache; + struct io_async_cmd *ac; - cache = io_uring_async_get(req); - if (unlikely(!cache)) - return -ENOMEM; + /* see io_uring_cmd_get_async_data() */ + BUILD_BUG_ON(offsetof(struct io_async_cmd, data) != 0); - if (!(req->flags & REQ_F_FORCE_ASYNC)) { - /* defer memcpy until we need it */ - ioucmd->sqe = sqe; - return 0; - } - - memcpy(req->async_data, sqe, uring_sqe_size(req->ctx)); - ioucmd->sqe = req->async_data; + ac = io_uring_alloc_async_data(&req->ctx->cmd_cache, req); + if (!ac) + return -ENOMEM; + ac->data.op_data = NULL; + + /* + * Unconditionally cache the SQE for now - this is only needed for + * requests that go async, but prep handlers must ensure that any + * sqe data is stable beyond prep. Since uring_cmd is special in + * that it doesn't read in per-op data, play it safe and ensure that + * any SQE data is stable beyond prep. This can later get relaxed. 
+ */ + memcpy(ac->sqes, sqe, uring_sqe_size(req->ctx)); + ioucmd->sqe = ac->sqes; return 0; } @@ -207,17 +218,9 @@ int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (ioucmd->flags & ~IORING_URING_CMD_MASK) return -EINVAL; - if (ioucmd->flags & IORING_URING_CMD_FIXED) { - struct io_ring_ctx *ctx = req->ctx; - u16 index; - + if (ioucmd->flags & IORING_URING_CMD_FIXED) req->buf_index = READ_ONCE(sqe->buf_index); - if (unlikely(req->buf_index >= ctx->nr_user_bufs)) - return -EFAULT; - index = array_index_nospec(req->buf_index, ctx->nr_user_bufs); - req->imu = ctx->user_bufs[index]; - io_req_set_rsrc_node(req, ctx, 0); - } + ioucmd->cmd_op = READ_ONCE(sqe->cmd_op); return io_uring_cmd_prep_setup(req, sqe); @@ -241,123 +244,69 @@ int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags) issue_flags |= IO_URING_F_SQE128; if (ctx->flags & IORING_SETUP_CQE32) issue_flags |= IO_URING_F_CQE32; - if (ctx->compat) + if (io_is_compat(ctx)) issue_flags |= IO_URING_F_COMPAT; if (ctx->flags & IORING_SETUP_IOPOLL) { if (!file->f_op->uring_cmd_iopoll) return -EOPNOTSUPP; issue_flags |= IO_URING_F_IOPOLL; req->iopoll_completed = 0; + if (ctx->flags & IORING_SETUP_HYBRID_IOPOLL) { + /* make sure every req only blocks once */ + req->flags &= ~REQ_F_IOPOLL_STATE; + req->iopoll_start = ktime_get_ns(); + } } ret = file->f_op->uring_cmd(ioucmd, issue_flags); - if (ret == -EAGAIN) { - struct uring_cache *cache = req->async_data; - - if (ioucmd->sqe != (void *) cache) - memcpy(cache, ioucmd->sqe, uring_sqe_size(req->ctx)); - return -EAGAIN; - } else if (ret == -EIOCBQUEUED) { - return -EIOCBQUEUED; - } - + if (ret == -EAGAIN || ret == -EIOCBQUEUED) + return ret; if (ret < 0) req_set_fail(req); io_req_uring_cleanup(req, issue_flags); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, - struct iov_iter *iter, void *ioucmd) + struct iov_iter *iter, + struct io_uring_cmd *ioucmd, + unsigned int issue_flags) { struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); - return io_import_fixed(rw, iter, req->imu, ubuf, len); + if (WARN_ON_ONCE(!(ioucmd->flags & IORING_URING_CMD_FIXED))) + return -EINVAL; + + return io_import_reg_buf(req, iter, ubuf, len, rw, issue_flags); } EXPORT_SYMBOL_GPL(io_uring_cmd_import_fixed); -void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd) +int io_uring_cmd_import_fixed_vec(struct io_uring_cmd *ioucmd, + const struct iovec __user *uvec, + size_t uvec_segs, + int ddir, struct iov_iter *iter, + unsigned issue_flags) { struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); + struct io_async_cmd *ac = req->async_data; + int ret; - io_req_queue_iowq(req); -} - -static inline int io_uring_cmd_getsockopt(struct socket *sock, - struct io_uring_cmd *cmd, - unsigned int issue_flags) -{ - bool compat = !!(issue_flags & IO_URING_F_COMPAT); - int optlen, optname, level, err; - void __user *optval; - - level = READ_ONCE(cmd->sqe->level); - if (level != SOL_SOCKET) - return -EOPNOTSUPP; - - optval = u64_to_user_ptr(READ_ONCE(cmd->sqe->optval)); - optname = READ_ONCE(cmd->sqe->optname); - optlen = READ_ONCE(cmd->sqe->optlen); - - err = do_sock_getsockopt(sock, compat, level, optname, - USER_SOCKPTR(optval), - KERNEL_SOCKPTR(&optlen)); - if (err) - return err; + if (WARN_ON_ONCE(!(ioucmd->flags & IORING_URING_CMD_FIXED))) + return -EINVAL; - /* On success, return optlen */ - return optlen; -} + ret = io_prep_reg_iovec(req, &ac->vec, uvec, uvec_segs); + if (ret) + return ret; -static inline int 
io_uring_cmd_setsockopt(struct socket *sock, - struct io_uring_cmd *cmd, - unsigned int issue_flags) -{ - bool compat = !!(issue_flags & IO_URING_F_COMPAT); - int optname, optlen, level; - void __user *optval; - sockptr_t optval_s; - - optval = u64_to_user_ptr(READ_ONCE(cmd->sqe->optval)); - optname = READ_ONCE(cmd->sqe->optname); - optlen = READ_ONCE(cmd->sqe->optlen); - level = READ_ONCE(cmd->sqe->level); - optval_s = USER_SOCKPTR(optval); - - return do_sock_setsockopt(sock, compat, level, optname, optval_s, - optlen); + return io_import_reg_vec(ddir, iter, req, &ac->vec, uvec_segs, + issue_flags); } +EXPORT_SYMBOL_GPL(io_uring_cmd_import_fixed_vec); -#if defined(CONFIG_NET) -int io_uring_cmd_sock(struct io_uring_cmd *cmd, unsigned int issue_flags) +void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd) { - struct socket *sock = cmd->file->private_data; - struct sock *sk = sock->sk; - struct proto *prot = READ_ONCE(sk->sk_prot); - int ret, arg = 0; - - if (!prot || !prot->ioctl) - return -EOPNOTSUPP; + struct io_kiocb *req = cmd_to_io_kiocb(ioucmd); - switch (cmd->sqe->cmd_op) { - case SOCKET_URING_OP_SIOCINQ: - ret = prot->ioctl(sk, SIOCINQ, &arg); - if (ret) - return ret; - return arg; - case SOCKET_URING_OP_SIOCOUTQ: - ret = prot->ioctl(sk, SIOCOUTQ, &arg); - if (ret) - return ret; - return arg; - case SOCKET_URING_OP_GETSOCKOPT: - return io_uring_cmd_getsockopt(sock, cmd, issue_flags); - case SOCKET_URING_OP_SETSOCKOPT: - return io_uring_cmd_setsockopt(sock, cmd, issue_flags); - default: - return -EOPNOTSUPP; - } + io_req_queue_iowq(req); } -EXPORT_SYMBOL_GPL(io_uring_cmd_sock); -#endif diff --git a/io_uring/uring_cmd.h b/io_uring/uring_cmd.h index a361f98664d2..e6a5142c890e 100644 --- a/io_uring/uring_cmd.h +++ b/io_uring/uring_cmd.h @@ -1,11 +1,19 @@ // SPDX-License-Identifier: GPL-2.0 -struct uring_cache { - struct io_uring_sqe sqes[2]; +#include <linux/io_uring/cmd.h> +#include <linux/io_uring_types.h> + +struct io_async_cmd { + struct io_uring_cmd_data data; + struct iou_vec vec; + struct io_uring_sqe sqes[2]; }; int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags); int io_uring_cmd_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); +void io_uring_cmd_cleanup(struct io_kiocb *req); bool io_uring_try_cancel_uring_cmd(struct io_ring_ctx *ctx, - struct task_struct *task, bool cancel_all); + struct io_uring_task *tctx, bool cancel_all); + +void io_cmd_cache_free(const void *entry); diff --git a/io_uring/waitid.c b/io_uring/waitid.c index 6362ec20abc0..e07a94694397 100644 --- a/io_uring/waitid.c +++ b/io_uring/waitid.c @@ -16,7 +16,7 @@ #include "waitid.h" #include "../kernel/exit.h" -static void io_waitid_cb(struct io_kiocb *req, struct io_tw_state *ts); +static void io_waitid_cb(struct io_kiocb *req, io_tw_token_t tw); #define IO_WAITID_CANCEL_FLAG BIT(31) #define IO_WAITID_REF_MASK GENMASK(30, 0) @@ -42,7 +42,6 @@ static void io_waitid_free(struct io_kiocb *req) req->flags &= ~REQ_F_ASYNC_DATA; } -#ifdef CONFIG_COMPAT static bool io_waitid_compat_copy_si(struct io_waitid *iw, int signo) { struct compat_siginfo __user *infop; @@ -67,7 +66,6 @@ Efault: ret = false; goto done; } -#endif static bool io_waitid_copy_si(struct io_kiocb *req, int signo) { @@ -77,10 +75,8 @@ static bool io_waitid_copy_si(struct io_kiocb *req, int signo) if (!iw->infop) return true; -#ifdef CONFIG_COMPAT - if (req->ctx->compat) + if (io_is_compat(req->ctx)) return io_waitid_compat_copy_si(iw, signo); -#endif if (!user_write_access_begin(iw->infop, sizeof(*iw->infop))) return 
false; @@ -118,7 +114,6 @@ static int io_waitid_finish(struct io_kiocb *req, int ret) static void io_waitid_complete(struct io_kiocb *req, int ret) { struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); - struct io_tw_state ts = {}; /* anyone completing better be holding a reference */ WARN_ON_ONCE(!(atomic_read(&iw->refs) & IO_WAITID_REF_MASK)); @@ -131,10 +126,9 @@ static void io_waitid_complete(struct io_kiocb *req, int ret) if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); - io_req_task_complete(req, &ts); } -static bool __io_waitid_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req) +static bool __io_waitid_cancel(struct io_kiocb *req) { struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); struct io_waitid_async *iwa = req->async_data; @@ -153,55 +147,20 @@ static bool __io_waitid_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req) list_del_init(&iwa->wo.child_wait.entry); spin_unlock_irq(&iw->head->lock); io_waitid_complete(req, -ECANCELED); + io_req_queue_tw_complete(req, -ECANCELED); return true; } int io_waitid_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, unsigned int issue_flags) { - struct hlist_node *tmp; - struct io_kiocb *req; - int nr = 0; - - if (cd->flags & (IORING_ASYNC_CANCEL_FD|IORING_ASYNC_CANCEL_FD_FIXED)) - return -ENOENT; - - io_ring_submit_lock(ctx, issue_flags); - hlist_for_each_entry_safe(req, tmp, &ctx->waitid_list, hash_node) { - if (req->cqe.user_data != cd->data && - !(cd->flags & IORING_ASYNC_CANCEL_ANY)) - continue; - if (__io_waitid_cancel(ctx, req)) - nr++; - if (!(cd->flags & IORING_ASYNC_CANCEL_ALL)) - break; - } - io_ring_submit_unlock(ctx, issue_flags); - - if (nr) - return nr; - - return -ENOENT; + return io_cancel_remove(ctx, cd, issue_flags, &ctx->waitid_list, __io_waitid_cancel); } -bool io_waitid_remove_all(struct io_ring_ctx *ctx, struct task_struct *task, +bool io_waitid_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx, bool cancel_all) { - struct hlist_node *tmp; - struct io_kiocb *req; - bool found = false; - - lockdep_assert_held(&ctx->uring_lock); - - hlist_for_each_entry_safe(req, tmp, &ctx->waitid_list, hash_node) { - if (!io_match_task_safe(req, task, cancel_all)) - continue; - hlist_del_init(&req->hash_node); - __io_waitid_cancel(ctx, req); - found = true; - } - - return found; + return io_cancel_remove_all(ctx, tctx, &ctx->waitid_list, cancel_all, __io_waitid_cancel); } static inline bool io_waitid_drop_issue_ref(struct io_kiocb *req) @@ -222,13 +181,13 @@ static inline bool io_waitid_drop_issue_ref(struct io_kiocb *req) return true; } -static void io_waitid_cb(struct io_kiocb *req, struct io_tw_state *ts) +static void io_waitid_cb(struct io_kiocb *req, io_tw_token_t tw) { struct io_waitid_async *iwa = req->async_data; struct io_ring_ctx *ctx = req->ctx; int ret; - io_tw_lock(ctx, ts); + io_tw_lock(ctx, tw); ret = __do_wait(&iwa->wo); @@ -258,6 +217,7 @@ static void io_waitid_cb(struct io_kiocb *req, struct io_tw_state *ts) } io_waitid_complete(req, ret); + io_req_task_complete(req, tw); } static int io_waitid_wait(struct wait_queue_entry *wait, unsigned mode, @@ -285,10 +245,16 @@ static int io_waitid_wait(struct wait_queue_entry *wait, unsigned mode, int io_waitid_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); + struct io_waitid_async *iwa; if (sqe->addr || sqe->buf_index || sqe->addr3 || sqe->waitid_flags) return -EINVAL; + iwa = io_uring_alloc_async_data(NULL, req); + if (!unlikely(iwa)) + 
return -ENOMEM; + iwa->req = req; + iw->which = READ_ONCE(sqe->len); iw->upid = READ_ONCE(sqe->fd); iw->options = READ_ONCE(sqe->file_index); @@ -299,16 +265,10 @@ int io_waitid_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) int io_waitid(struct io_kiocb *req, unsigned int issue_flags) { struct io_waitid *iw = io_kiocb_to_cmd(req, struct io_waitid); + struct io_waitid_async *iwa = req->async_data; struct io_ring_ctx *ctx = req->ctx; - struct io_waitid_async *iwa; int ret; - if (io_alloc_async_data(req)) - return -ENOMEM; - - iwa = req->async_data; - iwa->req = req; - ret = kernel_waitid_prepare(&iwa->wo, iw->which, iw->upid, &iw->info, iw->options, NULL); if (ret) @@ -331,7 +291,7 @@ int io_waitid(struct io_kiocb *req, unsigned int issue_flags) hlist_add_head(&req->hash_node, &ctx->waitid_list); init_waitqueue_func_entry(&iwa->wo.child_wait, io_waitid_wait); - iwa->wo.child_wait.private = req->task; + iwa->wo.child_wait.private = req->tctx->task; iw->head = ¤t->signal->wait_chldexit; add_wait_queue(iw->head, &iwa->wo.child_wait); @@ -363,5 +323,5 @@ done: if (ret < 0) req_set_fail(req); io_req_set_res(req, ret, 0); - return IOU_OK; + return IOU_COMPLETE; } diff --git a/io_uring/waitid.h b/io_uring/waitid.h index 956a8adafe8c..d5544aaf302a 100644 --- a/io_uring/waitid.h +++ b/io_uring/waitid.h @@ -11,5 +11,5 @@ int io_waitid_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_waitid(struct io_kiocb *req, unsigned int issue_flags); int io_waitid_cancel(struct io_ring_ctx *ctx, struct io_cancel_data *cd, unsigned int issue_flags); -bool io_waitid_remove_all(struct io_ring_ctx *ctx, struct task_struct *task, +bool io_waitid_remove_all(struct io_ring_ctx *ctx, struct io_uring_task *tctx, bool cancel_all); diff --git a/io_uring/xattr.c b/io_uring/xattr.c index 6cf41c3bc369..322b94ff9e4b 100644 --- a/io_uring/xattr.c +++ b/io_uring/xattr.c @@ -18,7 +18,7 @@ struct io_xattr { struct file *file; - struct xattr_ctx ctx; + struct kernel_xattr_ctx ctx; struct filename *filename; }; @@ -48,13 +48,10 @@ static int __io_getxattr_prep(struct io_kiocb *req, const char __user *name; int ret; - if (unlikely(req->flags & REQ_F_FIXED_FILE)) - return -EBADF; - ix->filename = NULL; ix->ctx.kvalue = NULL; name = u64_to_user_ptr(READ_ONCE(sqe->addr)); - ix->ctx.cvalue = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + ix->ctx.value = u64_to_user_ptr(READ_ONCE(sqe->addr2)); ix->ctx.size = READ_ONCE(sqe->len); ix->ctx.flags = READ_ONCE(sqe->xattr_flags); @@ -65,11 +62,8 @@ static int __io_getxattr_prep(struct io_kiocb *req, if (!ix->ctx.kname) return -ENOMEM; - ret = strncpy_from_user(ix->ctx.kname->name, name, - sizeof(ix->ctx.kname->name)); - if (!ret || ret == sizeof(ix->ctx.kname->name)) - ret = -ERANGE; - if (ret < 0) { + ret = import_xattr_name(ix->ctx.kname, name); + if (ret) { kfree(ix->ctx.kname); return ret; } @@ -90,19 +84,20 @@ int io_getxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) const char __user *path; int ret; + if (unlikely(req->flags & REQ_F_FIXED_FILE)) + return -EBADF; + ret = __io_getxattr_prep(req, sqe); if (ret) return ret; path = u64_to_user_ptr(READ_ONCE(sqe->addr3)); - ix->filename = getname_flags(path, LOOKUP_FOLLOW); - if (IS_ERR(ix->filename)) { - ret = PTR_ERR(ix->filename); - ix->filename = NULL; - } + ix->filename = getname(path); + if (IS_ERR(ix->filename)) + return PTR_ERR(ix->filename); - return ret; + return 0; } int io_fgetxattr(struct io_kiocb *req, unsigned int issue_flags) @@ -112,37 +107,22 @@ int io_fgetxattr(struct io_kiocb *req, 
unsigned int issue_flags) WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); - ret = do_getxattr(file_mnt_idmap(req->file), - req->file->f_path.dentry, - &ix->ctx); - + ret = file_getxattr(req->file, &ix->ctx); io_xattr_finish(req, ret); - return IOU_OK; + return IOU_COMPLETE; } int io_getxattr(struct io_kiocb *req, unsigned int issue_flags) { struct io_xattr *ix = io_kiocb_to_cmd(req, struct io_xattr); - unsigned int lookup_flags = LOOKUP_FOLLOW; - struct path path; int ret; WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); -retry: - ret = filename_lookup(AT_FDCWD, ix->filename, lookup_flags, &path, NULL); - if (!ret) { - ret = do_getxattr(mnt_idmap(path.mnt), path.dentry, &ix->ctx); - - path_put(&path); - if (retry_estale(ret, lookup_flags)) { - lookup_flags |= LOOKUP_REVAL; - goto retry; - } - } - + ret = filename_getxattr(AT_FDCWD, ix->filename, LOOKUP_FOLLOW, &ix->ctx); + ix->filename = NULL; io_xattr_finish(req, ret); - return IOU_OK; + return IOU_COMPLETE; } static int __io_setxattr_prep(struct io_kiocb *req, @@ -152,9 +132,6 @@ static int __io_setxattr_prep(struct io_kiocb *req, const char __user *name; int ret; - if (unlikely(req->flags & REQ_F_FIXED_FILE)) - return -EBADF; - ix->filename = NULL; name = u64_to_user_ptr(READ_ONCE(sqe->addr)); ix->ctx.cvalue = u64_to_user_ptr(READ_ONCE(sqe->addr2)); @@ -183,19 +160,20 @@ int io_setxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) const char __user *path; int ret; + if (unlikely(req->flags & REQ_F_FIXED_FILE)) + return -EBADF; + ret = __io_setxattr_prep(req, sqe); if (ret) return ret; path = u64_to_user_ptr(READ_ONCE(sqe->addr3)); - ix->filename = getname_flags(path, LOOKUP_FOLLOW); - if (IS_ERR(ix->filename)) { - ret = PTR_ERR(ix->filename); - ix->filename = NULL; - } + ix->filename = getname(path); + if (IS_ERR(ix->filename)) + return PTR_ERR(ix->filename); - return ret; + return 0; } int io_fsetxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -203,52 +181,27 @@ int io_fsetxattr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return __io_setxattr_prep(req, sqe); } -static int __io_setxattr(struct io_kiocb *req, unsigned int issue_flags, - const struct path *path) -{ - struct io_xattr *ix = io_kiocb_to_cmd(req, struct io_xattr); - int ret; - - ret = mnt_want_write(path->mnt); - if (!ret) { - ret = do_setxattr(mnt_idmap(path->mnt), path->dentry, &ix->ctx); - mnt_drop_write(path->mnt); - } - - return ret; -} - int io_fsetxattr(struct io_kiocb *req, unsigned int issue_flags) { + struct io_xattr *ix = io_kiocb_to_cmd(req, struct io_xattr); int ret; WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); - ret = __io_setxattr(req, issue_flags, &req->file->f_path); + ret = file_setxattr(req->file, &ix->ctx); io_xattr_finish(req, ret); - return IOU_OK; + return IOU_COMPLETE; } int io_setxattr(struct io_kiocb *req, unsigned int issue_flags) { struct io_xattr *ix = io_kiocb_to_cmd(req, struct io_xattr); - unsigned int lookup_flags = LOOKUP_FOLLOW; - struct path path; int ret; WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); -retry: - ret = filename_lookup(AT_FDCWD, ix->filename, lookup_flags, &path, NULL); - if (!ret) { - ret = __io_setxattr(req, issue_flags, &path); - path_put(&path); - if (retry_estale(ret, lookup_flags)) { - lookup_flags |= LOOKUP_REVAL; - goto retry; - } - } - + ret = filename_setxattr(AT_FDCWD, ix->filename, LOOKUP_FOLLOW, &ix->ctx); + ix->filename = NULL; io_xattr_finish(req, ret); - return IOU_OK; + return IOU_COMPLETE; } diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c new file mode 
100644 index 000000000000..085eeed8cd50 --- /dev/null +++ b/io_uring/zcrx.c @@ -0,0 +1,1201 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/dma-map-ops.h> +#include <linux/mm.h> +#include <linux/nospec.h> +#include <linux/io_uring.h> +#include <linux/netdevice.h> +#include <linux/rtnetlink.h> +#include <linux/skbuff_ref.h> + +#include <net/page_pool/helpers.h> +#include <net/page_pool/memory_provider.h> +#include <net/netlink.h> +#include <net/netdev_rx_queue.h> +#include <net/tcp.h> +#include <net/rps.h> + +#include <trace/events/page_pool.h> + +#include <uapi/linux/io_uring.h> + +#include "io_uring.h" +#include "kbuf.h" +#include "memmap.h" +#include "zcrx.h" +#include "rsrc.h" + +#define IO_DMA_ATTR (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING) + +static inline struct io_zcrx_ifq *io_pp_to_ifq(struct page_pool *pp) +{ + return pp->mp_priv; +} + +static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *niov) +{ + struct net_iov_area *owner = net_iov_owner(niov); + + return container_of(owner, struct io_zcrx_area, nia); +} + +static inline struct page *io_zcrx_iov_page(const struct net_iov *niov) +{ + struct io_zcrx_area *area = io_zcrx_iov_to_area(niov); + + return area->mem.pages[net_iov_idx(niov)]; +} + +static void io_release_dmabuf(struct io_zcrx_mem *mem) +{ + if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER)) + return; + + if (mem->sgt) + dma_buf_unmap_attachment_unlocked(mem->attach, mem->sgt, + DMA_FROM_DEVICE); + if (mem->attach) + dma_buf_detach(mem->dmabuf, mem->attach); + if (mem->dmabuf) + dma_buf_put(mem->dmabuf); + + mem->sgt = NULL; + mem->attach = NULL; + mem->dmabuf = NULL; +} + +static int io_import_dmabuf(struct io_zcrx_ifq *ifq, + struct io_zcrx_mem *mem, + struct io_uring_zcrx_area_reg *area_reg) +{ + unsigned long off = (unsigned long)area_reg->addr; + unsigned long len = (unsigned long)area_reg->len; + unsigned long total_size = 0; + struct scatterlist *sg; + int dmabuf_fd = area_reg->dmabuf_fd; + int i, ret; + + if (WARN_ON_ONCE(!ifq->dev)) + return -EFAULT; + if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER)) + return -EINVAL; + + mem->is_dmabuf = true; + mem->dmabuf = dma_buf_get(dmabuf_fd); + if (IS_ERR(mem->dmabuf)) { + ret = PTR_ERR(mem->dmabuf); + mem->dmabuf = NULL; + goto err; + } + + mem->attach = dma_buf_attach(mem->dmabuf, ifq->dev); + if (IS_ERR(mem->attach)) { + ret = PTR_ERR(mem->attach); + mem->attach = NULL; + goto err; + } + + mem->sgt = dma_buf_map_attachment_unlocked(mem->attach, DMA_FROM_DEVICE); + if (IS_ERR(mem->sgt)) { + ret = PTR_ERR(mem->sgt); + mem->sgt = NULL; + goto err; + } + + for_each_sgtable_dma_sg(mem->sgt, sg, i) + total_size += sg_dma_len(sg); + + if (total_size < off + len) { + ret = -EINVAL; + goto err; + } + + mem->dmabuf_offset = off; + mem->size = len; + return 0; +err: + io_release_dmabuf(mem); + return ret; +} + +static int io_zcrx_map_area_dmabuf(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area) +{ + unsigned long off = area->mem.dmabuf_offset; + struct scatterlist *sg; + unsigned i, niov_idx = 0; + + if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER)) + return -EINVAL; + + for_each_sgtable_dma_sg(area->mem.sgt, sg, i) { + dma_addr_t dma = sg_dma_address(sg); + unsigned long sg_len = sg_dma_len(sg); + unsigned long sg_off = min(sg_len, off); + + off -= sg_off; + sg_len -= sg_off; + dma += sg_off; + + while (sg_len && niov_idx < area->nia.num_niovs) { + struct net_iov *niov = &area->nia.niovs[niov_idx]; + + if (net_mp_niov_set_dma_addr(niov, dma)) + return 0; 
+ sg_len -= PAGE_SIZE; + dma += PAGE_SIZE; + niov_idx++; + } + } + return niov_idx; +} + +static int io_import_umem(struct io_zcrx_ifq *ifq, + struct io_zcrx_mem *mem, + struct io_uring_zcrx_area_reg *area_reg) +{ + struct page **pages; + int nr_pages; + + if (area_reg->dmabuf_fd) + return -EINVAL; + if (!area_reg->addr) + return -EFAULT; + pages = io_pin_pages((unsigned long)area_reg->addr, area_reg->len, + &nr_pages); + if (IS_ERR(pages)) + return PTR_ERR(pages); + + mem->pages = pages; + mem->nr_folios = nr_pages; + mem->size = area_reg->len; + return 0; +} + +static void io_release_area_mem(struct io_zcrx_mem *mem) +{ + if (mem->is_dmabuf) { + io_release_dmabuf(mem); + return; + } + if (mem->pages) { + unpin_user_pages(mem->pages, mem->nr_folios); + kvfree(mem->pages); + } +} + +static int io_import_area(struct io_zcrx_ifq *ifq, + struct io_zcrx_mem *mem, + struct io_uring_zcrx_area_reg *area_reg) +{ + int ret; + + ret = io_validate_user_buf_range(area_reg->addr, area_reg->len); + if (ret) + return ret; + if (area_reg->addr & ~PAGE_MASK || area_reg->len & ~PAGE_MASK) + return -EINVAL; + + if (area_reg->flags & IORING_ZCRX_AREA_DMABUF) + return io_import_dmabuf(ifq, mem, area_reg); + return io_import_umem(ifq, mem, area_reg); +} + +static void io_zcrx_unmap_umem(struct io_zcrx_ifq *ifq, + struct io_zcrx_area *area, int nr_mapped) +{ + int i; + + for (i = 0; i < nr_mapped; i++) { + netmem_ref netmem = net_iov_to_netmem(&area->nia.niovs[i]); + dma_addr_t dma = page_pool_get_dma_addr_netmem(netmem); + + dma_unmap_page_attrs(ifq->dev, dma, PAGE_SIZE, + DMA_FROM_DEVICE, IO_DMA_ATTR); + } +} + +static void __io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, + struct io_zcrx_area *area, int nr_mapped) +{ + int i; + + if (area->mem.is_dmabuf) + io_release_dmabuf(&area->mem); + else + io_zcrx_unmap_umem(ifq, area, nr_mapped); + + for (i = 0; i < area->nia.num_niovs; i++) + net_mp_niov_set_dma_addr(&area->nia.niovs[i], 0); +} + +static void io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area) +{ + guard(mutex)(&ifq->dma_lock); + + if (area->is_mapped) + __io_zcrx_unmap_area(ifq, area, area->nia.num_niovs); + area->is_mapped = false; +} + +static int io_zcrx_map_area_umem(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area) +{ + int i; + + for (i = 0; i < area->nia.num_niovs; i++) { + struct net_iov *niov = &area->nia.niovs[i]; + dma_addr_t dma; + + dma = dma_map_page_attrs(ifq->dev, area->mem.pages[i], 0, + PAGE_SIZE, DMA_FROM_DEVICE, IO_DMA_ATTR); + if (dma_mapping_error(ifq->dev, dma)) + break; + if (net_mp_niov_set_dma_addr(niov, dma)) { + dma_unmap_page_attrs(ifq->dev, dma, PAGE_SIZE, + DMA_FROM_DEVICE, IO_DMA_ATTR); + break; + } + } + return i; +} + +static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area) +{ + unsigned nr; + + guard(mutex)(&ifq->dma_lock); + if (area->is_mapped) + return 0; + + if (area->mem.is_dmabuf) + nr = io_zcrx_map_area_dmabuf(ifq, area); + else + nr = io_zcrx_map_area_umem(ifq, area); + + if (nr != area->nia.num_niovs) { + __io_zcrx_unmap_area(ifq, area, nr); + return -EINVAL; + } + + area->is_mapped = true; + return 0; +} + +static void io_zcrx_sync_for_device(const struct page_pool *pool, + struct net_iov *niov) +{ +#if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC) + dma_addr_t dma_addr; + + if (!dma_dev_need_sync(pool->p.dev)) + return; + + dma_addr = page_pool_get_dma_addr_netmem(net_iov_to_netmem(niov)); + __dma_sync_single_for_device(pool->p.dev, dma_addr + pool->p.offset, + PAGE_SIZE, pool->p.dma_dir); +#endif +} + 
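
For context, and not part of the patch itself: a minimal userspace sketch of describing a plain user-memory area that would pass the io_import_area()/io_import_umem() checks above (page-aligned addr and len, dmabuf_fd left at 0 when IORING_ZCRX_AREA_DMABUF is not set). The struct io_uring_zcrx_area_reg layout and the flag name come from the io_uring uapi header, which is outside this diff; the actual registration path is io_register_zcrx_ifq() further down.

#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <linux/io_uring.h>

/* Sketch only: fill an area descriptor for host memory, per the checks above. */
static int zcrx_fill_area_reg(struct io_uring_zcrx_area_reg *area,
			      size_t area_size)
{
	long page_size = sysconf(_SC_PAGESIZE);
	void *buf;

	/* io_import_area() rejects an addr or len that is not page aligned */
	area_size = (area_size + page_size - 1) & ~(size_t)(page_size - 1);
	if (posix_memalign(&buf, page_size, area_size))
		return -1;

	memset(area, 0, sizeof(*area));
	area->addr = (uintptr_t)buf;
	area->len = area_size;
	area->flags = 0;	/* host memory; IORING_ZCRX_AREA_DMABUF unset */
	area->dmabuf_fd = 0;	/* io_import_umem() requires this to be 0 */
	/* rq_area_token is written back by the kernel on registration */
	return 0;
}
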
+#define IO_RQ_MAX_ENTRIES 32768 + +#define IO_SKBS_PER_CALL_LIMIT 20 + +struct io_zcrx_args { + struct io_kiocb *req; + struct io_zcrx_ifq *ifq; + struct socket *sock; + unsigned nr_skbs; +}; + +static const struct memory_provider_ops io_uring_pp_zc_ops; + +static inline atomic_t *io_get_user_counter(struct net_iov *niov) +{ + struct io_zcrx_area *area = io_zcrx_iov_to_area(niov); + + return &area->user_refs[net_iov_idx(niov)]; +} + +static bool io_zcrx_put_niov_uref(struct net_iov *niov) +{ + atomic_t *uref = io_get_user_counter(niov); + + if (unlikely(!atomic_read(uref))) + return false; + atomic_dec(uref); + return true; +} + +static void io_zcrx_get_niov_uref(struct net_iov *niov) +{ + atomic_inc(io_get_user_counter(niov)); +} + +static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq, + struct io_uring_zcrx_ifq_reg *reg, + struct io_uring_region_desc *rd, + u32 id) +{ + u64 mmap_offset; + size_t off, size; + void *ptr; + int ret; + + off = sizeof(struct io_uring); + size = off + sizeof(struct io_uring_zcrx_rqe) * reg->rq_entries; + if (size > rd->size) + return -EINVAL; + + mmap_offset = IORING_MAP_OFF_ZCRX_REGION; + mmap_offset += id << IORING_OFF_PBUF_SHIFT; + + ret = io_create_region(ifq->ctx, &ifq->region, rd, mmap_offset); + if (ret < 0) + return ret; + + ptr = io_region_get_ptr(&ifq->region); + ifq->rq_ring = (struct io_uring *)ptr; + ifq->rqes = (struct io_uring_zcrx_rqe *)(ptr + off); + return 0; +} + +static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq) +{ + io_free_region(ifq->ctx, &ifq->region); + ifq->rq_ring = NULL; + ifq->rqes = NULL; +} + +static void io_zcrx_free_area(struct io_zcrx_area *area) +{ + if (area->ifq) + io_zcrx_unmap_area(area->ifq, area); + io_release_area_mem(&area->mem); + + kvfree(area->freelist); + kvfree(area->nia.niovs); + kvfree(area->user_refs); + kfree(area); +} + +#define IO_ZCRX_AREA_SUPPORTED_FLAGS (IORING_ZCRX_AREA_DMABUF) + +static int io_zcrx_create_area(struct io_zcrx_ifq *ifq, + struct io_zcrx_area **res, + struct io_uring_zcrx_area_reg *area_reg) +{ + struct io_zcrx_area *area; + unsigned nr_iovs; + int i, ret; + + if (area_reg->flags & ~IO_ZCRX_AREA_SUPPORTED_FLAGS) + return -EINVAL; + if (area_reg->rq_area_token) + return -EINVAL; + if (area_reg->__resv2[0] || area_reg->__resv2[1]) + return -EINVAL; + + ret = -ENOMEM; + area = kzalloc(sizeof(*area), GFP_KERNEL); + if (!area) + goto err; + + ret = io_import_area(ifq, &area->mem, area_reg); + if (ret) + goto err; + + nr_iovs = area->mem.size >> PAGE_SHIFT; + area->nia.num_niovs = nr_iovs; + + ret = -ENOMEM; + area->nia.niovs = kvmalloc_array(nr_iovs, sizeof(area->nia.niovs[0]), + GFP_KERNEL | __GFP_ZERO); + if (!area->nia.niovs) + goto err; + + area->freelist = kvmalloc_array(nr_iovs, sizeof(area->freelist[0]), + GFP_KERNEL | __GFP_ZERO); + if (!area->freelist) + goto err; + + area->user_refs = kvmalloc_array(nr_iovs, sizeof(area->user_refs[0]), + GFP_KERNEL | __GFP_ZERO); + if (!area->user_refs) + goto err; + + for (i = 0; i < nr_iovs; i++) { + struct net_iov *niov = &area->nia.niovs[i]; + + niov->owner = &area->nia; + area->freelist[i] = i; + atomic_set(&area->user_refs[i], 0); + niov->type = NET_IOV_IOURING; + } + + area->free_count = nr_iovs; + area->ifq = ifq; + /* we're only supporting one area per ifq for now */ + area->area_id = 0; + area_reg->rq_area_token = (u64)area->area_id << IORING_ZCRX_AREA_SHIFT; + spin_lock_init(&area->freelist_lock); + *res = area; + return 0; +err: + if (area) + io_zcrx_free_area(area); + return ret; +} + +static struct io_zcrx_ifq 
*io_zcrx_ifq_alloc(struct io_ring_ctx *ctx) +{ + struct io_zcrx_ifq *ifq; + + ifq = kzalloc(sizeof(*ifq), GFP_KERNEL); + if (!ifq) + return NULL; + + ifq->if_rxq = -1; + ifq->ctx = ctx; + spin_lock_init(&ifq->lock); + spin_lock_init(&ifq->rq_lock); + mutex_init(&ifq->dma_lock); + return ifq; +} + +static void io_zcrx_drop_netdev(struct io_zcrx_ifq *ifq) +{ + spin_lock(&ifq->lock); + if (ifq->netdev) { + netdev_put(ifq->netdev, &ifq->netdev_tracker); + ifq->netdev = NULL; + } + spin_unlock(&ifq->lock); +} + +static void io_close_queue(struct io_zcrx_ifq *ifq) +{ + struct net_device *netdev; + netdevice_tracker netdev_tracker; + struct pp_memory_provider_params p = { + .mp_ops = &io_uring_pp_zc_ops, + .mp_priv = ifq, + }; + + if (ifq->if_rxq == -1) + return; + + spin_lock(&ifq->lock); + netdev = ifq->netdev; + netdev_tracker = ifq->netdev_tracker; + ifq->netdev = NULL; + spin_unlock(&ifq->lock); + + if (netdev) { + net_mp_close_rxq(netdev, ifq->if_rxq, &p); + netdev_put(netdev, &netdev_tracker); + } + ifq->if_rxq = -1; +} + +static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq) +{ + io_close_queue(ifq); + io_zcrx_drop_netdev(ifq); + + if (ifq->area) + io_zcrx_free_area(ifq->area); + if (ifq->dev) + put_device(ifq->dev); + + io_free_rbuf_ring(ifq); + mutex_destroy(&ifq->dma_lock); + kfree(ifq); +} + +struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx, + unsigned int id) +{ + struct io_zcrx_ifq *ifq = xa_load(&ctx->zcrx_ctxs, id); + + lockdep_assert_held(&ctx->mmap_lock); + + return ifq ? &ifq->region : NULL; +} + +int io_register_zcrx_ifq(struct io_ring_ctx *ctx, + struct io_uring_zcrx_ifq_reg __user *arg) +{ + struct pp_memory_provider_params mp_param = {}; + struct io_uring_zcrx_area_reg area; + struct io_uring_zcrx_ifq_reg reg; + struct io_uring_region_desc rd; + struct io_zcrx_ifq *ifq; + int ret; + u32 id; + + /* + * 1. Interface queue allocation. + * 2. It can observe data destined for sockets of other tasks. 
+ */ + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + /* mandatory io_uring features for zc rx */ + if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN && + ctx->flags & IORING_SETUP_CQE32)) + return -EINVAL; + if (copy_from_user(®, arg, sizeof(reg))) + return -EFAULT; + if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd))) + return -EFAULT; + if (memchr_inv(®.__resv, 0, sizeof(reg.__resv)) || + reg.__resv2 || reg.zcrx_id) + return -EINVAL; + if (reg.if_rxq == -1 || !reg.rq_entries || reg.flags) + return -EINVAL; + if (reg.rq_entries > IO_RQ_MAX_ENTRIES) { + if (!(ctx->flags & IORING_SETUP_CLAMP)) + return -EINVAL; + reg.rq_entries = IO_RQ_MAX_ENTRIES; + } + reg.rq_entries = roundup_pow_of_two(reg.rq_entries); + + if (copy_from_user(&area, u64_to_user_ptr(reg.area_ptr), sizeof(area))) + return -EFAULT; + + ifq = io_zcrx_ifq_alloc(ctx); + if (!ifq) + return -ENOMEM; + ifq->rq_entries = reg.rq_entries; + + scoped_guard(mutex, &ctx->mmap_lock) { + /* preallocate id */ + ret = xa_alloc(&ctx->zcrx_ctxs, &id, NULL, xa_limit_31b, GFP_KERNEL); + if (ret) + goto ifq_free; + } + + ret = io_allocate_rbuf_ring(ifq, ®, &rd, id); + if (ret) + goto err; + + ifq->netdev = netdev_get_by_index(current->nsproxy->net_ns, reg.if_idx, + &ifq->netdev_tracker, GFP_KERNEL); + if (!ifq->netdev) { + ret = -ENODEV; + goto err; + } + + ifq->dev = ifq->netdev->dev.parent; + if (!ifq->dev) { + ret = -EOPNOTSUPP; + goto err; + } + get_device(ifq->dev); + + ret = io_zcrx_create_area(ifq, &ifq->area, &area); + if (ret) + goto err; + + mp_param.mp_ops = &io_uring_pp_zc_ops; + mp_param.mp_priv = ifq; + ret = net_mp_open_rxq(ifq->netdev, reg.if_rxq, &mp_param); + if (ret) + goto err; + ifq->if_rxq = reg.if_rxq; + + reg.offsets.rqes = sizeof(struct io_uring); + reg.offsets.head = offsetof(struct io_uring, head); + reg.offsets.tail = offsetof(struct io_uring, tail); + reg.zcrx_id = id; + + scoped_guard(mutex, &ctx->mmap_lock) { + /* publish ifq */ + ret = -ENOMEM; + if (xa_store(&ctx->zcrx_ctxs, id, ifq, GFP_KERNEL)) + goto err; + } + + if (copy_to_user(arg, ®, sizeof(reg)) || + copy_to_user(u64_to_user_ptr(reg.region_ptr), &rd, sizeof(rd)) || + copy_to_user(u64_to_user_ptr(reg.area_ptr), &area, sizeof(area))) { + ret = -EFAULT; + goto err; + } + return 0; +err: + scoped_guard(mutex, &ctx->mmap_lock) + xa_erase(&ctx->zcrx_ctxs, id); +ifq_free: + io_zcrx_ifq_free(ifq); + return ret; +} + +void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx) +{ + struct io_zcrx_ifq *ifq; + + lockdep_assert_held(&ctx->uring_lock); + + while (1) { + scoped_guard(mutex, &ctx->mmap_lock) { + unsigned long id = 0; + + ifq = xa_find(&ctx->zcrx_ctxs, &id, ULONG_MAX, XA_PRESENT); + if (ifq) + xa_erase(&ctx->zcrx_ctxs, id); + } + if (!ifq) + break; + io_zcrx_ifq_free(ifq); + } + + xa_destroy(&ctx->zcrx_ctxs); +} + +static struct net_iov *__io_zcrx_get_free_niov(struct io_zcrx_area *area) +{ + unsigned niov_idx; + + lockdep_assert_held(&area->freelist_lock); + + niov_idx = area->freelist[--area->free_count]; + return &area->nia.niovs[niov_idx]; +} + +static void io_zcrx_return_niov_freelist(struct net_iov *niov) +{ + struct io_zcrx_area *area = io_zcrx_iov_to_area(niov); + + spin_lock_bh(&area->freelist_lock); + area->freelist[area->free_count++] = net_iov_idx(niov); + spin_unlock_bh(&area->freelist_lock); +} + +static void io_zcrx_return_niov(struct net_iov *niov) +{ + netmem_ref netmem = net_iov_to_netmem(niov); + + if (!niov->pp) { + /* copy fallback allocated niovs */ + io_zcrx_return_niov_freelist(niov); + return; + } + 
page_pool_put_unrefed_netmem(niov->pp, netmem, -1, false); +} + +static void io_zcrx_scrub(struct io_zcrx_ifq *ifq) +{ + struct io_zcrx_area *area = ifq->area; + int i; + + if (!area) + return; + + /* Reclaim back all buffers given to the user space. */ + for (i = 0; i < area->nia.num_niovs; i++) { + struct net_iov *niov = &area->nia.niovs[i]; + int nr; + + if (!atomic_read(io_get_user_counter(niov))) + continue; + nr = atomic_xchg(io_get_user_counter(niov), 0); + if (nr && !page_pool_unref_netmem(net_iov_to_netmem(niov), nr)) + io_zcrx_return_niov(niov); + } +} + +void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx) +{ + struct io_zcrx_ifq *ifq; + unsigned long index; + + lockdep_assert_held(&ctx->uring_lock); + + xa_for_each(&ctx->zcrx_ctxs, index, ifq) { + io_zcrx_scrub(ifq); + io_close_queue(ifq); + } +} + +static inline u32 io_zcrx_rqring_entries(struct io_zcrx_ifq *ifq) +{ + u32 entries; + + entries = smp_load_acquire(&ifq->rq_ring->tail) - ifq->cached_rq_head; + return min(entries, ifq->rq_entries); +} + +static struct io_uring_zcrx_rqe *io_zcrx_get_rqe(struct io_zcrx_ifq *ifq, + unsigned mask) +{ + unsigned int idx = ifq->cached_rq_head++ & mask; + + return &ifq->rqes[idx]; +} + +static void io_zcrx_ring_refill(struct page_pool *pp, + struct io_zcrx_ifq *ifq) +{ + unsigned int mask = ifq->rq_entries - 1; + unsigned int entries; + netmem_ref netmem; + + spin_lock_bh(&ifq->rq_lock); + + entries = io_zcrx_rqring_entries(ifq); + entries = min_t(unsigned, entries, PP_ALLOC_CACHE_REFILL - pp->alloc.count); + if (unlikely(!entries)) { + spin_unlock_bh(&ifq->rq_lock); + return; + } + + do { + struct io_uring_zcrx_rqe *rqe = io_zcrx_get_rqe(ifq, mask); + struct io_zcrx_area *area; + struct net_iov *niov; + unsigned niov_idx, area_idx; + + area_idx = rqe->off >> IORING_ZCRX_AREA_SHIFT; + niov_idx = (rqe->off & ~IORING_ZCRX_AREA_MASK) >> PAGE_SHIFT; + + if (unlikely(rqe->__pad || area_idx)) + continue; + area = ifq->area; + + if (unlikely(niov_idx >= area->nia.num_niovs)) + continue; + niov_idx = array_index_nospec(niov_idx, area->nia.num_niovs); + + niov = &area->nia.niovs[niov_idx]; + if (!io_zcrx_put_niov_uref(niov)) + continue; + + netmem = net_iov_to_netmem(niov); + if (page_pool_unref_netmem(netmem, 1) != 0) + continue; + + if (unlikely(niov->pp != pp)) { + io_zcrx_return_niov(niov); + continue; + } + + io_zcrx_sync_for_device(pp, niov); + net_mp_netmem_place_in_cache(pp, netmem); + } while (--entries); + + smp_store_release(&ifq->rq_ring->head, ifq->cached_rq_head); + spin_unlock_bh(&ifq->rq_lock); +} + +static void io_zcrx_refill_slow(struct page_pool *pp, struct io_zcrx_ifq *ifq) +{ + struct io_zcrx_area *area = ifq->area; + + spin_lock_bh(&area->freelist_lock); + while (area->free_count && pp->alloc.count < PP_ALLOC_CACHE_REFILL) { + struct net_iov *niov = __io_zcrx_get_free_niov(area); + netmem_ref netmem = net_iov_to_netmem(niov); + + net_mp_niov_set_page_pool(pp, niov); + io_zcrx_sync_for_device(pp, niov); + net_mp_netmem_place_in_cache(pp, netmem); + } + spin_unlock_bh(&area->freelist_lock); +} + +static netmem_ref io_pp_zc_alloc_netmems(struct page_pool *pp, gfp_t gfp) +{ + struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp); + + /* pp should already be ensuring that */ + if (unlikely(pp->alloc.count)) + goto out_return; + + io_zcrx_ring_refill(pp, ifq); + if (likely(pp->alloc.count)) + goto out_return; + + io_zcrx_refill_slow(pp, ifq); + if (!pp->alloc.count) + return 0; +out_return: + return pp->alloc.cache[--pp->alloc.count]; +} + +static bool io_pp_zc_release_netmem(struct page_pool 
*pp, netmem_ref netmem) +{ + struct net_iov *niov; + + if (WARN_ON_ONCE(!netmem_is_net_iov(netmem))) + return false; + + niov = netmem_to_net_iov(netmem); + net_mp_niov_clear_page_pool(niov); + io_zcrx_return_niov_freelist(niov); + return false; +} + +static int io_pp_zc_init(struct page_pool *pp) +{ + struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp); + int ret; + + if (WARN_ON_ONCE(!ifq)) + return -EINVAL; + if (WARN_ON_ONCE(ifq->dev != pp->p.dev)) + return -EINVAL; + if (WARN_ON_ONCE(!pp->dma_map)) + return -EOPNOTSUPP; + if (pp->p.order != 0) + return -EOPNOTSUPP; + if (pp->p.dma_dir != DMA_FROM_DEVICE) + return -EOPNOTSUPP; + + ret = io_zcrx_map_area(ifq, ifq->area); + if (ret) + return ret; + + percpu_ref_get(&ifq->ctx->refs); + return 0; +} + +static void io_pp_zc_destroy(struct page_pool *pp) +{ + struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp); + struct io_zcrx_area *area = ifq->area; + + if (WARN_ON_ONCE(area->free_count != area->nia.num_niovs)) + return; + percpu_ref_put(&ifq->ctx->refs); +} + +static int io_pp_nl_fill(void *mp_priv, struct sk_buff *rsp, + struct netdev_rx_queue *rxq) +{ + struct nlattr *nest; + int type; + + type = rxq ? NETDEV_A_QUEUE_IO_URING : NETDEV_A_PAGE_POOL_IO_URING; + nest = nla_nest_start(rsp, type); + if (!nest) + return -EMSGSIZE; + nla_nest_end(rsp, nest); + + return 0; +} + +static void io_pp_uninstall(void *mp_priv, struct netdev_rx_queue *rxq) +{ + struct pp_memory_provider_params *p = &rxq->mp_params; + struct io_zcrx_ifq *ifq = mp_priv; + + io_zcrx_drop_netdev(ifq); + if (ifq->area) + io_zcrx_unmap_area(ifq, ifq->area); + + p->mp_ops = NULL; + p->mp_priv = NULL; +} + +static const struct memory_provider_ops io_uring_pp_zc_ops = { + .alloc_netmems = io_pp_zc_alloc_netmems, + .release_netmem = io_pp_zc_release_netmem, + .init = io_pp_zc_init, + .destroy = io_pp_zc_destroy, + .nl_fill = io_pp_nl_fill, + .uninstall = io_pp_uninstall, +}; + +static bool io_zcrx_queue_cqe(struct io_kiocb *req, struct net_iov *niov, + struct io_zcrx_ifq *ifq, int off, int len) +{ + struct io_uring_zcrx_cqe *rcqe; + struct io_zcrx_area *area; + struct io_uring_cqe *cqe; + u64 offset; + + if (!io_defer_get_uncommited_cqe(req->ctx, &cqe)) + return false; + + cqe->user_data = req->cqe.user_data; + cqe->res = len; + cqe->flags = IORING_CQE_F_MORE; + + area = io_zcrx_iov_to_area(niov); + offset = off + (net_iov_idx(niov) << PAGE_SHIFT); + rcqe = (struct io_uring_zcrx_cqe *)(cqe + 1); + rcqe->off = offset + ((u64)area->area_id << IORING_ZCRX_AREA_SHIFT); + rcqe->__pad = 0; + return true; +} + +static struct net_iov *io_zcrx_alloc_fallback(struct io_zcrx_area *area) +{ + struct net_iov *niov = NULL; + + spin_lock_bh(&area->freelist_lock); + if (area->free_count) + niov = __io_zcrx_get_free_niov(area); + spin_unlock_bh(&area->freelist_lock); + + if (niov) + page_pool_fragment_netmem(net_iov_to_netmem(niov), 1); + return niov; +} + +static ssize_t io_zcrx_copy_chunk(struct io_kiocb *req, struct io_zcrx_ifq *ifq, + void *src_base, struct page *src_page, + unsigned int src_offset, size_t len) +{ + struct io_zcrx_area *area = ifq->area; + size_t copied = 0; + int ret = 0; + + if (area->mem.is_dmabuf) + return -EFAULT; + + while (len) { + size_t copy_size = min_t(size_t, PAGE_SIZE, len); + const int dst_off = 0; + struct net_iov *niov; + struct page *dst_page; + void *dst_addr; + + niov = io_zcrx_alloc_fallback(area); + if (!niov) { + ret = -ENOMEM; + break; + } + + dst_page = io_zcrx_iov_page(niov); + dst_addr = kmap_local_page(dst_page); + if (src_page) + src_base = 
kmap_local_page(src_page); + + memcpy(dst_addr, src_base + src_offset, copy_size); + + if (src_page) + kunmap_local(src_base); + kunmap_local(dst_addr); + + if (!io_zcrx_queue_cqe(req, niov, ifq, dst_off, copy_size)) { + io_zcrx_return_niov(niov); + ret = -ENOSPC; + break; + } + + io_zcrx_get_niov_uref(niov); + src_offset += copy_size; + len -= copy_size; + copied += copy_size; + } + + return copied ? copied : ret; +} + +static int io_zcrx_copy_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq, + const skb_frag_t *frag, int off, int len) +{ + struct page *page = skb_frag_page(frag); + u32 p_off, p_len, t, copied = 0; + int ret = 0; + + off += skb_frag_off(frag); + + skb_frag_foreach_page(frag, off, len, + page, p_off, p_len, t) { + ret = io_zcrx_copy_chunk(req, ifq, NULL, page, p_off, p_len); + if (ret < 0) + return copied ? copied : ret; + copied += ret; + } + return copied; +} + +static int io_zcrx_recv_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq, + const skb_frag_t *frag, int off, int len) +{ + struct net_iov *niov; + + if (unlikely(!skb_frag_is_net_iov(frag))) + return io_zcrx_copy_frag(req, ifq, frag, off, len); + + niov = netmem_to_net_iov(frag->netmem); + if (!niov->pp || niov->pp->mp_ops != &io_uring_pp_zc_ops || + io_pp_to_ifq(niov->pp) != ifq) + return -EFAULT; + + if (!io_zcrx_queue_cqe(req, niov, ifq, off + skb_frag_off(frag), len)) + return -ENOSPC; + + /* + * Prevent it from being recycled while user is accessing it. + * It has to be done before grabbing a user reference. + */ + page_pool_ref_netmem(net_iov_to_netmem(niov)); + io_zcrx_get_niov_uref(niov); + return len; +} + +static int +io_zcrx_recv_skb(read_descriptor_t *desc, struct sk_buff *skb, + unsigned int offset, size_t len) +{ + struct io_zcrx_args *args = desc->arg.data; + struct io_zcrx_ifq *ifq = args->ifq; + struct io_kiocb *req = args->req; + struct sk_buff *frag_iter; + unsigned start, start_off = offset; + int i, copy, end, off; + int ret = 0; + + len = min_t(size_t, len, desc->count); + /* + * __tcp_read_sock() always calls io_zcrx_recv_skb one last time, even + * if desc->count is already 0. This is caused by the if (offset + 1 != + * skb->len) check. Return early in this case to break out of + * __tcp_read_sock(). 
+ */ + if (!len) + return 0; + if (unlikely(args->nr_skbs++ > IO_SKBS_PER_CALL_LIMIT)) + return -EAGAIN; + + if (unlikely(offset < skb_headlen(skb))) { + ssize_t copied; + size_t to_copy; + + to_copy = min_t(size_t, skb_headlen(skb) - offset, len); + copied = io_zcrx_copy_chunk(req, ifq, skb->data, NULL, + offset, to_copy); + if (copied < 0) { + ret = copied; + goto out; + } + offset += copied; + len -= copied; + if (!len) + goto out; + if (offset != skb_headlen(skb)) + goto out; + } + + start = skb_headlen(skb); + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + const skb_frag_t *frag; + + if (WARN_ON(start > offset + len)) + return -EFAULT; + + frag = &skb_shinfo(skb)->frags[i]; + end = start + skb_frag_size(frag); + + if (offset < end) { + copy = end - offset; + if (copy > len) + copy = len; + + off = offset - start; + ret = io_zcrx_recv_frag(req, ifq, frag, off, copy); + if (ret < 0) + goto out; + + offset += ret; + len -= ret; + if (len == 0 || ret != copy) + goto out; + } + start = end; + } + + skb_walk_frags(skb, frag_iter) { + if (WARN_ON(start > offset + len)) + return -EFAULT; + + end = start + frag_iter->len; + if (offset < end) { + copy = end - offset; + if (copy > len) + copy = len; + + off = offset - start; + ret = io_zcrx_recv_skb(desc, frag_iter, off, copy); + if (ret < 0) + goto out; + + offset += ret; + len -= ret; + if (len == 0 || ret != copy) + goto out; + } + start = end; + } + +out: + if (offset == start_off) + return ret; + desc->count -= (offset - start_off); + return offset - start_off; +} + +static int io_zcrx_tcp_recvmsg(struct io_kiocb *req, struct io_zcrx_ifq *ifq, + struct sock *sk, int flags, + unsigned issue_flags, unsigned int *outlen) +{ + unsigned int len = *outlen; + struct io_zcrx_args args = { + .req = req, + .ifq = ifq, + .sock = sk->sk_socket, + }; + read_descriptor_t rd_desc = { + .count = len ? len : UINT_MAX, + .arg.data = &args, + }; + int ret; + + lock_sock(sk); + ret = tcp_read_sock(sk, &rd_desc, io_zcrx_recv_skb); + if (len && ret > 0) + *outlen = len - ret; + if (ret <= 0) { + if (ret < 0 || sock_flag(sk, SOCK_DONE)) + goto out; + if (sk->sk_err) + ret = sock_error(sk); + else if (sk->sk_shutdown & RCV_SHUTDOWN) + goto out; + else if (sk->sk_state == TCP_CLOSE) + ret = -ENOTCONN; + else + ret = -EAGAIN; + } else if (unlikely(args.nr_skbs > IO_SKBS_PER_CALL_LIMIT) && + (issue_flags & IO_URING_F_MULTISHOT)) { + ret = IOU_REQUEUE; + } else if (sock_flag(sk, SOCK_DONE)) { + /* Make it to retry until it finally gets 0. 
*/ + if (issue_flags & IO_URING_F_MULTISHOT) + ret = IOU_REQUEUE; + else + ret = -EAGAIN; + } +out: + release_sock(sk); + return ret; +} + +int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq, + struct socket *sock, unsigned int flags, + unsigned issue_flags, unsigned int *len) +{ + struct sock *sk = sock->sk; + const struct proto *prot = READ_ONCE(sk->sk_prot); + + if (prot->recvmsg != tcp_recvmsg) + return -EPROTONOSUPPORT; + + sock_rps_record_flow(sk); + return io_zcrx_tcp_recvmsg(req, ifq, sk, flags, issue_flags, len); +} diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h new file mode 100644 index 000000000000..2f5e26389f22 --- /dev/null +++ b/io_uring/zcrx.h @@ -0,0 +1,97 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef IOU_ZC_RX_H +#define IOU_ZC_RX_H + +#include <linux/io_uring_types.h> +#include <linux/dma-buf.h> +#include <linux/socket.h> +#include <net/page_pool/types.h> +#include <net/net_trackers.h> + +struct io_zcrx_mem { + unsigned long size; + bool is_dmabuf; + + struct page **pages; + unsigned long nr_folios; + + struct dma_buf_attachment *attach; + struct dma_buf *dmabuf; + struct sg_table *sgt; + unsigned long dmabuf_offset; +}; + +struct io_zcrx_area { + struct net_iov_area nia; + struct io_zcrx_ifq *ifq; + atomic_t *user_refs; + + bool is_mapped; + u16 area_id; + + /* freelist */ + spinlock_t freelist_lock ____cacheline_aligned_in_smp; + u32 free_count; + u32 *freelist; + + struct io_zcrx_mem mem; +}; + +struct io_zcrx_ifq { + struct io_ring_ctx *ctx; + struct io_zcrx_area *area; + + spinlock_t rq_lock ____cacheline_aligned_in_smp; + struct io_uring *rq_ring; + struct io_uring_zcrx_rqe *rqes; + u32 cached_rq_head; + u32 rq_entries; + + u32 if_rxq; + struct device *dev; + struct net_device *netdev; + netdevice_tracker netdev_tracker; + spinlock_t lock; + struct mutex dma_lock; + struct io_mapped_region region; +}; + +#if defined(CONFIG_IO_URING_ZCRX) +int io_register_zcrx_ifq(struct io_ring_ctx *ctx, + struct io_uring_zcrx_ifq_reg __user *arg); +void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx); +void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx); +int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq, + struct socket *sock, unsigned int flags, + unsigned issue_flags, unsigned int *len); +struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx, + unsigned int id); +#else +static inline int io_register_zcrx_ifq(struct io_ring_ctx *ctx, + struct io_uring_zcrx_ifq_reg __user *arg) +{ + return -EOPNOTSUPP; +} +static inline void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx) +{ +} +static inline void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx) +{ +} +static inline int io_zcrx_recv(struct io_kiocb *req, struct io_zcrx_ifq *ifq, + struct socket *sock, unsigned int flags, + unsigned issue_flags, unsigned int *len) +{ + return -EOPNOTSUPP; +} +static inline struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx, + unsigned int id) +{ + return NULL; +} +#endif + +int io_recvzc(struct io_kiocb *req, unsigned int issue_flags); +int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); + +#endif |
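
For context (not part of the patch): a rough sketch of the userspace half of the refill ring that io_zcrx_ring_refill() consumes. It reuses the offset from the zcrx completion, which already carries the area token in its high bits as written by io_zcrx_queue_cqe(), and publishes the new tail with a release store to pair with the kernel's smp_load_acquire() on the ring tail (the kernel in turn publishes head with smp_store_release()). The io_uring_zcrx_rqe/io_uring_zcrx_cqe layouts and the way the ring pointers are located (via the region descriptor and offsets returned by io_register_zcrx_ifq()) are assumptions taken from the io_uring uapi header rather than from this diff.

#include <stdatomic.h>
#include <stdint.h>
#include <linux/io_uring.h>

struct zcrx_refill_ring {
	_Atomic uint32_t *khead;	/* advanced by the kernel */
	_Atomic uint32_t *ktail;	/* advanced by userspace */
	struct io_uring_zcrx_rqe *rqes;
	uint32_t ring_entries;		/* power of two, from registration */
	uint32_t tail;			/* userspace-cached tail */
};

/* Return one received buffer, described by a zcrx CQE, back to the kernel. */
static int zcrx_recycle(struct zcrx_refill_ring *rq,
			const struct io_uring_zcrx_cqe *rcqe, uint32_t len)
{
	uint32_t mask = rq->ring_entries - 1;
	uint32_t head = atomic_load_explicit(rq->khead, memory_order_acquire);
	struct io_uring_zcrx_rqe *rqe;

	/* refill ring full: the kernel has not consumed older entries yet */
	if (rq->tail - head == rq->ring_entries)
		return -1;

	rqe = &rq->rqes[rq->tail & mask];
	/* off keeps the area token in the high bits, as queued by the kernel */
	rqe->off = rcqe->off;
	rqe->len = len;
	rqe->__pad = 0;
	rq->tail++;

	/* pairs with smp_load_acquire(&ifq->rq_ring->tail) in the kernel */
	atomic_store_explicit(rq->ktail, rq->tail, memory_order_release);
	return 0;
}
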