summaryrefslogtreecommitdiff
path: root/io_uring/net.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2022-08-02 23:20:44 +0300
committerLinus Torvalds <torvalds@linux-foundation.org>2022-08-02 23:20:44 +0300
commitb349b1181d24af1c151134a3c39725e94a5619dd (patch)
tree7347cc4035de947c22e575ac7c649c0fa8658dd1 /io_uring/net.c
parentefb2883060afc79638bb1eb19e2c30e7f6c5a178 (diff)
parentf6b543fd03d347e8bf245cee4f2d54eb6ffd8fcb (diff)
downloadlinux-b349b1181d24af1c151134a3c39725e94a5619dd.tar.xz
Merge tag 'for-5.20/io_uring-2022-07-29' of git://git.kernel.dk/linux-block
Pull io_uring updates from Jens Axboe: - As per (valid) complaint in the last merge window, fs/io_uring.c has grown quite large these days. io_uring isn't really tied to fs either, as it supports a wide variety of functionality outside of that. Move the code to io_uring/ and split it into files that each implement a specific request type, and split some code into helpers as well. The code is organized a lot better like this, and io_uring.c is now < 4K LOC (me). - Deprecate the epoll_ctl opcode. It'll still work, just trigger a warning once if used. If we don't get any complaints on this, and I don't expect any, then we can fully remove it in a future release (me). - Improve the cancel hash locking (Hao) - kbuf cleanups (Hao) - Efficiency improvements to the task_work handling (Dylan, Pavel) - Provided buffer improvements (Dylan) - Add recv/recvmsg multishot support. This is similar to the accept (or poll) support we have for multishot, where a single SQE can trigger every time data is received. For applications that expect to do more than a few receives on an instantiated socket, this greatly improves efficiency (Dylan). - Efficiency improvements for poll handling (Pavel) - Poll cancelation improvements (Pavel) - Allow specifying a range for direct descriptor allocations (Pavel) - Clean up the cqe32 handling (Pavel) - Move io_uring types to greatly clean up the tracing (Pavel) - Tons of great code cleanups and improvements (Pavel) - Add a way to do sync cancelations rather than through the sqe -> cqe interface, as that's a lot easier to use for some use cases (me). - Add support to IORING_OP_MSG_RING for sending direct descriptors to a different ring. This avoids the usually problematic SCM case, as we disallow those. (me) - Make the per-command alloc cache we use for apoll generic, place limits on it, and use it for netmsg as well (me). 
- Various cleanups (me, Michal, Gustavo, Uros) * tag 'for-5.20/io_uring-2022-07-29' of git://git.kernel.dk/linux-block: (172 commits) io_uring: ensure REQ_F_ISREG is set async offload net: fix compat pointer in get_compat_msghdr() io_uring: Don't require reinitable percpu_ref io_uring: fix types in io_recvmsg_multishot_overflow io_uring: Use atomic_long_try_cmpxchg in __io_account_mem io_uring: support multishot in recvmsg net: copy from user before calling __get_compat_msghdr net: copy from user before calling __copy_msghdr io_uring: support 0 length iov in buffer select in compat io_uring: fix multishot ending when not polled io_uring: add netmsg cache io_uring: impose max limit on apoll cache io_uring: add abstraction around apoll cache io_uring: move apoll cache to poll.c io_uring: consolidate hash_locked io-wq handling io_uring: clear REQ_F_HASH_LOCKED on hash removal io_uring: don't race double poll setting REQ_F_ASYNC_DATA io_uring: don't miss setting REQ_F_DOUBLE_POLL io_uring: disable multishot recvmsg io_uring: only trace one of complete or overflow ...
Diffstat (limited to 'io_uring/net.c')
-rw-r--r--io_uring/net.c1047
1 files changed, 1047 insertions, 0 deletions
diff --git a/io_uring/net.c b/io_uring/net.c
new file mode 100644
index 000000000000..e61efa31c729
--- /dev/null
+++ b/io_uring/net.c
@@ -0,0 +1,1047 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/file.h>
+#include <linux/slab.h>
+#include <linux/net.h>
+#include <linux/compat.h>
+#include <net/compat.h>
+#include <linux/io_uring.h>
+
+#include <uapi/linux/io_uring.h>
+
+#include "io_uring.h"
+#include "kbuf.h"
+#include "alloc_cache.h"
+#include "net.h"
+
+#if defined(CONFIG_NET)
+/* Per-request command data for the shutdown opcode. */
+struct io_shutdown {
+ struct file *file;
+ /* value passed to __sys_shutdown_sock(); taken from sqe->len at prep */
+ int how;
+};
+
+/* Per-request command data for accept; filled in by io_accept_prep(). */
+struct io_accept {
+ struct file *file;
+ /* user pointers (sqe->addr / sqe->addr2) forwarded to do_accept() */
+ struct sockaddr __user *addr;
+ int __user *addr_len;
+ /* from sqe->accept_flags; SOCK_NONBLOCK is rewritten to O_NONBLOCK where they differ */
+ int flags;
+ /* sqe->file_index; non-zero selects io_fixed_fd_install() over a regular fd */
+ u32 file_slot;
+ /* RLIMIT_NOFILE sampled at prep time, passed to __get_unused_fd_flags() */
+ unsigned long nofile;
+};
+
+/* Per-request command data for socket creation; filled in by io_socket_prep(). */
+struct io_socket {
+ struct file *file;
+ /* socket() arguments, read from sqe->fd, sqe->off and sqe->len respectively */
+ int domain;
+ int type;
+ int protocol;
+ /* the bits of 'type' outside SOCK_TYPE_MASK */
+ int flags;
+ /* sqe->file_index; non-zero selects io_fixed_fd_install() over a regular fd */
+ u32 file_slot;
+ /* RLIMIT_NOFILE sampled at prep time, passed to __get_unused_fd_flags() */
+ unsigned long nofile;
+};
+
+/* Per-request command data for connect; filled in by io_connect_prep(). */
+struct io_connect {
+ struct file *file;
+ /* user address (sqe->addr) and its length (sqe->addr2) */
+ struct sockaddr __user *addr;
+ int addr_len;
+};
+
+/*
+ * Per-request command data shared by the send, sendmsg, recv and recvmsg
+ * handlers in this file.
+ */
+struct io_sr_msg {
+ struct file *file;
+ /*
+ * All three views alias the pointer loaded from sqe->addr: a msghdr
+ * (native or compat layout) for the *msg handlers, a plain data buffer
+ * for io_send()/io_recv().
+ */
+ union {
+ struct compat_msghdr __user *umsg_compat;
+ struct user_msghdr __user *umsg;
+ void __user *buf;
+ };
+ /* sqe->msg_flags with MSG_NOSIGNAL (and MSG_CMSG_COMPAT for compat rings) OR-ed in */
+ int msg_flags;
+ /* transfer length; from sqe->len, or from the user iovec with buffer select */
+ size_t len;
+ /* bytes moved by earlier partial attempts; added to the final result */
+ size_t done_io;
+ /* IORING_RECVSEND_* / IORING_RECV_* bits read from sqe->ioprio */
+ unsigned int flags;
+};
+
+/*
+ * Mask compared as (req->flags & mask) == mask: only when both bits are set
+ * do the handlers below turn -EAGAIN into IOU_ISSUE_SKIP_COMPLETE.
+ */
+#define IO_APOLL_MULTI_POLLED (REQ_F_APOLL_MULTISHOT | REQ_F_POLLED)
+
+/*
+ * Validate a shutdown SQE and record the requested mode.
+ *
+ * Returns -EINVAL when any of off, addr, rw_flags, buf_index or
+ * splice_fd_in is set in the SQE, 0 otherwise.
+ */
+int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+	struct io_shutdown *sd = io_kiocb_to_cmd(req);
+	bool stray_fields;
+
+	stray_fields = sqe->off || sqe->addr || sqe->rw_flags ||
+		       sqe->buf_index || sqe->splice_fd_in;
+	if (unlikely(stray_fields))
+		return -EINVAL;
+
+	sd->how = READ_ONCE(sqe->len);
+	return 0;
+}
+
+/*
+ * Issue handler for shutdown.
+ *
+ * Returns -EAGAIN for a nonblocking issue and -ENOTSOCK when the file is
+ * not a socket. Otherwise the result of __sys_shutdown_sock() is stored on
+ * the request and IOU_OK is returned.
+ */
+int io_shutdown(struct io_kiocb *req, unsigned int issue_flags)
+{
+	struct io_shutdown *sd = io_kiocb_to_cmd(req);
+	struct socket *sock;
+
+	/* never attempted inline from a nonblocking issue */
+	if (issue_flags & IO_URING_F_NONBLOCK)
+		return -EAGAIN;
+
+	sock = sock_from_file(req->file);
+	if (unlikely(!sock))
+		return -ENOTSOCK;
+
+	io_req_set_res(req, __sys_shutdown_sock(sock, sd->how), 0);
+	return IOU_OK;
+}
+
+/*
+ * Decide whether a short transfer should be retried: only when MSG_WAITALL
+ * was requested and the socket type is SOCK_STREAM or SOCK_SEQPACKET.
+ */
+static bool io_net_retry(struct socket *sock, int flags)
+{
+	if (flags & MSG_WAITALL) {
+		switch (sock->type) {
+		case SOCK_STREAM:
+		case SOCK_SEQPACKET:
+			return true;
+		default:
+			break;
+		}
+	}
+	return false;
+}
+
+/*
+ * Try to return the request's async msghdr to the ring's netmsg cache.
+ *
+ * Does nothing when there is no async data or when the issue is flagged
+ * IO_URING_F_UNLOCKED. On a successful put the header is detached from the
+ * request; if the put fails the header stays attached and is left for the
+ * normal cleanup path.
+ */
+static void io_netmsg_recycle(struct io_kiocb *req, unsigned int issue_flags)
+{
+	struct io_async_msghdr *hdr = req->async_data;
+
+	if (hdr && !(issue_flags & IO_URING_F_UNLOCKED) &&
+	    io_alloc_cache_put(&req->ctx->netmsg_cache, &hdr->cache)) {
+		/* the cache holds it now; the request must forget about it */
+		req->async_data = NULL;
+		req->flags &= ~REQ_F_ASYNC_DATA;
+	}
+}
+
+/*
+ * Obtain an async msghdr for the request and attach it as req->async_data.
+ *
+ * The ring's netmsg cache is tried first unless the issue is flagged
+ * IO_URING_F_UNLOCKED; otherwise, or when the cache is empty, the header
+ * comes from io_alloc_async_data(). Returns the header, or NULL when the
+ * allocation fails.
+ */
+static struct io_async_msghdr *io_recvmsg_alloc_async(struct io_kiocb *req,
+						      unsigned int issue_flags)
+{
+	struct io_cache_entry *cached = NULL;
+
+	if (!(issue_flags & IO_URING_F_UNLOCKED))
+		cached = io_alloc_cache_get(&req->ctx->netmsg_cache);
+
+	if (cached) {
+		struct io_async_msghdr *hdr;
+
+		hdr = container_of(cached, struct io_async_msghdr, cache);
+		req->flags |= REQ_F_ASYNC_DATA;
+		req->async_data = hdr;
+		return hdr;
+	}
+
+	/* io_alloc_async_data() returns non-zero on failure */
+	if (io_alloc_async_data(req))
+		return NULL;
+
+	return req->async_data;
+}
+
+/*
+ * Move a (typically on-stack) msghdr into async storage so the request can
+ * be retried later.
+ *
+ * @req:         request being deferred
+ * @kmsg:        header currently in use
+ * @issue_flags: forwarded to the allocator
+ *
+ * Returns -EAGAIN once the header lives in req->async_data (including the
+ * case where it already did), or -ENOMEM when no async header could be
+ * allocated; in the latter case kmsg->free_iov is freed here.
+ */
+static int io_setup_async_msg(struct io_kiocb *req,
+			      struct io_async_msghdr *kmsg,
+			      unsigned int issue_flags)
+{
+	struct io_async_msghdr *async_msg = req->async_data;
+
+	if (async_msg)
+		return -EAGAIN;
+	async_msg = io_recvmsg_alloc_async(req, issue_flags);
+	if (!async_msg) {
+		kfree(kmsg->free_iov);
+		return -ENOMEM;
+	}
+	req->flags |= REQ_F_NEED_CLEANUP;
+	memcpy(async_msg, kmsg, sizeof(*kmsg));
+	/*
+	 * The memcpy left msg_name pointing at the old header's addr. Only
+	 * redirect it when a name is in use: a NULL msg_name must stay NULL.
+	 */
+	if (async_msg->msg.msg_name)
+		async_msg->msg.msg_name = &async_msg->addr;
+	/*
+	 * If we're using fast_iov, point the iterator at the copy. A partial
+	 * transfer may already have advanced the iterator past the first
+	 * segment, so keep the same index rather than restarting at
+	 * fast_iov[0].
+	 */
+	if (!kmsg->free_iov) {
+		size_t fast_idx = kmsg->msg.msg_iter.iov - kmsg->fast_iov;
+
+		async_msg->msg.msg_iter.iov = &async_msg->fast_iov[fast_idx];
+	}
+
+	return -EAGAIN;
+}
+
+/*
+ * Import the user msghdr at sr->umsg for a send into @iomsg.
+ *
+ * msg_name is pointed at the header's addr storage and free_iov at its
+ * fast_iov array before sendmsg_copy_msghdr() is called. Returns that
+ * function's result.
+ */
+static int io_sendmsg_copy_hdr(struct io_kiocb *req,
+			       struct io_async_msghdr *iomsg)
+{
+	struct io_sr_msg *sr = io_kiocb_to_cmd(req);
+	struct msghdr *kmsg = &iomsg->msg;
+
+	kmsg->msg_name = &iomsg->addr;
+	iomsg->free_iov = iomsg->fast_iov;
+
+	return sendmsg_copy_msghdr(kmsg, sr->umsg, sr->msg_flags,
+				   &iomsg->free_iov);
+}
+
+/*
+ * Import the sendmsg header straight into req->async_data. On success the
+ * request is marked REQ_F_NEED_CLEANUP; on failure the error is returned
+ * and the flag is left alone.
+ */
+int io_sendmsg_prep_async(struct io_kiocb *req)
+{
+	int err = io_sendmsg_copy_hdr(req, req->async_data);
+
+	if (err)
+		return err;
+
+	req->flags |= REQ_F_NEED_CLEANUP;
+	return 0;
+}
+
+/* Free the iovec array referenced by the request's async msghdr. */
+void io_sendmsg_recvmsg_cleanup(struct io_kiocb *req)
+{
+	struct io_async_msghdr *hdr = req->async_data;
+
+	kfree(hdr->free_iov);
+}
+
+/*
+ * Parse the SQE for a send/sendmsg request into struct io_sr_msg.
+ *
+ * Returns -EINVAL when file_index or addr2 is set, or when ioprio carries
+ * any bit other than IORING_RECVSEND_POLL_FIRST; 0 otherwise.
+ */
+int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+	struct io_sr_msg *sr = io_kiocb_to_cmd(req);
+	int msg_flags;
+
+	if (unlikely(sqe->file_index || sqe->addr2))
+		return -EINVAL;
+
+	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
+	sr->len = READ_ONCE(sqe->len);
+	sr->flags = READ_ONCE(sqe->ioprio);
+	if (sr->flags & ~IORING_RECVSEND_POLL_FIRST)
+		return -EINVAL;
+
+	/* MSG_NOSIGNAL is always forced on */
+	msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
+	if (msg_flags & MSG_DONTWAIT)
+		req->flags |= REQ_F_NOWAIT;
+#ifdef CONFIG_COMPAT
+	if (req->ctx->compat)
+		msg_flags |= MSG_CMSG_COMPAT;
+#endif
+	sr->msg_flags = msg_flags;
+	sr->done_io = 0;
+	return 0;
+}
+
+/*
+ * Issue handler for sendmsg: send the message described by the user msghdr
+ * on the request's socket.
+ *
+ * The header comes from req->async_data when present, otherwise it is
+ * imported into an on-stack io_async_msghdr. Whenever the request has to be
+ * attempted again (IORING_RECVSEND_POLL_FIRST before the request was
+ * polled, -EAGAIN from a nonblocking issue, or a short send that
+ * io_net_retry() allows) the header is handed to io_setup_async_msg() and
+ * that function's return value is returned.
+ *
+ * Otherwise the result is stored on the request and IOU_OK is returned. The
+ * stored result is the byte count including sr->done_io from earlier
+ * partial attempts, or the error code when nothing at all was sent.
+ * -ENOTSOCK and header import errors are returned directly.
+ */
+int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
+{
+ struct io_sr_msg *sr = io_kiocb_to_cmd(req);
+ struct io_async_msghdr iomsg, *kmsg;
+ struct socket *sock;
+ unsigned flags;
+ int min_ret = 0;
+ int ret;
+
+ sock = sock_from_file(req->file);
+ if (unlikely(!sock))
+ return -ENOTSOCK;
+
+ if (req_has_async_data(req)) {
+ kmsg = req->async_data;
+ } else {
+ ret = io_sendmsg_copy_hdr(req, &iomsg);
+ if (ret)
+ return ret;
+ kmsg = &iomsg;
+ }
+
+ /* POLL_FIRST requested and not polled yet: stash the header, don't send */
+ if (!(req->flags & REQ_F_POLLED) &&
+ (sr->flags & IORING_RECVSEND_POLL_FIRST))
+ return io_setup_async_msg(req, kmsg, issue_flags);
+
+ flags = sr->msg_flags;
+ if (issue_flags & IO_URING_F_NONBLOCK)
+ flags |= MSG_DONTWAIT;
+ /* with MSG_WAITALL, anything below the full iterator length is short */
+ if (flags & MSG_WAITALL)
+ min_ret = iov_iter_count(&kmsg->msg.msg_iter);
+
+ ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
+
+ if (ret < min_ret) {
+ if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
+ return io_setup_async_msg(req, kmsg, issue_flags);
+ if (ret == -ERESTARTSYS)
+ ret = -EINTR;
+ /* short send: remember the progress and arrange another attempt */
+ if (ret > 0 && io_net_retry(sock, flags)) {
+ sr->done_io += ret;
+ req->flags |= REQ_F_PARTIAL_IO;
+ return io_setup_async_msg(req, kmsg, issue_flags);
+ }
+ req_set_fail(req);
+ }
+ /* fast path, check for non-NULL to avoid function call */
+ if (kmsg->free_iov)
+ kfree(kmsg->free_iov);
+ /* the iovec is freed above, so the cleanup handler must not run again */
+ req->flags &= ~REQ_F_NEED_CLEANUP;
+ io_netmsg_recycle(req, issue_flags);
+ /* fold in earlier partial progress; it also masks a late error */
+ if (ret >= 0)
+ ret += sr->done_io;
+ else if (sr->done_io)
+ ret = sr->done_io;
+ io_req_set_res(req, ret, 0);
+ return IOU_OK;
+}
+
+/*
+ * Issue handler for send: transmit sr->len bytes from the user buffer
+ * sr->buf on the request's socket.
+ *
+ * Returns -EAGAIN when the request must be attempted again
+ * (IORING_RECVSEND_POLL_FIRST before the request was polled, -EAGAIN from a
+ * nonblocking issue, or a short send that io_net_retry() allows; in the
+ * last case sr->buf/sr->len are advanced past the bytes already sent).
+ * -ENOTSOCK and buffer import errors are returned directly. Otherwise the
+ * result (byte count including sr->done_io, or the error when nothing was
+ * sent) is stored on the request and IOU_OK is returned.
+ */
+int io_send(struct io_kiocb *req, unsigned int issue_flags)
+{
+ struct io_sr_msg *sr = io_kiocb_to_cmd(req);
+ struct msghdr msg;
+ struct iovec iov;
+ struct socket *sock;
+ unsigned flags;
+ int min_ret = 0;
+ int ret;
+
+ /* POLL_FIRST requested and not polled yet: do not attempt the send */
+ if (!(req->flags & REQ_F_POLLED) &&
+ (sr->flags & IORING_RECVSEND_POLL_FIRST))
+ return -EAGAIN;
+
+ sock = sock_from_file(req->file);
+ if (unlikely(!sock))
+ return -ENOTSOCK;
+
+ ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter);
+ if (unlikely(ret))
+ return ret;
+
+ /* plain send: no destination address and no control data */
+ msg.msg_name = NULL;
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_namelen = 0;
+
+ flags = sr->msg_flags;
+ if (issue_flags & IO_URING_F_NONBLOCK)
+ flags |= MSG_DONTWAIT;
+ /* with MSG_WAITALL, anything below the full iterator length is short */
+ if (flags & MSG_WAITALL)
+ min_ret = iov_iter_count(&msg.msg_iter);
+
+ msg.msg_flags = flags;
+ ret = sock_sendmsg(sock, &msg);
+ if (ret < min_ret) {
+ if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
+ return -EAGAIN;
+ if (ret == -ERESTARTSYS)
+ ret = -EINTR;
+ /* short send: skip what was sent and ask for another attempt */
+ if (ret > 0 && io_net_retry(sock, flags)) {
+ sr->len -= ret;
+ sr->buf += ret;
+ sr->done_io += ret;
+ req->flags |= REQ_F_PARTIAL_IO;
+ return -EAGAIN;
+ }
+ req_set_fail(req);
+ }
+ /* fold in earlier partial progress; it also masks a late error */
+ if (ret >= 0)
+ ret += sr->done_io;
+ else if (sr->done_io)
+ ret = sr->done_io;
+ io_req_set_res(req, ret, 0);
+ return IOU_OK;
+}
+
+/*
+ * Return true when the multishot header size cannot be represented in an
+ * int: namelen is negative, or adding sizeof(struct io_uring_recvmsg_out),
+ * namelen and controllen overflows.
+ */
+static bool io_recvmsg_multishot_overflow(struct io_async_msghdr *iomsg)
+{
+	int total;
+
+	return iomsg->namelen < 0 ||
+	       check_add_overflow((int)sizeof(struct io_uring_recvmsg_out),
+				  iomsg->namelen, &total) ||
+	       check_add_overflow(total, (int)iomsg->controllen, &total);
+}
+
+/*
+ * Import the native-layout user msghdr at sr->umsg for a receive.
+ *
+ * With REQ_F_BUFFER_SELECT at most one user iovec is accepted: it is copied
+ * into fast_iov[0], its length is recorded in sr->len and free_iov is
+ * cleared. For multishot requests the user's msg_namelen/msg_controllen are
+ * also saved in @iomsg and checked for overflow. Without buffer select the
+ * whole iovec array is imported through __import_iovec().
+ *
+ * Returns 0 on success, -EFAULT on a failed user copy, -EINVAL for more
+ * than one iovec with buffer select, -EOVERFLOW from the multishot size
+ * check, or the error from __copy_msghdr()/__import_iovec().
+ */
+static int __io_recvmsg_copy_hdr(struct io_kiocb *req,
+ struct io_async_msghdr *iomsg)
+{
+ struct io_sr_msg *sr = io_kiocb_to_cmd(req);
+ struct user_msghdr msg;
+ int ret;
+
+ if (copy_from_user(&msg, sr->umsg, sizeof(*sr->umsg)))
+ return -EFAULT;
+
+ ret = __copy_msghdr(&iomsg->msg, &msg, &iomsg->uaddr);
+ if (ret)
+ return ret;
+
+ if (req->flags & REQ_F_BUFFER_SELECT) {
+ if (msg.msg_iovlen == 0) {
+ /* no iovec supplied: zero length, no base */
+ sr->len = iomsg->fast_iov[0].iov_len = 0;
+ iomsg->fast_iov[0].iov_base = NULL;
+ iomsg->free_iov = NULL;
+ } else if (msg.msg_iovlen > 1) {
+ return -EINVAL;
+ } else {
+ if (copy_from_user(iomsg->fast_iov, msg.msg_iov, sizeof(*msg.msg_iov)))
+ return -EFAULT;
+ sr->len = iomsg->fast_iov[0].iov_len;
+ iomsg->free_iov = NULL;
+ }
+
+ if (req->flags & REQ_F_APOLL_MULTISHOT) {
+ /* keep the user's sizes; they define the per-buffer header layout */
+ iomsg->namelen = msg.msg_namelen;
+ iomsg->controllen = msg.msg_controllen;
+ if (io_recvmsg_multishot_overflow(iomsg))
+ return -EOVERFLOW;
+ }
+ } else {
+ iomsg->free_iov = iomsg->fast_iov;
+ ret = __import_iovec(READ, msg.msg_iov, msg.msg_iovlen, UIO_FASTIOV,
+ &iomsg->free_iov, &iomsg->msg.msg_iter,
+ false);
+ /* a positive return is not an error here; report plain success */
+ if (ret > 0)
+ ret = 0;
+ }
+
+ return ret;
+}
+
+#ifdef CONFIG_COMPAT
+/*
+ * Compat-layout counterpart of __io_recvmsg_copy_hdr(): import the
+ * compat_msghdr at sr->umsg_compat for a receive.
+ *
+ * With REQ_F_BUFFER_SELECT at most one user iovec is accepted and only its
+ * length is read (into sr->len); free_iov is cleared. For multishot
+ * requests the user's msg_namelen/msg_controllen are saved in @iomsg and
+ * checked for overflow. Without buffer select the iovec array is imported
+ * through __import_iovec() in compat mode.
+ *
+ * Returns 0 on success, -EFAULT on a failed user access, -EINVAL for more
+ * than one iovec or a negative length, -EOVERFLOW from the multishot size
+ * check, or the error from __get_compat_msghdr()/__import_iovec().
+ */
+static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
+ struct io_async_msghdr *iomsg)
+{
+ struct io_sr_msg *sr = io_kiocb_to_cmd(req);
+ struct compat_msghdr msg;
+ struct compat_iovec __user *uiov;
+ int ret;
+
+ if (copy_from_user(&msg, sr->umsg_compat, sizeof(msg)))
+ return -EFAULT;
+
+ ret = __get_compat_msghdr(&iomsg->msg, &msg, &iomsg->uaddr);
+ if (ret)
+ return ret;
+
+ uiov = compat_ptr(msg.msg_iov);
+ if (req->flags & REQ_F_BUFFER_SELECT) {
+ compat_ssize_t clen;
+
+ if (msg.msg_iovlen == 0) {
+ sr->len = 0;
+ iomsg->free_iov = NULL;
+ } else if (msg.msg_iovlen > 1) {
+ return -EINVAL;
+ } else {
+ /* only the length of the single iovec is fetched */
+ if (!access_ok(uiov, sizeof(*uiov)))
+ return -EFAULT;
+ if (__get_user(clen, &uiov->iov_len))
+ return -EFAULT;
+ if (clen < 0)
+ return -EINVAL;
+ sr->len = clen;
+ iomsg->free_iov = NULL;
+ }
+
+ if (req->flags & REQ_F_APOLL_MULTISHOT) {
+ /* keep the user's sizes; they define the per-buffer header layout */
+ iomsg->namelen = msg.msg_namelen;
+ iomsg->controllen = msg.msg_controllen;
+ if (io_recvmsg_multishot_overflow(iomsg))
+ return -EOVERFLOW;
+ }
+ } else {
+ iomsg->free_iov = iomsg->fast_iov;
+ ret = __import_iovec(READ, (struct iovec __user *)uiov, msg.msg_iovlen,
+ UIO_FASTIOV, &iomsg->free_iov,
+ &iomsg->msg.msg_iter, true);
+ if (ret < 0)
+ return ret;
+ }
+
+ return 0;
+}
+#endif
+
+/*
+ * Import the user msghdr for a receive into @iomsg, using the compat
+ * importer when CONFIG_COMPAT is enabled and the ring is a compat ring,
+ * and the native one otherwise. msg_name is pointed at the header's own
+ * addr storage first. Returns the importer's result.
+ */
+static int io_recvmsg_copy_hdr(struct io_kiocb *req,
+			       struct io_async_msghdr *iomsg)
+{
+	int ret;
+
+	iomsg->msg.msg_name = &iomsg->addr;
+
+#ifdef CONFIG_COMPAT
+	if (req->ctx->compat) {
+		ret = __io_compat_recvmsg_copy_hdr(req, iomsg);
+		return ret;
+	}
+#endif
+
+	ret = __io_recvmsg_copy_hdr(req, iomsg);
+	return ret;
+}
+
+/*
+ * Import the recvmsg header straight into req->async_data. On success the
+ * request is marked REQ_F_NEED_CLEANUP; on failure the error is returned
+ * and the flag is left alone.
+ */
+int io_recvmsg_prep_async(struct io_kiocb *req)
+{
+	int err = io_recvmsg_copy_hdr(req, req->async_data);
+
+	if (err)
+		return err;
+
+	req->flags |= REQ_F_NEED_CLEANUP;
+	return 0;
+}
+
+/* sqe->ioprio bits accepted by io_recvmsg_prep(); any other bit is -EINVAL */
+#define RECVMSG_FLAGS (IORING_RECVSEND_POLL_FIRST | IORING_RECV_MULTISHOT)
+
+/*
+ * Parse the SQE for a recv/recvmsg request into struct io_sr_msg.
+ *
+ * Returns -EINVAL when file_index or addr2 is set, when ioprio carries a
+ * bit outside RECVMSG_FLAGS, or when IORING_RECV_MULTISHOT is combined with
+ * a request that lacks REQ_F_BUFFER_SELECT, uses MSG_WAITALL, or is an
+ * IORING_OP_RECV with a non-zero length. Returns 0 otherwise.
+ */
+int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+	struct io_sr_msg *sr = io_kiocb_to_cmd(req);
+
+	if (unlikely(sqe->file_index || sqe->addr2))
+		return -EINVAL;
+
+	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
+	sr->len = READ_ONCE(sqe->len);
+	sr->flags = READ_ONCE(sqe->ioprio);
+	if (sr->flags & ~(RECVMSG_FLAGS))
+		return -EINVAL;
+
+	/* MSG_NOSIGNAL is always forced on */
+	sr->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL;
+	if (sr->msg_flags & MSG_DONTWAIT)
+		req->flags |= REQ_F_NOWAIT;
+	if (sr->msg_flags & MSG_ERRQUEUE)
+		req->flags |= REQ_F_CLEAR_POLLIN;
+
+	if (sr->flags & IORING_RECV_MULTISHOT) {
+		bool allowed;
+
+		allowed = (req->flags & REQ_F_BUFFER_SELECT) &&
+			  !(sr->msg_flags & MSG_WAITALL) &&
+			  !(req->opcode == IORING_OP_RECV && sr->len);
+		if (!allowed)
+			return -EINVAL;
+		req->flags |= REQ_F_APOLL_MULTISHOT;
+	}
+
+#ifdef CONFIG_COMPAT
+	if (req->ctx->compat)
+		sr->msg_flags |= MSG_CMSG_COMPAT;
+#endif
+	sr->done_io = 0;
+	return 0;
+}
+
+/* Reset the per-attempt state before another multishot receive round. */
+static inline void io_recv_prep_retry(struct io_kiocb *req)
+{
+	struct io_sr_msg *sr = io_kiocb_to_cmd(req);
+
+	/* zero length: the size comes from the provided buffer */
+	sr->len = 0;
+	sr->done_io = 0;
+}
+
+/*
+ * Common tail of io_recv() and io_recvmsg().
+ *
+ * Returns false when a multishot request posted its CQE (flagged
+ * IORING_CQE_F_MORE) and should run another round. Returns true when the
+ * request is done; in that case the result is stored on the request and
+ * *ret is replaced by the issue return code (IOU_STOP_MULTISHOT for a
+ * polled multishot request, IOU_OK otherwise).
+ */
+static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
+				  unsigned int cflags, bool mshot_finished)
+{
+	bool multishot = req->flags & REQ_F_APOLL_MULTISHOT;
+
+	if (multishot && !mshot_finished &&
+	    io_post_aux_cqe(req->ctx, req->cqe.user_data, *ret,
+			    cflags | IORING_CQE_F_MORE, false)) {
+		io_recv_prep_retry(req);
+		return false;
+	}
+
+	/*
+	 * Either single shot, or multishot that is finished, or the aux CQE
+	 * could not be posted. In the last case multishot stops with the
+	 * current result; it will probably end up in overflow, which means
+	 * the ordering cannot be trusted anymore.
+	 */
+	io_req_set_res(req, *ret, cflags);
+
+	if (multishot && (req->flags & REQ_F_POLLED))
+		*ret = IOU_STOP_MULTISHOT;
+	else
+		*ret = IOU_OK;
+	return true;
+}
+
+/*
+ * Carve the multishot recvmsg header out of the front of a selected buffer.
+ *
+ * The header region is sizeof(struct io_uring_recvmsg_out) + namelen +
+ * controllen bytes; the control area occupies its last controllen bytes.
+ *
+ * @kmsg: header whose control pointer/length and payloadlen are set up
+ * @sr:   sr->buf is set to the start of the buffer for the later header copy
+ * @buf:  in: start of the selected buffer; out: start of the payload area
+ * @len:  in: buffer length; out: bytes left for payload
+ *
+ * Returns 0, or -EFAULT when the buffer is too small for the header.
+ */
+static int io_recvmsg_prep_multishot(struct io_async_msghdr *kmsg,
+				     struct io_sr_msg *sr, void __user **buf,
+				     size_t *len)
+{
+	unsigned long ubuf = (unsigned long) *buf;
+	unsigned long hdr;
+
+	hdr = sizeof(struct io_uring_recvmsg_out) + kmsg->namelen +
+		kmsg->controllen;
+	if (*len < hdr)
+		return -EFAULT;
+
+	if (kmsg->controllen) {
+		unsigned long control = ubuf + hdr - kmsg->controllen;
+
+		/* these are user addresses; keep the __user annotation */
+		kmsg->msg.msg_control_user = (void __user *) control;
+		kmsg->msg.msg_controllen = kmsg->controllen;
+	}
+
+	sr->buf = *buf; /* stash for later copy */
+	*buf = (void __user *) (ubuf + hdr);
+	kmsg->payloadlen = *len = *len - hdr;
+	return 0;
+}
+
+/*
+ * On-stack staging area for the header that io_recvmsg_multishot() copies
+ * to the user buffer: the fixed io_uring_recvmsg_out part followed by the
+ * source address. A BUILD_BUG_ON in that function checks that addr starts
+ * exactly at sizeof(struct io_uring_recvmsg_out).
+ */
+struct io_recvmsg_multishot_hdr {
+ struct io_uring_recvmsg_out msg;
+ struct sockaddr_storage addr;
+};
+
+/*
+ * Perform one multishot recvmsg round and write its header to the buffer.
+ *
+ * @sock:     socket to receive from
+ * @io:       io->buf is the start of the user buffer receiving the header
+ * @kmsg:     prepared message header (iterator, control area, saved sizes)
+ * @flags:    receive flags
+ * @finished: set to true when sock_recvmsg() returned <= 0 or the header
+ *            copy faulted, false otherwise
+ *
+ * Returns a negative error, or the number of buffer bytes accounted for:
+ * the fixed header plus the reserved namelen and controllen areas plus the
+ * payload length (capped at kmsg->payloadlen).
+ */
+static int io_recvmsg_multishot(struct socket *sock, struct io_sr_msg *io,
+ struct io_async_msghdr *kmsg,
+ unsigned int flags, bool *finished)
+{
+ int err;
+ int copy_len;
+ struct io_recvmsg_multishot_hdr hdr;
+
+ /* receive the source address into the staging header when name space was reserved */
+ if (kmsg->namelen)
+ kmsg->msg.msg_name = &hdr.addr;
+ kmsg->msg.msg_flags = flags & (MSG_CMSG_CLOEXEC|MSG_CMSG_COMPAT);
+ kmsg->msg.msg_namelen = 0;
+
+ if (sock->file->f_flags & O_NONBLOCK)
+ flags |= MSG_DONTWAIT;
+
+ err = sock_recvmsg(sock, &kmsg->msg, flags);
+ *finished = err <= 0;
+ if (err < 0)
+ return err;
+
+ /* controllen reported to the user is the part of the control area that was consumed */
+ hdr.msg = (struct io_uring_recvmsg_out) {
+ .controllen = kmsg->controllen - kmsg->msg.msg_controllen,
+ .flags = kmsg->msg.msg_flags & ~MSG_CMSG_COMPAT
+ };
+
+ /* payloadlen reports the value returned by recvmsg; the return value is capped */
+ hdr.msg.payloadlen = err;
+ if (err > kmsg->payloadlen)
+ err = kmsg->payloadlen;
+
+ /* copy the fixed header plus at most namelen bytes of address */
+ copy_len = sizeof(struct io_uring_recvmsg_out);
+ if (kmsg->msg.msg_namelen > kmsg->namelen)
+ copy_len += kmsg->namelen;
+ else
+ copy_len += kmsg->msg.msg_namelen;
+
+ /*
+ * "fromlen shall refer to the value before truncation.."
+ * 1003.1g
+ */
+ hdr.msg.namelen = kmsg->msg.msg_namelen;
+
+ /* ensure that there is no gap between hdr and sockaddr_storage */
+ BUILD_BUG_ON(offsetof(struct io_recvmsg_multishot_hdr, addr) !=
+ sizeof(struct io_uring_recvmsg_out));
+ if (copy_to_user(io->buf, &hdr, copy_len)) {
+ *finished = true;
+ return -EFAULT;
+ }
+
+ return sizeof(struct io_uring_recvmsg_out) + kmsg->namelen +
+ kmsg->controllen + err;
+}
+
+/*
+ * Issue handler for recvmsg, single shot and multishot.
+ *
+ * The header comes from req->async_data when present, otherwise it is
+ * imported into an on-stack io_async_msghdr. With buffer select a provided
+ * buffer is picked per round; for multishot its front is reserved for the
+ * io_uring_recvmsg_out header.
+ *
+ * Returns:
+ *  - -ENOTSOCK, -ENOBUFS, or a header import/prep error directly;
+ *  - the result of io_setup_async_msg() when the request has to be
+ *    attempted again (POLL_FIRST before being polled, -EAGAIN from a
+ *    nonblocking issue, or a short receive that io_net_retry() allows);
+ *  - IOU_ISSUE_SKIP_COMPLETE when a polled multishot request got -EAGAIN;
+ *  - otherwise the code produced by io_recv_finish(), with the result
+ *    already stored on the request.
+ */
+int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags)
+{
+	struct io_sr_msg *sr = io_kiocb_to_cmd(req);
+	struct io_async_msghdr iomsg, *kmsg;
+	struct socket *sock;
+	unsigned int cflags;
+	unsigned flags;
+	int ret, min_ret = 0;
+	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
+	bool mshot_finished = true;
+
+	sock = sock_from_file(req->file);
+	if (unlikely(!sock))
+		return -ENOTSOCK;
+
+	if (req_has_async_data(req)) {
+		kmsg = req->async_data;
+	} else {
+		ret = io_recvmsg_copy_hdr(req, &iomsg);
+		if (ret)
+			return ret;
+		kmsg = &iomsg;
+	}
+
+	/* POLL_FIRST requested and not polled yet: stash the header, don't receive */
+	if (!(req->flags & REQ_F_POLLED) &&
+	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
+		return io_setup_async_msg(req, kmsg, issue_flags);
+
+retry_multishot:
+	if (io_do_buffer_select(req)) {
+		void __user *buf;
+		size_t len = sr->len;
+
+		buf = io_buffer_select(req, &len, issue_flags);
+		if (!buf)
+			return -ENOBUFS;
+
+		if (req->flags & REQ_F_APOLL_MULTISHOT) {
+			ret = io_recvmsg_prep_multishot(kmsg, sr, &buf, &len);
+			if (ret) {
+				io_kbuf_recycle(req, issue_flags);
+				return ret;
+			}
+		}
+
+		kmsg->fast_iov[0].iov_base = buf;
+		kmsg->fast_iov[0].iov_len = len;
+		iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->fast_iov, 1,
+			      len);
+	}
+
+	flags = sr->msg_flags;
+	if (force_nonblock)
+		flags |= MSG_DONTWAIT;
+	/* with MSG_WAITALL, anything below the full iterator length is short */
+	if (flags & MSG_WAITALL)
+		min_ret = iov_iter_count(&kmsg->msg.msg_iter);
+
+	kmsg->msg.msg_get_inq = 1;
+	/*
+	 * msg_inq is tested below; give it a defined value in case the
+	 * receive path does not fill it in.
+	 */
+	kmsg->msg.msg_inq = 0;
+	if (req->flags & REQ_F_APOLL_MULTISHOT)
+		ret = io_recvmsg_multishot(sock, sr, kmsg, flags,
+					   &mshot_finished);
+	else
+		ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg,
+					 kmsg->uaddr, flags);
+
+	if (ret < min_ret) {
+		if (ret == -EAGAIN && force_nonblock) {
+			ret = io_setup_async_msg(req, kmsg, issue_flags);
+			if (ret == -EAGAIN && (req->flags & IO_APOLL_MULTI_POLLED) ==
+					       IO_APOLL_MULTI_POLLED) {
+				io_kbuf_recycle(req, issue_flags);
+				return IOU_ISSUE_SKIP_COMPLETE;
+			}
+			return ret;
+		}
+		if (ret == -ERESTARTSYS)
+			ret = -EINTR;
+		/* short receive: remember the progress and arrange another attempt */
+		if (ret > 0 && io_net_retry(sock, flags)) {
+			sr->done_io += ret;
+			req->flags |= REQ_F_PARTIAL_IO;
+			return io_setup_async_msg(req, kmsg, issue_flags);
+		}
+		req_set_fail(req);
+	} else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
+		req_set_fail(req);
+	}
+
+	/* fold in earlier partial progress; recycle the buffer if nothing arrived */
+	if (ret > 0)
+		ret += sr->done_io;
+	else if (sr->done_io)
+		ret = sr->done_io;
+	else
+		io_kbuf_recycle(req, issue_flags);
+
+	cflags = io_put_kbuf(req, issue_flags);
+	if (kmsg->msg.msg_inq)
+		cflags |= IORING_CQE_F_SOCK_NONEMPTY;
+
+	if (!io_recv_finish(req, &ret, cflags, mshot_finished))
+		goto retry_multishot;
+
+	if (mshot_finished) {
+		/*
+		 * Release the iovec before the header may be handed to the
+		 * netmsg cache, so kmsg is not touched once it has been
+		 * recycled (same order as io_sendmsg()).
+		 */
+		/* fast path, check for non-NULL to avoid function call */
+		if (kmsg->free_iov)
+			kfree(kmsg->free_iov);
+		io_netmsg_recycle(req, issue_flags);
+		req->flags &= ~REQ_F_NEED_CLEANUP;
+	}
+
+	return ret;
+}
+
+/*
+ * Issue handler for recv, single shot and multishot: receive into the user
+ * buffer sr->buf, or into a provided buffer when buffer select is in use.
+ *
+ * Returns:
+ *  - -ENOTSOCK or -ENOBUFS directly;
+ *  - -EAGAIN when the request has to be attempted again (POLL_FIRST before
+ *    being polled, -EAGAIN from a nonblocking issue, or a short receive
+ *    that io_net_retry() allows; in the last case sr->buf/sr->len are
+ *    advanced past the bytes already received);
+ *  - IOU_ISSUE_SKIP_COMPLETE when a polled multishot request got -EAGAIN;
+ *  - otherwise the code produced by io_recv_finish(), with the result
+ *    already stored on the request.
+ */
+int io_recv(struct io_kiocb *req, unsigned int issue_flags)
+{
+	struct io_sr_msg *sr = io_kiocb_to_cmd(req);
+	struct msghdr msg;
+	struct socket *sock;
+	struct iovec iov;
+	unsigned int cflags;
+	unsigned flags;
+	int ret, min_ret = 0;
+	bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
+	size_t len = sr->len;
+
+	/* POLL_FIRST requested and not polled yet: do not attempt the receive */
+	if (!(req->flags & REQ_F_POLLED) &&
+	    (sr->flags & IORING_RECVSEND_POLL_FIRST))
+		return -EAGAIN;
+
+	sock = sock_from_file(req->file);
+	if (unlikely(!sock))
+		return -ENOTSOCK;
+
+retry_multishot:
+	/*
+	 * msg lives on the stack and msg_inq is tested after the receive,
+	 * including on the import-failure path that jumps to out_free before
+	 * any other field is set. Give it a defined value for every round.
+	 */
+	msg.msg_inq = 0;
+
+	if (io_do_buffer_select(req)) {
+		void __user *buf;
+
+		buf = io_buffer_select(req, &len, issue_flags);
+		if (!buf)
+			return -ENOBUFS;
+		sr->buf = buf;
+	}
+
+	ret = import_single_range(READ, sr->buf, len, &iov, &msg.msg_iter);
+	if (unlikely(ret))
+		goto out_free;
+
+	msg.msg_name = NULL;
+	msg.msg_namelen = 0;
+	msg.msg_control = NULL;
+	msg.msg_get_inq = 1;
+	msg.msg_flags = 0;
+	msg.msg_controllen = 0;
+	msg.msg_iocb = NULL;
+
+	flags = sr->msg_flags;
+	if (force_nonblock)
+		flags |= MSG_DONTWAIT;
+	/* with MSG_WAITALL, anything below the full iterator length is short */
+	if (flags & MSG_WAITALL)
+		min_ret = iov_iter_count(&msg.msg_iter);
+
+	ret = sock_recvmsg(sock, &msg, flags);
+	if (ret < min_ret) {
+		if (ret == -EAGAIN && force_nonblock) {
+			if ((req->flags & IO_APOLL_MULTI_POLLED) == IO_APOLL_MULTI_POLLED) {
+				io_kbuf_recycle(req, issue_flags);
+				return IOU_ISSUE_SKIP_COMPLETE;
+			}
+
+			return -EAGAIN;
+		}
+		if (ret == -ERESTARTSYS)
+			ret = -EINTR;
+		/* short receive: skip what arrived and ask for another attempt */
+		if (ret > 0 && io_net_retry(sock, flags)) {
+			sr->len -= ret;
+			sr->buf += ret;
+			sr->done_io += ret;
+			req->flags |= REQ_F_PARTIAL_IO;
+			return -EAGAIN;
+		}
+		req_set_fail(req);
+	} else if ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) {
+out_free:
+		req_set_fail(req);
+	}
+
+	/* fold in earlier partial progress; recycle the buffer if nothing arrived */
+	if (ret > 0)
+		ret += sr->done_io;
+	else if (sr->done_io)
+		ret = sr->done_io;
+	else
+		io_kbuf_recycle(req, issue_flags);
+
+	cflags = io_put_kbuf(req, issue_flags);
+	if (msg.msg_inq)
+		cflags |= IORING_CQE_F_SOCK_NONEMPTY;
+
+	if (!io_recv_finish(req, &ret, cflags, ret <= 0))
+		goto retry_multishot;
+
+	return ret;
+}
+
+/*
+ * Parse the SQE for an accept request into struct io_accept.
+ *
+ * Returns -EINVAL when len or buf_index is set, when ioprio carries a bit
+ * other than IORING_ACCEPT_MULTISHOT, when a fixed file slot is combined
+ * with SOCK_CLOEXEC, when multishot is combined with a fixed slot other
+ * than IORING_FILE_INDEX_ALLOC, or when accept_flags carries a bit outside
+ * SOCK_CLOEXEC | SOCK_NONBLOCK. Returns 0 otherwise.
+ */
+int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+	struct io_accept *accept = io_kiocb_to_cmd(req);
+	unsigned int ioprio;
+	bool multishot;
+
+	if (sqe->len || sqe->buf_index)
+		return -EINVAL;
+
+	accept->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
+	accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2));
+	accept->flags = READ_ONCE(sqe->accept_flags);
+	accept->nofile = rlimit(RLIMIT_NOFILE);
+
+	ioprio = READ_ONCE(sqe->ioprio);
+	if (ioprio & ~IORING_ACCEPT_MULTISHOT)
+		return -EINVAL;
+	multishot = ioprio & IORING_ACCEPT_MULTISHOT;
+
+	accept->file_slot = READ_ONCE(sqe->file_index);
+	if (accept->file_slot) {
+		if (accept->flags & SOCK_CLOEXEC)
+			return -EINVAL;
+		if (multishot && accept->file_slot != IORING_FILE_INDEX_ALLOC)
+			return -EINVAL;
+	}
+
+	if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
+		return -EINVAL;
+	/* where the two constants differ, store the O_* form */
+	if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK))
+		accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
+
+	if (multishot)
+		req->flags |= REQ_F_APOLL_MULTISHOT;
+	return 0;
+}
+
+/*
+ * Issue handler for accept, single shot and multishot.
+ *
+ * Each round reserves a regular fd (unless a fixed file slot is used),
+ * calls do_accept() and installs the new file either in the reserved fd or
+ * in the fixed slot. A multishot request posts one CQE flagged
+ * IORING_CQE_F_MORE per accepted connection and loops until a round fails
+ * or the CQE cannot be posted.
+ *
+ * Returns the fd reservation error directly; -EAGAIN (or
+ * IOU_ISSUE_SKIP_COMPLETE for a polled multishot request) when a
+ * nonblocking accept would block; IOU_STOP_MULTISHOT when a polled
+ * multishot request ends; IOU_OK otherwise, with the result stored on the
+ * request.
+ */
+int io_accept(struct io_kiocb *req, unsigned int issue_flags)
+{
+ struct io_ring_ctx *ctx = req->ctx;
+ struct io_accept *accept = io_kiocb_to_cmd(req);
+ bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
+ unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0;
+ bool fixed = !!accept->file_slot;
+ struct file *file;
+ int ret, fd;
+
+retry:
+ /* a regular fd is reserved up front; fixed slots need none */
+ if (!fixed) {
+ fd = __get_unused_fd_flags(accept->flags, accept->nofile);
+ if (unlikely(fd < 0))
+ return fd;
+ }
+ file = do_accept(req->file, file_flags, accept->addr, accept->addr_len,
+ accept->flags);
+ if (IS_ERR(file)) {
+ /* give the reserved fd back before looking at the error */
+ if (!fixed)
+ put_unused_fd(fd);
+ ret = PTR_ERR(file);
+ if (ret == -EAGAIN && force_nonblock) {
+ /*
+ * if it's multishot and polled, we don't need to
+ * return EAGAIN to arm the poll infra since it
+ * has already been done
+ */
+ if ((req->flags & IO_APOLL_MULTI_POLLED) ==
+ IO_APOLL_MULTI_POLLED)
+ ret = IOU_ISSUE_SKIP_COMPLETE;
+ return ret;
+ }
+ if (ret == -ERESTARTSYS)
+ ret = -EINTR;
+ req_set_fail(req);
+ } else if (!fixed) {
+ fd_install(fd, file);
+ ret = fd;
+ } else {
+ ret = io_fixed_fd_install(req, issue_flags, file,
+ accept->file_slot);
+ }
+
+ if (!(req->flags & REQ_F_APOLL_MULTISHOT)) {
+ io_req_set_res(req, ret, 0);
+ return IOU_OK;
+ }
+
+ /* multishot: report this connection and go around again while it works */
+ if (ret >= 0 &&
+ io_post_aux_cqe(ctx, req->cqe.user_data, ret, IORING_CQE_F_MORE, false))
+ goto retry;
+
+ io_req_set_res(req, ret, 0);
+ if (req->flags & REQ_F_POLLED)
+ return IOU_STOP_MULTISHOT;
+ return IOU_OK;
+}
+
+/*
+ * Parse the SQE for a socket-creation request into struct io_socket.
+ *
+ * Returns -EINVAL when addr, rw_flags or buf_index is set, when a fixed
+ * file slot is combined with SOCK_CLOEXEC, or when the type carries a flag
+ * bit outside SOCK_CLOEXEC | SOCK_NONBLOCK. Returns 0 otherwise.
+ */
+int io_socket_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+	struct io_socket *sock = io_kiocb_to_cmd(req);
+	int type_flags;
+
+	if (sqe->addr || sqe->rw_flags || sqe->buf_index)
+		return -EINVAL;
+
+	sock->domain = READ_ONCE(sqe->fd);
+	sock->type = READ_ONCE(sqe->off);
+	sock->protocol = READ_ONCE(sqe->len);
+	sock->file_slot = READ_ONCE(sqe->file_index);
+	sock->nofile = rlimit(RLIMIT_NOFILE);
+
+	/* everything outside the type mask is treated as flags */
+	type_flags = sock->type & ~SOCK_TYPE_MASK;
+	sock->flags = type_flags;
+
+	if (sock->file_slot && (type_flags & SOCK_CLOEXEC))
+		return -EINVAL;
+	if (type_flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
+		return -EINVAL;
+	return 0;
+}
+
+/*
+ * Issue handler for socket creation.
+ *
+ * Reserves a regular fd (unless a fixed file slot is used), creates the
+ * socket file and installs it in the fd or in the fixed slot.
+ *
+ * Returns the fd reservation error directly, -EAGAIN when creation reports
+ * -EAGAIN during a nonblocking issue, and IOU_OK otherwise with the result
+ * (fd, fixed-install result, or error) stored on the request.
+ */
+int io_socket(struct io_kiocb *req, unsigned int issue_flags)
+{
+	struct io_socket *sock = io_kiocb_to_cmd(req);
+	const bool fixed = sock->file_slot != 0;
+	struct file *file;
+	int ret, fd;
+
+	if (!fixed) {
+		fd = __get_unused_fd_flags(sock->flags, sock->nofile);
+		if (unlikely(fd < 0))
+			return fd;
+	}
+
+	file = __sys_socket_file(sock->domain, sock->type, sock->protocol);
+	if (!IS_ERR(file)) {
+		if (fixed) {
+			ret = io_fixed_fd_install(req, issue_flags, file,
+						  sock->file_slot);
+		} else {
+			fd_install(fd, file);
+			ret = fd;
+		}
+	} else {
+		/* give the reserved fd back before looking at the error */
+		if (!fixed)
+			put_unused_fd(fd);
+		ret = PTR_ERR(file);
+		if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
+			return -EAGAIN;
+		if (ret == -ERESTARTSYS)
+			ret = -EINTR;
+		req_set_fail(req);
+	}
+
+	io_req_set_res(req, ret, 0);
+	return IOU_OK;
+}
+
+/*
+ * Copy the user-supplied connect address into the request's async data.
+ * Returns the result of move_addr_to_kernel().
+ */
+int io_connect_prep_async(struct io_kiocb *req)
+{
+	struct io_connect *conn = io_kiocb_to_cmd(req);
+	struct io_async_connect *async = req->async_data;
+
+	return move_addr_to_kernel(conn->addr, conn->addr_len,
+				   &async->address);
+}
+
+/*
+ * Parse the SQE for a connect request into struct io_connect.
+ *
+ * Returns -EINVAL when len, buf_index, rw_flags or splice_fd_in is set,
+ * 0 otherwise.
+ */
+int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+{
+	struct io_connect *conn = io_kiocb_to_cmd(req);
+	bool stray_fields;
+
+	stray_fields = sqe->len || sqe->buf_index || sqe->rw_flags ||
+		       sqe->splice_fd_in;
+	if (stray_fields)
+		return -EINVAL;
+
+	conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr));
+	conn->addr_len = READ_ONCE(sqe->addr2);
+	return 0;
+}
+
+/*
+ * Issue handler for connect.
+ *
+ * The address comes from req->async_data when present, otherwise it is
+ * copied from userspace into an on-stack io_async_connect. When a
+ * nonblocking attempt reports -EAGAIN or -EINPROGRESS the address is
+ * preserved in async data (allocated here if needed) and -EAGAIN is
+ * returned. In every other case the result is stored on the request, the
+ * request is marked failed if the result is negative, and IOU_OK is
+ * returned.
+ */
+int io_connect(struct io_kiocb *req, unsigned int issue_flags)
+{
+ struct io_connect *connect = io_kiocb_to_cmd(req);
+ struct io_async_connect __io, *io;
+ unsigned file_flags;
+ int ret;
+ bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
+
+ if (req_has_async_data(req)) {
+ io = req->async_data;
+ } else {
+ ret = move_addr_to_kernel(connect->addr,
+ connect->addr_len,
+ &__io.address);
+ if (ret)
+ goto out;
+ io = &__io;
+ }
+
+ file_flags = force_nonblock ? O_NONBLOCK : 0;
+
+ ret = __sys_connect_file(req->file, &io->address,
+ connect->addr_len, file_flags);
+ if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) {
+ /* address already lives in async data: nothing to copy */
+ if (req_has_async_data(req))
+ return -EAGAIN;
+ if (io_alloc_async_data(req)) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ /* keep the on-stack address for the next attempt */
+ memcpy(req->async_data, &__io, sizeof(__io));
+ return -EAGAIN;
+ }
+ if (ret == -ERESTARTSYS)
+ ret = -EINTR;
+out:
+ if (ret < 0)
+ req_set_fail(req);
+ io_req_set_res(req, ret, 0);
+ return IOU_OK;
+}
+
+/* Free the io_async_msghdr that embeds the given cache entry. */
+void io_netmsg_cache_free(struct io_cache_entry *entry)
+{
+	struct io_async_msghdr *hdr;
+
+	hdr = container_of(entry, struct io_async_msghdr, cache);
+	kfree(hdr);
+}
+#endif