Diffstat (limited to 'io_uring')
 io_uring/io_uring.c |   3
 io_uring/kbuf.c     |   1
 io_uring/kbuf.h     |   1
 io_uring/msg_ring.c |   4
 io_uring/net.c      |  46
 io_uring/opdef.c    |   1
 io_uring/poll.c     |   2
 io_uring/rsrc.c     |  57
 io_uring/rsrc.h     |   3
 io_uring/zcrx.c     | 104
 io_uring/zcrx.h     |  11
 11 files changed, 146 insertions(+), 87 deletions(-)
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 74218c7b7604..211f7f5f507e 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -1647,11 +1647,12 @@ static void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags)
io_req_flags_t io_file_get_flags(struct file *file)
{
+ struct inode *inode = file_inode(file);
io_req_flags_t res = 0;
BUILD_BUG_ON(REQ_F_ISREG_BIT != REQ_F_SUPPORT_NOWAIT_BIT + 1);
- if (S_ISREG(file_inode(file)->i_mode))
+ if (S_ISREG(inode->i_mode) && !(inode->i_flags & S_ANON_INODE))
res |= REQ_F_ISREG;
if ((file->f_flags & O_NONBLOCK) || (file->f_mode & FMODE_NOWAIT))
res |= REQ_F_SUPPORT_NOWAIT;
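
Note on the io_uring.c hunk: anonymous inodes can report S_ISREG in i_mode, so the added S_ANON_INODE test keeps them from being classified as regular files when request flags are derived. Below is a minimal userspace sketch of the same flag-derivation pattern, with made-up flag names and a _Static_assert standing in for the BUILD_BUG_ON; the S_ANON_INODE test itself has no userspace equivalent.

#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>

enum {
	REQ_SUPPORT_NOWAIT_BIT,
	REQ_ISREG_BIT,
};

#define REQ_SUPPORT_NOWAIT	(1u << REQ_SUPPORT_NOWAIT_BIT)
#define REQ_ISREG		(1u << REQ_ISREG_BIT)

/* same role as the BUILD_BUG_ON(): the two flag bits must stay adjacent */
_Static_assert(REQ_ISREG_BIT == REQ_SUPPORT_NOWAIT_BIT + 1, "flag bits must stay adjacent");

static unsigned int file_get_flags(int fd)
{
	struct stat st;
	unsigned int res = 0;
	int fl = fcntl(fd, F_GETFL);

	if (fstat(fd, &st) == 0 && S_ISREG(st.st_mode))
		res |= REQ_ISREG;
	if (fl >= 0 && (fl & O_NONBLOCK))
		res |= REQ_SUPPORT_NOWAIT;
	return res;
}

int main(void)
{
	printf("stdin flags: %#x\n", file_get_flags(0));
	return 0;
}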
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index a8467f6aba54..6c676aedc9a2 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -271,6 +271,7 @@ static int io_ring_buffers_peek(struct io_kiocb *req, struct buf_sel_arg *arg,
if (len > arg->max_len) {
len = arg->max_len;
if (!(bl->flags & IOBL_INC)) {
+ arg->partial_map = 1;
if (iov != arg->iovs)
break;
buf->len = len;
diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h
index 2ec0b983ce24..6eb16ba6ed39 100644
--- a/io_uring/kbuf.h
+++ b/io_uring/kbuf.h
@@ -55,6 +55,7 @@ struct buf_sel_arg {
size_t max_len;
unsigned short nr_iovs;
unsigned short mode;
+ unsigned short partial_map;
};
void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
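
The kbuf.c/kbuf.h pair adds a partial_map indicator to provided-buffer selection: when a picked buffer has to be clamped to the remaining transfer budget, the caller learns the mapping was partial (net.c below turns that into IO_SR_MSG_PARTIAL_MAP). A rough userspace sketch of such a selection loop, with hypothetical names loosely modelled on struct buf_sel_arg:

#include <stddef.h>
#include <stdio.h>
#include <sys/uio.h>

/* hypothetical selection state, loosely modelled on struct buf_sel_arg */
struct sel_arg {
	struct iovec *iovs;
	size_t max_len;              /* remaining transfer budget */
	unsigned short nr_iovs;
	unsigned short partial_map;  /* set when a buffer was only partially consumed */
};

/* pick provided buffers until the budget runs out */
static int select_buffers(struct sel_arg *arg, const size_t *buf_lens, int nr_bufs)
{
	int picked = 0;

	for (int i = 0; i < nr_bufs && picked < arg->nr_iovs; i++) {
		size_t len = buf_lens[i];

		if (len > arg->max_len) {
			/* clamp: the last buffer is only partially mapped */
			len = arg->max_len;
			arg->partial_map = 1;
		}
		arg->iovs[picked].iov_base = NULL;  /* the real code points at the buffer */
		arg->iovs[picked].iov_len = len;
		picked++;
		arg->max_len -= len;
		if (!arg->max_len)
			break;
	}
	return picked;
}

int main(void)
{
	struct iovec iovs[4];
	struct sel_arg arg = { .iovs = iovs, .max_len = 5000, .nr_iovs = 4 };
	size_t lens[] = { 4096, 4096, 4096 };
	int n = select_buffers(&arg, lens, 3);

	printf("picked %d iovecs, partial_map=%u\n", n, arg.partial_map);
	return 0;
}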
diff --git a/io_uring/msg_ring.c b/io_uring/msg_ring.c
index 50a958e9c921..c73eca5943c0 100644
--- a/io_uring/msg_ring.c
+++ b/io_uring/msg_ring.c
@@ -82,7 +82,7 @@ static void io_msg_tw_complete(struct io_kiocb *req, io_tw_token_t tw)
spin_unlock(&ctx->msg_lock);
}
if (req)
- kmem_cache_free(req_cachep, req);
+ kfree_rcu(req, rcu_head);
percpu_ref_put(&ctx->refs);
}
@@ -90,7 +90,7 @@ static int io_msg_remote_post(struct io_ring_ctx *ctx, struct io_kiocb *req,
int res, u32 cflags, u64 user_data)
{
if (!READ_ONCE(ctx->submitter_task)) {
- kmem_cache_free(req_cachep, req);
+ kfree_rcu(req, rcu_head);
return -EOWNERDEAD;
}
req->opcode = IORING_OP_NOP;
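
The msg_ring.c change defers freeing of the request past an RCU grace period, since the target ring may still be inspecting it under rcu_read_lock(). A generic kernel-style sketch of that pattern follows; it is not the io_uring structures themselves, and it presumes the object embeds a struct rcu_head, as the diff's kfree_rcu(req, rcu_head) implies for io_kiocb.

#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/types.h>

struct msg_node {
	struct list_head list;
	u64 user_data;
	struct rcu_head rcu_head;	/* storage for the deferred kfree() */
};

/* caller holds the lock protecting the list; readers walk it under RCU */
static void msg_node_del(struct msg_node *node)
{
	list_del_rcu(&node->list);	/* unlink, but readers may still see it */
	kfree_rcu(node, rcu_head);	/* actual kfree() runs after a grace period */
}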
diff --git a/io_uring/net.c b/io_uring/net.c
index 3feceb2b5b97..4dac01ff7ac7 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -76,12 +76,17 @@ struct io_sr_msg {
u16 flags;
/* initialised and used only by !msg send variants */
u16 buf_group;
- bool retry;
+ unsigned short retry_flags;
void __user *msg_control;
/* used only for send zerocopy */
struct io_kiocb *notif;
};
+enum sr_retry_flags {
+ IO_SR_MSG_RETRY = 1,
+ IO_SR_MSG_PARTIAL_MAP = 2,
+};
+
/*
* Number of times we'll try and do receives if there's more data. If we
* exceed this limit, then add us to the back of the queue and retry from
@@ -188,7 +193,7 @@ static inline void io_mshot_prep_retry(struct io_kiocb *req,
req->flags &= ~REQ_F_BL_EMPTY;
sr->done_io = 0;
- sr->retry = false;
+ sr->retry_flags = 0;
sr->len = 0; /* get from the provided buffer */
req->buf_index = sr->buf_group;
}
@@ -401,7 +406,7 @@ int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
sr->done_io = 0;
- sr->retry = false;
+ sr->retry_flags = 0;
sr->len = READ_ONCE(sqe->len);
sr->flags = READ_ONCE(sqe->ioprio);
if (sr->flags & ~SENDMSG_FLAGS)
@@ -759,7 +764,7 @@ int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg);
sr->done_io = 0;
- sr->retry = false;
+ sr->retry_flags = 0;
if (unlikely(sqe->file_index || sqe->addr2))
return -EINVAL;
@@ -831,7 +836,7 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
cflags |= io_put_kbufs(req, this_ret, io_bundle_nbufs(kmsg, this_ret),
issue_flags);
- if (sr->retry)
+ if (sr->retry_flags & IO_SR_MSG_RETRY)
cflags = req->cqe.flags | (cflags & CQE_F_MASK);
/* bundle with no more immediate buffers, we're done */
if (req->flags & REQ_F_BL_EMPTY)
@@ -840,12 +845,12 @@ static inline bool io_recv_finish(struct io_kiocb *req, int *ret,
* If more is available AND it was a full transfer, retry and
* append to this one
*/
- if (!sr->retry && kmsg->msg.msg_inq > 1 && this_ret > 0 &&
+ if (!sr->retry_flags && kmsg->msg.msg_inq > 1 && this_ret > 0 &&
!iov_iter_count(&kmsg->msg.msg_iter)) {
req->cqe.flags = cflags & ~CQE_F_MASK;
sr->len = kmsg->msg.msg_inq;
sr->done_io += this_ret;
- sr->retry = true;
+ sr->retry_flags |= IO_SR_MSG_RETRY;
return false;
}
} else {
@@ -1084,6 +1089,14 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg
if (unlikely(ret < 0))
return ret;
+ if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->vec.iovec) {
+ kmsg->vec.nr = ret;
+ kmsg->vec.iovec = arg.iovs;
+ req->flags |= REQ_F_NEED_CLEANUP;
+ }
+ if (arg.partial_map)
+ sr->retry_flags |= IO_SR_MSG_PARTIAL_MAP;
+
/* special case 1 vec, can be a fast path */
if (ret == 1) {
sr->buf = arg.iovs[0].iov_base;
@@ -1092,11 +1105,6 @@ static int io_recv_buf_select(struct io_kiocb *req, struct io_async_msghdr *kmsg
}
iov_iter_init(&kmsg->msg.msg_iter, ITER_DEST, arg.iovs, ret,
arg.out_len);
- if (arg.iovs != &kmsg->fast_iov && arg.iovs != kmsg->vec.iovec) {
- kmsg->vec.nr = ret;
- kmsg->vec.iovec = arg.iovs;
- req->flags |= REQ_F_NEED_CLEANUP;
- }
} else {
void __user *buf;
@@ -1284,7 +1292,7 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
int ret;
zc->done_io = 0;
- zc->retry = false;
+ zc->retry_flags = 0;
if (unlikely(READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3)))
return -EINVAL;
@@ -1741,9 +1749,11 @@ int io_connect(struct io_kiocb *req, unsigned int issue_flags)
int ret;
bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK;
- if (unlikely(req->flags & REQ_F_FAIL)) {
- ret = -ECONNRESET;
- goto out;
+ if (connect->in_progress) {
+ struct poll_table_struct pt = { ._key = EPOLLERR };
+
+ if (vfs_poll(req->file, &pt) & EPOLLERR)
+ goto get_sock_err;
}
file_flags = force_nonblock ? O_NONBLOCK : 0;
@@ -1768,8 +1778,10 @@ int io_connect(struct io_kiocb *req, unsigned int issue_flags)
* which means the previous result is good. For both of these,
* grab the sock_error() and use that for the completion.
*/
- if (ret == -EBADFD || ret == -EISCONN)
+ if (ret == -EBADFD || ret == -EISCONN) {
+get_sock_err:
ret = sock_error(sock_from_file(req->file)->sk);
+ }
}
if (ret == -ERESTARTSYS)
ret = -EINTR;
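
Two things happen in net.c: the single bool sr->retry widens into the retry_flags bit field so IO_SR_MSG_PARTIAL_MAP can be tracked alongside IO_SR_MSG_RETRY, and io_connect() now probes a connect that is already in progress for EPOLLERR and completes with sock_error() rather than failing the request up front (which is also why the blanket req_set_fail() on EPOLLERR disappears from poll.c below). The kernel path mirrors the classic userspace recipe for finishing a non-blocking connect; a self-contained sketch of that recipe, using a placeholder documentation address:

#include <arpa/inet.h>
#include <errno.h>
#include <fcntl.h>
#include <netinet/in.h>
#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

/* finish a non-blocking connect: poll for completion, then read the pending error */
static int connect_nonblock(int fd, const struct sockaddr *sa, socklen_t len)
{
	if (connect(fd, sa, len) == 0)
		return 0;
	if (errno != EINPROGRESS)
		return -errno;

	struct pollfd pfd = { .fd = fd, .events = POLLOUT };
	int n = poll(&pfd, 1, 5000);
	if (n < 0)
		return -errno;
	if (n == 0)
		return -ETIMEDOUT;

	/* analogous to sock_error() in the io_connect() hunk above */
	int err = 0;
	socklen_t elen = sizeof(err);
	if (getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &elen) < 0)
		return -errno;
	return err ? -err : 0;
}

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	struct sockaddr_in sa = { .sin_family = AF_INET, .sin_port = htons(80) };

	inet_pton(AF_INET, "192.0.2.1", &sa.sin_addr);	/* placeholder address */
	fcntl(fd, F_SETFL, O_NONBLOCK);
	int ret = connect_nonblock(fd, (struct sockaddr *)&sa, sizeof(sa));
	printf("connect result: %s\n", ret ? strerror(-ret) : "ok");
	close(fd);
	return 0;
}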
diff --git a/io_uring/opdef.c b/io_uring/opdef.c
index 489384c0438b..78ef5976bf00 100644
--- a/io_uring/opdef.c
+++ b/io_uring/opdef.c
@@ -216,6 +216,7 @@ const struct io_issue_def io_issue_defs[] = {
},
[IORING_OP_FALLOCATE] = {
.needs_file = 1,
+ .hash_reg_file = 1,
.prep = io_fallocate_prep,
.issue = io_fallocate,
},
diff --git a/io_uring/poll.c b/io_uring/poll.c
index 8eb744eb9f4c..ddafa88b9fbe 100644
--- a/io_uring/poll.c
+++ b/io_uring/poll.c
@@ -273,8 +273,6 @@ static int io_poll_check_events(struct io_kiocb *req, io_tw_token_t tw)
return IOU_POLL_REISSUE;
}
}
- if (unlikely(req->cqe.res & EPOLLERR))
- req_set_fail(req);
if (req->apoll_events & EPOLLONESHOT)
return IOU_POLL_DONE;
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 794d4ae6f0bc..7409af671c9e 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -80,10 +80,21 @@ static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
return 0;
}
-int io_buffer_validate(struct iovec *iov)
+int io_validate_user_buf_range(u64 uaddr, u64 ulen)
{
- unsigned long tmp, acct_len = iov->iov_len + (PAGE_SIZE - 1);
+ unsigned long tmp, base = (unsigned long)uaddr;
+ unsigned long acct_len = (unsigned long)PAGE_ALIGN(ulen);
+ /* arbitrary limit, but we need something */
+ if (ulen > SZ_1G || !ulen)
+ return -EFAULT;
+ if (check_add_overflow(base, acct_len, &tmp))
+ return -EOVERFLOW;
+ return 0;
+}
+
+static int io_buffer_validate(struct iovec *iov)
+{
/*
* Don't impose further limits on the size and buffer
* constraints here, we'll -EINVAL later when IO is
@@ -91,17 +102,9 @@ int io_buffer_validate(struct iovec *iov)
*/
if (!iov->iov_base)
return iov->iov_len ? -EFAULT : 0;
- if (!iov->iov_len)
- return -EFAULT;
-
- /* arbitrary limit, but we need something */
- if (iov->iov_len > SZ_1G)
- return -EFAULT;
- if (check_add_overflow((unsigned long)iov->iov_base, acct_len, &tmp))
- return -EOVERFLOW;
-
- return 0;
+ return io_validate_user_buf_range((unsigned long)iov->iov_base,
+ iov->iov_len);
}
static void io_release_ubuf(void *priv)
@@ -109,8 +112,11 @@ static void io_release_ubuf(void *priv)
struct io_mapped_ubuf *imu = priv;
unsigned int i;
- for (i = 0; i < imu->nr_bvecs; i++)
- unpin_user_page(imu->bvec[i].bv_page);
+ for (i = 0; i < imu->nr_bvecs; i++) {
+ struct folio *folio = page_folio(imu->bvec[i].bv_page);
+
+ unpin_user_folio(folio, 1);
+ }
}
static struct io_mapped_ubuf *io_alloc_imu(struct io_ring_ctx *ctx,
@@ -732,6 +738,7 @@ bool io_check_coalesce_buffer(struct page **page_array, int nr_pages,
data->nr_pages_mid = folio_nr_pages(folio);
data->folio_shift = folio_shift(folio);
+ data->first_folio_page_idx = folio_page_idx(folio, page_array[0]);
/*
* Check if pages are contiguous inside a folio, and all folios have
@@ -825,7 +832,11 @@ static struct io_rsrc_node *io_sqe_buffer_register(struct io_ring_ctx *ctx,
if (coalesced)
imu->folio_shift = data.folio_shift;
refcount_set(&imu->refs, 1);
- off = (unsigned long) iov->iov_base & ((1UL << imu->folio_shift) - 1);
+
+ off = (unsigned long)iov->iov_base & ~PAGE_MASK;
+ if (coalesced)
+ off += data.first_folio_page_idx << PAGE_SHIFT;
+
node->buf = imu;
ret = 0;
@@ -841,8 +852,10 @@ done:
if (ret) {
if (imu)
io_free_imu(ctx, imu);
- if (pages)
- unpin_user_pages(pages, nr_pages);
+ if (pages) {
+ for (i = 0; i < nr_pages; i++)
+ unpin_user_folio(page_folio(pages[i]), 1);
+ }
io_cache_free(&ctx->node_cache, node);
node = ERR_PTR(ret);
}
@@ -1326,7 +1339,6 @@ static int io_vec_fill_bvec(int ddir, struct iov_iter *iter,
{
unsigned long folio_size = 1 << imu->folio_shift;
unsigned long folio_mask = folio_size - 1;
- u64 folio_addr = imu->ubuf & ~folio_mask;
struct bio_vec *res_bvec = vec->bvec;
size_t total_len = 0;
unsigned bvec_idx = 0;
@@ -1348,8 +1360,13 @@ static int io_vec_fill_bvec(int ddir, struct iov_iter *iter,
if (unlikely(check_add_overflow(total_len, iov_len, &total_len)))
return -EOVERFLOW;
- /* by using folio address it also accounts for bvec offset */
- offset = buf_addr - folio_addr;
+ offset = buf_addr - imu->ubuf;
+ /*
+ * Only the first bvec can have non zero bv_offset, account it
+ * here and work with full folios below.
+ */
+ offset += imu->bvec[0].bv_offset;
+
src_bvec = imu->bvec + (offset >> imu->folio_shift);
offset &= folio_mask;
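
rsrc.c splits the range sanity checks out into io_validate_user_buf_range() so zcrx.c can reuse them, and reworks the registered-buffer offset math to be folio-aware. A userspace sketch of the range check, using __builtin_add_overflow() as the check_add_overflow() analogue and locally defined constants:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE	4096UL
#define PAGE_ALIGN(x)	(((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))
#define SZ_1G		(1UL << 30)

/* sketch of the same checks: size limit, then wrap-around of base + aligned length */
static int validate_user_buf_range(uint64_t uaddr, uint64_t ulen)
{
	unsigned long base = (unsigned long)uaddr;
	unsigned long acct_len = (unsigned long)PAGE_ALIGN(ulen);
	unsigned long end;

	if (ulen > SZ_1G || !ulen)
		return -EFAULT;
	if (__builtin_add_overflow(base, acct_len, &end))
		return -EOVERFLOW;
	return 0;
}

int main(void)
{
	printf("%d\n", validate_user_buf_range(0x1000, 8192));       /* 0 */
	printf("%d\n", validate_user_buf_range(~0UL - 4096, 8192));  /* -EOVERFLOW */
	printf("%d\n", validate_user_buf_range(0x1000, 0));          /* -EFAULT */
	return 0;
}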
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index b52242852ff3..ba00ee6f0650 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -49,6 +49,7 @@ struct io_imu_folio_data {
unsigned int nr_pages_mid;
unsigned int folio_shift;
unsigned int nr_folios;
+ unsigned long first_folio_page_idx;
};
bool io_rsrc_cache_init(struct io_ring_ctx *ctx);
@@ -83,7 +84,7 @@ int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg,
unsigned size, unsigned type);
int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg,
unsigned int size, unsigned int type);
-int io_buffer_validate(struct iovec *iov);
+int io_validate_user_buf_range(u64 uaddr, u64 ulen);
bool io_check_coalesce_buffer(struct page **page_array, int nr_pages,
struct io_imu_folio_data *data);
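
first_folio_page_idx records where within its folio the first pinned page sits; for a coalesced registration the bvec offset becomes the offset inside the first page plus the whole pages preceding it in the folio, matching the new off computation in io_sqe_buffer_register() above. A tiny arithmetic sketch with made-up numbers:

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PAGE_MASK	(~(PAGE_SIZE - 1))

int main(void)
{
	/* hypothetical numbers: buffer starts at the 3rd page of a large folio, 0x100 in */
	unsigned long iov_base = 0x7f0000202100UL;	/* user address */
	unsigned long first_folio_page_idx = 2;		/* folio_page_idx() of the first pinned page */
	int coalesced = 1;

	unsigned long off = iov_base & ~PAGE_MASK;	/* offset within the first page */
	if (coalesced)
		off += first_folio_page_idx << PAGE_SHIFT;	/* plus whole pages before it in the folio */

	printf("bvec[0].bv_offset = %#lx\n", off);	/* prints 0x2100 */
	return 0;
}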
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index fe86606b9f30..adb1e426987e 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -26,12 +26,61 @@
#include "zcrx.h"
#include "rsrc.h"
+#define IO_DMA_ATTR (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING)
+
static inline struct io_zcrx_ifq *io_pp_to_ifq(struct page_pool *pp)
{
return pp->mp_priv;
}
-#define IO_DMA_ATTR (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING)
+static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *niov)
+{
+ struct net_iov_area *owner = net_iov_owner(niov);
+
+ return container_of(owner, struct io_zcrx_area, nia);
+}
+
+static inline struct page *io_zcrx_iov_page(const struct net_iov *niov)
+{
+ struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
+
+ return area->mem.pages[net_iov_idx(niov)];
+}
+
+static void io_release_area_mem(struct io_zcrx_mem *mem)
+{
+ if (mem->pages) {
+ unpin_user_pages(mem->pages, mem->nr_folios);
+ kvfree(mem->pages);
+ }
+}
+
+static int io_import_area(struct io_zcrx_ifq *ifq,
+ struct io_zcrx_mem *mem,
+ struct io_uring_zcrx_area_reg *area_reg)
+{
+ struct page **pages;
+ int nr_pages;
+ int ret;
+
+ ret = io_validate_user_buf_range(area_reg->addr, area_reg->len);
+ if (ret)
+ return ret;
+ if (!area_reg->addr)
+ return -EFAULT;
+ if (area_reg->addr & ~PAGE_MASK || area_reg->len & ~PAGE_MASK)
+ return -EINVAL;
+
+ pages = io_pin_pages((unsigned long)area_reg->addr, area_reg->len,
+ &nr_pages);
+ if (IS_ERR(pages))
+ return PTR_ERR(pages);
+
+ mem->pages = pages;
+ mem->nr_folios = nr_pages;
+ mem->size = area_reg->len;
+ return 0;
+}
static void __io_zcrx_unmap_area(struct io_zcrx_ifq *ifq,
struct io_zcrx_area *area, int nr_mapped)
@@ -70,8 +119,8 @@ static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
struct net_iov *niov = &area->nia.niovs[i];
dma_addr_t dma;
- dma = dma_map_page_attrs(ifq->dev, area->pages[i], 0, PAGE_SIZE,
- DMA_FROM_DEVICE, IO_DMA_ATTR);
+ dma = dma_map_page_attrs(ifq->dev, area->mem.pages[i], 0,
+ PAGE_SIZE, DMA_FROM_DEVICE, IO_DMA_ATTR);
if (dma_mapping_error(ifq->dev, dma))
break;
if (net_mp_niov_set_dma_addr(niov, dma)) {
@@ -118,13 +167,6 @@ struct io_zcrx_args {
static const struct memory_provider_ops io_uring_pp_zc_ops;
-static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *niov)
-{
- struct net_iov_area *owner = net_iov_owner(niov);
-
- return container_of(owner, struct io_zcrx_area, nia);
-}
-
static inline atomic_t *io_get_user_counter(struct net_iov *niov)
{
struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
@@ -147,13 +189,6 @@ static void io_zcrx_get_niov_uref(struct net_iov *niov)
atomic_inc(io_get_user_counter(niov));
}
-static inline struct page *io_zcrx_iov_page(const struct net_iov *niov)
-{
- struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
-
- return area->pages[net_iov_idx(niov)];
-}
-
static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq,
struct io_uring_zcrx_ifq_reg *reg,
struct io_uring_region_desc *rd)
@@ -187,15 +222,13 @@ static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq)
static void io_zcrx_free_area(struct io_zcrx_area *area)
{
- io_zcrx_unmap_area(area->ifq, area);
+ if (area->ifq)
+ io_zcrx_unmap_area(area->ifq, area);
+ io_release_area_mem(&area->mem);
kvfree(area->freelist);
kvfree(area->nia.niovs);
kvfree(area->user_refs);
- if (area->pages) {
- unpin_user_pages(area->pages, area->nr_folios);
- kvfree(area->pages);
- }
kfree(area);
}
@@ -204,37 +237,27 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq,
struct io_uring_zcrx_area_reg *area_reg)
{
struct io_zcrx_area *area;
- int i, ret, nr_pages, nr_iovs;
- struct iovec iov;
+ unsigned nr_iovs;
+ int i, ret;
if (area_reg->flags || area_reg->rq_area_token)
return -EINVAL;
if (area_reg->__resv1 || area_reg->__resv2[0] || area_reg->__resv2[1])
return -EINVAL;
- if (area_reg->addr & ~PAGE_MASK || area_reg->len & ~PAGE_MASK)
- return -EINVAL;
-
- iov.iov_base = u64_to_user_ptr(area_reg->addr);
- iov.iov_len = area_reg->len;
- ret = io_buffer_validate(&iov);
- if (ret)
- return ret;
ret = -ENOMEM;
area = kzalloc(sizeof(*area), GFP_KERNEL);
if (!area)
goto err;
- area->pages = io_pin_pages((unsigned long)area_reg->addr, area_reg->len,
- &nr_pages);
- if (IS_ERR(area->pages)) {
- ret = PTR_ERR(area->pages);
- area->pages = NULL;
+ ret = io_import_area(ifq, &area->mem, area_reg);
+ if (ret)
goto err;
- }
- area->nr_folios = nr_iovs = nr_pages;
+
+ nr_iovs = area->mem.size >> PAGE_SHIFT;
area->nia.num_niovs = nr_iovs;
+ ret = -ENOMEM;
area->nia.niovs = kvmalloc_array(nr_iovs, sizeof(area->nia.niovs[0]),
GFP_KERNEL | __GFP_ZERO);
if (!area->nia.niovs)
@@ -653,10 +676,7 @@ static int io_pp_zc_init(struct page_pool *pp)
static void io_pp_zc_destroy(struct page_pool *pp)
{
struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp);
- struct io_zcrx_area *area = ifq->area;
- if (WARN_ON_ONCE(area->free_count != area->nia.num_niovs))
- return;
percpu_ref_put(&ifq->ctx->refs);
}
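
io_import_area() now funnels the user range through io_validate_user_buf_range() and keeps only the zcrx-specific constraints locally: the address must be non-NULL and both address and length page aligned, after which the pages are pinned. A small userspace sketch of those extra checks; the pinning itself (io_pin_pages()) has no userspace analogue:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE	4096UL
#define PAGE_MASK	(~(PAGE_SIZE - 1))

/* sketch of the argument checks performed before pinning the area */
static int check_area_args(uint64_t addr, uint64_t len)
{
	if (!addr)
		return -EFAULT;				/* zcrx requires a real address */
	if ((addr & ~PAGE_MASK) || (len & ~PAGE_MASK))
		return -EINVAL;				/* both must be page aligned */
	return 0;
}

int main(void)
{
	printf("%d\n", check_area_args(0x200000, 2 << 20));	/* 0 */
	printf("%d\n", check_area_args(0x200100, 2 << 20));	/* -EINVAL: unaligned address */
	printf("%d\n", check_area_args(0, 2 << 20));		/* -EFAULT */
	return 0;
}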
diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h
index f2bc811f022c..64796c90851e 100644
--- a/io_uring/zcrx.h
+++ b/io_uring/zcrx.h
@@ -7,6 +7,13 @@
#include <net/page_pool/types.h>
#include <net/net_trackers.h>
+struct io_zcrx_mem {
+ unsigned long size;
+
+ struct page **pages;
+ unsigned long nr_folios;
+};
+
struct io_zcrx_area {
struct net_iov_area nia;
struct io_zcrx_ifq *ifq;
@@ -14,13 +21,13 @@ struct io_zcrx_area {
bool is_mapped;
u16 area_id;
- struct page **pages;
- unsigned long nr_folios;
/* freelist */
spinlock_t freelist_lock ____cacheline_aligned_in_smp;
u32 free_count;
u32 *freelist;
+
+ struct io_zcrx_mem mem;
};
struct io_zcrx_ifq {
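
The new struct io_zcrx_mem groups the pinned-memory state (pages, folio count, size) behind its own import/release helpers, so io_zcrx_area no longer frees pages directly and a different backing could slot in later. A hypothetical userspace mirror of that embed-plus-helper split, with invented names:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* the backing memory gets its own struct and its own release helper */
struct mem_backing {
	unsigned long size;
	void **pages;			/* stands in for struct page **pages */
	unsigned long nr_folios;
};

struct rx_area {
	struct mem_backing mem;		/* embedded, like io_zcrx_area.mem */
	unsigned int nr_bufs;
};

static void mem_release(struct mem_backing *mem)
{
	free(mem->pages);
	memset(mem, 0, sizeof(*mem));
}

static void area_free(struct rx_area *area)
{
	mem_release(&area->mem);	/* the area no longer touches pages directly */
	free(area);
}

int main(void)
{
	struct rx_area *area = calloc(1, sizeof(*area));

	if (!area)
		return 1;
	area->mem.nr_folios = 4;
	area->mem.pages = calloc(area->mem.nr_folios, sizeof(void *));
	area_free(area);
	printf("area torn down via its embedded backing\n");
	return 0;
}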