Diffstat (limited to 'io_uring/kbuf.c')
-rw-r--r--    io_uring/kbuf.c    246
1 file changed, 84 insertions, 162 deletions
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c
index d407576ddfb7..8e72de7712ac 100644
--- a/io_uring/kbuf.c
+++ b/io_uring/kbuf.c
@@ -45,10 +45,10 @@ static int io_buffer_add_list(struct io_ring_ctx *ctx,
/*
* Store buffer group ID and finally mark the list as visible.
* The normal lookup doesn't care about the visibility as we're
- * always under the ->uring_lock, but the RCU lookup from mmap does.
+ * always under the ->uring_lock, but lookups from mmap do.
*/
bl->bgid = bgid;
- atomic_set(&bl->refs, 1);
+ guard(mutex)(&ctx->mmap_lock);
return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL));
}
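
The hunk above drops the per-list refcount and instead serializes the xarray store against mmap lookups with the scope-based guard(mutex)() helper from <linux/cleanup.h>. A minimal standalone sketch of that idiom, outside io_uring (the helper name is illustrative, not part of this patch):

#include <linux/cleanup.h>
#include <linux/mutex.h>
#include <linux/xarray.h>

static int store_entry(struct xarray *xa, unsigned long id, void *entry,
		       struct mutex *lock)
{
	/* guard(mutex) takes the lock here and releases it automatically
	 * when the enclosing scope ends, covering every return path */
	guard(mutex)(lock);

	return xa_err(xa_store(xa, id, entry, GFP_KERNEL));
}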
@@ -139,6 +139,7 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
struct io_uring_buf_ring *br = bl->buf_ring;
__u16 tail, head = bl->head;
struct io_uring_buf *buf;
+ void __user *ret;
tail = smp_load_acquire(&br->tail);
if (unlikely(tail == head))
@@ -153,6 +154,7 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
req->flags |= REQ_F_BUFFER_RING | REQ_F_BUFFERS_COMMIT;
req->buf_list = bl;
req->buf_index = buf->bid;
+ ret = u64_to_user_ptr(buf->addr);
if (issue_flags & IO_URING_F_UNLOCKED || !io_file_can_poll(req)) {
/*
@@ -168,7 +170,7 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len,
io_kbuf_commit(req, bl, *len, 1);
req->buf_list = NULL;
}
- return u64_to_user_ptr(buf->addr);
+ return ret;
}
void __user *io_buffer_select(struct io_kiocb *req, size_t *len,
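
The hunk above snapshots the user address into ret before the branch that may commit the buffer, so nothing dereferences the ring entry after the commit can recycle it. A minimal sketch of that read-then-commit ordering, using hypothetical types rather than the real io_uring structures:

#include <linux/kernel.h>
#include <linux/types.h>

struct demo_ring_entry {
	u64 addr;
	u32 len;
};

/* snapshot everything needed from the entry before the commit step,
 * after which the slot may be recycled or rewritten */
static void __user *demo_select(struct demo_ring_entry *entry,
				void (*commit)(struct demo_ring_entry *))
{
	void __user *uaddr = u64_to_user_ptr(entry->addr);

	commit(entry);
	return uaddr;
}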
@@ -351,17 +353,7 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx,
if (bl->flags & IOBL_BUF_RING) {
i = bl->buf_ring->tail - bl->head;
- if (bl->buf_nr_pages) {
- int j;
-
- if (!(bl->flags & IOBL_MMAP)) {
- for (j = 0; j < bl->buf_nr_pages; j++)
- unpin_user_page(bl->buf_pages[j]);
- }
- io_pages_unmap(bl->buf_ring, &bl->buf_pages,
- &bl->buf_nr_pages, bl->flags & IOBL_MMAP);
- bl->flags &= ~IOBL_MMAP;
- }
+ io_free_region(ctx, &bl->region);
/* make sure it's seen as empty */
INIT_LIST_HEAD(&bl->buf_list);
bl->flags &= ~IOBL_BUF_RING;
@@ -384,12 +376,10 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx,
return i;
}
-void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl)
+static void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl)
{
- if (atomic_dec_and_test(&bl->refs)) {
- __io_remove_buffers(ctx, bl, -1U);
- kfree_rcu(bl, rcu);
- }
+ __io_remove_buffers(ctx, bl, -1U);
+ kfree(bl);
}
void io_destroy_buffers(struct io_ring_ctx *ctx)
@@ -397,10 +387,17 @@ void io_destroy_buffers(struct io_ring_ctx *ctx)
struct io_buffer_list *bl;
struct list_head *item, *tmp;
struct io_buffer *buf;
- unsigned long index;
- xa_for_each(&ctx->io_bl_xa, index, bl) {
- xa_erase(&ctx->io_bl_xa, bl->bgid);
+ while (1) {
+ unsigned long index = 0;
+
+ scoped_guard(mutex, &ctx->mmap_lock) {
+ bl = xa_find(&ctx->io_bl_xa, &index, ULONG_MAX, XA_PRESENT);
+ if (bl)
+ xa_erase(&ctx->io_bl_xa, bl->bgid);
+ }
+ if (!bl)
+ break;
io_put_bl(ctx, bl);
}
@@ -418,6 +415,13 @@ void io_destroy_buffers(struct io_ring_ctx *ctx)
}
}
+static void io_destroy_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl)
+{
+ scoped_guard(mutex, &ctx->mmap_lock)
+ WARN_ON_ONCE(xa_erase(&ctx->io_bl_xa, bl->bgid) != bl);
+ io_put_bl(ctx, bl);
+}
+
int io_remove_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
{
struct io_provide_buf *p = io_kiocb_to_cmd(req, struct io_provide_buf);
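
io_destroy_buffers() and the new io_destroy_bl() above hold mmap_lock only around the xarray manipulation and drop it before the actual free. A standalone sketch of that scoped_guard() pattern (assumes <linux/cleanup.h>; names are illustrative):

#include <linux/cleanup.h>
#include <linux/mutex.h>
#include <linux/xarray.h>

static void *pop_first_entry(struct xarray *xa, struct mutex *lock)
{
	unsigned long index = 0;
	void *entry;

	/* the lock is held only for the body of the scoped_guard block,
	 * so the caller can free the returned entry without holding it */
	scoped_guard(mutex, lock) {
		entry = xa_find(xa, &index, ULONG_MAX, XA_PRESENT);
		if (entry)
			xa_erase(xa, index);
	}
	return entry;
}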
@@ -589,11 +593,7 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags)
INIT_LIST_HEAD(&bl->buf_list);
ret = io_buffer_add_list(ctx, bl, p->bgid);
if (ret) {
- /*
- * Doesn't need rcu free as it was never visible, but
- * let's keep it consistent throughout.
- */
- kfree_rcu(bl, rcu);
+ kfree(bl);
goto err;
}
}
@@ -613,75 +613,14 @@ err:
return IOU_OK;
}
-static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg,
- struct io_buffer_list *bl)
-{
- struct io_uring_buf_ring *br = NULL;
- struct page **pages;
- int nr_pages, ret;
-
- pages = io_pin_pages(reg->ring_addr,
- flex_array_size(br, bufs, reg->ring_entries),
- &nr_pages);
- if (IS_ERR(pages))
- return PTR_ERR(pages);
-
- br = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
- if (!br) {
- ret = -ENOMEM;
- goto error_unpin;
- }
-
-#ifdef SHM_COLOUR
- /*
- * On platforms that have specific aliasing requirements, SHM_COLOUR
- * is set and we must guarantee that the kernel and user side align
- * nicely. We cannot do that if IOU_PBUF_RING_MMAP isn't set and
- * the application mmap's the provided ring buffer. Fail the request
- * if we, by chance, don't end up with aligned addresses. The app
- * should use IOU_PBUF_RING_MMAP instead, and liburing will handle
- * this transparently.
- */
- if ((reg->ring_addr | (unsigned long) br) & (SHM_COLOUR - 1)) {
- ret = -EINVAL;
- goto error_unpin;
- }
-#endif
- bl->buf_pages = pages;
- bl->buf_nr_pages = nr_pages;
- bl->buf_ring = br;
- bl->flags |= IOBL_BUF_RING;
- bl->flags &= ~IOBL_MMAP;
- return 0;
-error_unpin:
- unpin_user_pages(pages, nr_pages);
- kvfree(pages);
- vunmap(br);
- return ret;
-}
-
-static int io_alloc_pbuf_ring(struct io_ring_ctx *ctx,
- struct io_uring_buf_reg *reg,
- struct io_buffer_list *bl)
-{
- size_t ring_size;
-
- ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring);
-
- bl->buf_ring = io_pages_map(&bl->buf_pages, &bl->buf_nr_pages, ring_size);
- if (IS_ERR(bl->buf_ring)) {
- bl->buf_ring = NULL;
- return -ENOMEM;
- }
-
- bl->flags |= (IOBL_BUF_RING | IOBL_MMAP);
- return 0;
-}
-
int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
{
struct io_uring_buf_reg reg;
struct io_buffer_list *bl, *free_bl = NULL;
+ struct io_uring_region_desc rd;
+ struct io_uring_buf_ring *br;
+ unsigned long mmap_offset;
+ unsigned long ring_size;
int ret;
lockdep_assert_held(&ctx->uring_lock);
@@ -693,19 +632,8 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
return -EINVAL;
if (reg.flags & ~(IOU_PBUF_RING_MMAP | IOU_PBUF_RING_INC))
return -EINVAL;
- if (!(reg.flags & IOU_PBUF_RING_MMAP)) {
- if (!reg.ring_addr)
- return -EFAULT;
- if (reg.ring_addr & ~PAGE_MASK)
- return -EINVAL;
- } else {
- if (reg.ring_addr)
- return -EINVAL;
- }
-
if (!is_power_of_2(reg.ring_entries))
return -EINVAL;
-
/* cannot disambiguate full vs empty due to head/tail size */
if (reg.ring_entries >= 65536)
return -EINVAL;
@@ -715,28 +643,55 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
/* if mapped buffer ring OR classic exists, don't allow */
if (bl->flags & IOBL_BUF_RING || !list_empty(&bl->buf_list))
return -EEXIST;
- } else {
- free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL);
- if (!bl)
- return -ENOMEM;
+ io_destroy_bl(ctx, bl);
}
- if (!(reg.flags & IOU_PBUF_RING_MMAP))
- ret = io_pin_pbuf_ring(&reg, bl);
- else
- ret = io_alloc_pbuf_ring(ctx, &reg, bl);
+ free_bl = bl = kzalloc(sizeof(*bl), GFP_KERNEL);
+ if (!bl)
+ return -ENOMEM;
- if (!ret) {
- bl->nr_entries = reg.ring_entries;
- bl->mask = reg.ring_entries - 1;
- if (reg.flags & IOU_PBUF_RING_INC)
- bl->flags |= IOBL_INC;
+ mmap_offset = (unsigned long)reg.bgid << IORING_OFF_PBUF_SHIFT;
+ ring_size = flex_array_size(br, bufs, reg.ring_entries);
- io_buffer_add_list(ctx, bl, reg.bgid);
- return 0;
+ memset(&rd, 0, sizeof(rd));
+ rd.size = PAGE_ALIGN(ring_size);
+ if (!(reg.flags & IOU_PBUF_RING_MMAP)) {
+ rd.user_addr = reg.ring_addr;
+ rd.flags |= IORING_MEM_REGION_TYPE_USER;
+ }
+ ret = io_create_region_mmap_safe(ctx, &bl->region, &rd, mmap_offset);
+ if (ret)
+ goto fail;
+ br = io_region_get_ptr(&bl->region);
+
+#ifdef SHM_COLOUR
+ /*
+ * On platforms that have specific aliasing requirements, SHM_COLOUR
+ * is set and we must guarantee that the kernel and user side align
+ * nicely. We cannot do that if IOU_PBUF_RING_MMAP isn't set and
+ * the application mmap's the provided ring buffer. Fail the request
+ * if we, by chance, don't end up with aligned addresses. The app
+ * should use IOU_PBUF_RING_MMAP instead, and liburing will handle
+ * this transparently.
+ */
+ if (!(reg.flags & IOU_PBUF_RING_MMAP) &&
+ ((reg.ring_addr | (unsigned long)br) & (SHM_COLOUR - 1))) {
+ ret = -EINVAL;
+ goto fail;
}
+#endif
- kfree_rcu(free_bl, rcu);
+ bl->nr_entries = reg.ring_entries;
+ bl->mask = reg.ring_entries - 1;
+ bl->flags |= IOBL_BUF_RING;
+ bl->buf_ring = br;
+ if (reg.flags & IOU_PBUF_RING_INC)
+ bl->flags |= IOBL_INC;
+ io_buffer_add_list(ctx, bl, reg.bgid);
+ return 0;
+fail:
+ io_free_region(ctx, &bl->region);
+ kfree(free_bl);
return ret;
}
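
The registration path above now describes the ring as a mapped region via io_create_region_mmap_safe() instead of pinning pages itself, but the userspace ABI is unchanged: either the application passes page-aligned memory in ring_addr, or it sets IOU_PBUF_RING_MMAP and mmap()s the kernel allocation afterwards. A rough userspace illustration of the two modes using liburing (error handling elided; wrapper name is illustrative):

#include <liburing.h>
#include <stdint.h>

/* mem == NULL requests kernel-allocated memory (IOU_PBUF_RING_MMAP);
 * otherwise mem must be page aligned and entries a power of two < 65536 */
static int register_pbuf_ring(struct io_uring *ring, void *mem,
			      unsigned int entries, unsigned short bgid)
{
	struct io_uring_buf_reg reg = {
		.ring_addr	= (uint64_t)(uintptr_t)mem,
		.ring_entries	= entries,
		.bgid		= bgid,
		.flags		= mem ? 0 : IOU_PBUF_RING_MMAP,
	};

	return io_uring_register_buf_ring(ring, &reg, 0);
}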
@@ -760,7 +715,9 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg)
if (!(bl->flags & IOBL_BUF_RING))
return -EINVAL;
- xa_erase(&ctx->io_bl_xa, bl->bgid);
+ scoped_guard(mutex, &ctx->mmap_lock)
+ xa_erase(&ctx->io_bl_xa, bl->bgid);
+
io_put_bl(ctx, bl);
return 0;
}
@@ -791,50 +748,15 @@ int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg)
return 0;
}
-struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx,
- unsigned long bgid)
+struct io_mapped_region *io_pbuf_get_region(struct io_ring_ctx *ctx,
+ unsigned int bgid)
{
struct io_buffer_list *bl;
- bool ret;
- /*
- * We have to be a bit careful here - we're inside mmap and cannot grab
- * the uring_lock. This means the buffer_list could be simultaneously
- * going away, if someone is trying to be sneaky. Look it up under rcu
- * so we know it's not going away, and attempt to grab a reference to
- * it. If the ref is already zero, then fail the mapping. If successful,
- * the caller will call io_put_bl() to drop the the reference at at the
- * end. This may then safely free the buffer_list (and drop the pages)
- * at that point, vm_insert_pages() would've already grabbed the
- * necessary vma references.
- */
- rcu_read_lock();
- bl = xa_load(&ctx->io_bl_xa, bgid);
- /* must be a mmap'able buffer ring and have pages */
- ret = false;
- if (bl && bl->flags & IOBL_MMAP)
- ret = atomic_inc_not_zero(&bl->refs);
- rcu_read_unlock();
+ lockdep_assert_held(&ctx->mmap_lock);
- if (ret)
- return bl;
-
- return ERR_PTR(-EINVAL);
-}
-
-int io_pbuf_mmap(struct file *file, struct vm_area_struct *vma)
-{
- struct io_ring_ctx *ctx = file->private_data;
- loff_t pgoff = vma->vm_pgoff << PAGE_SHIFT;
- struct io_buffer_list *bl;
- int bgid, ret;
-
- bgid = (pgoff & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT;
- bl = io_pbuf_get_bl(ctx, bgid);
- if (IS_ERR(bl))
- return PTR_ERR(bl);
-
- ret = io_uring_mmap_pages(ctx, vma, bl->buf_pages, bl->buf_nr_pages);
- io_put_bl(ctx, bl);
- return ret;
+ bl = xa_load(&ctx->io_bl_xa, bgid);
+ if (!bl || !(bl->flags & IOBL_BUF_RING))
+ return NULL;
+ return &bl->region;
}
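
io_pbuf_get_region() replaces the old RCU-plus-refcount lookup: per the lockdep assertion above, callers are expected to hold ctx->mmap_lock across both the lookup and the use of the region, which is what keeps the buffer list alive. A rough sketch of a caller honoring that contract (the real mmap glue lives in io_uring's memmap code; this helper is hypothetical):

#include <linux/io_uring_types.h>
#include "kbuf.h"

static int map_pbuf_region(struct io_ring_ctx *ctx, unsigned int bgid,
			   struct vm_area_struct *vma)
{
	struct io_mapped_region *region;

	guard(mutex)(&ctx->mmap_lock);

	region = io_pbuf_get_region(ctx, bgid);
	if (!region)
		return -EINVAL;

	/* mapping of the region into vma would happen here, still under
	 * mmap_lock so the buffer list cannot be torn down concurrently */
	return 0;
}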