diff options
Diffstat (limited to 'io_uring/kbuf.c')
-rw-r--r-- | io_uring/kbuf.c | 231 |
1 files changed, 73 insertions, 158 deletions
diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index e1895952066e..b224c03056c5 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -45,10 +45,10 @@ static int io_buffer_add_list(struct io_ring_ctx *ctx, /* * Store buffer group ID and finally mark the list as visible. * The normal lookup doesn't care about the visibility as we're - * always under the ->uring_lock, but the RCU lookup from mmap does. + * always under the ->uring_lock, but lookups from mmap do. */ bl->bgid = bgid; - atomic_set(&bl->refs, 1); + guard(mutex)(&ctx->mmap_lock); return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL)); } @@ -353,17 +353,7 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, if (bl->flags & IOBL_BUF_RING) { i = bl->buf_ring->tail - bl->head; - if (bl->buf_nr_pages) { - int j; - - if (!(bl->flags & IOBL_MMAP)) { - for (j = 0; j < bl->buf_nr_pages; j++) - unpin_user_page(bl->buf_pages[j]); - } - io_pages_unmap(bl->buf_ring, &bl->buf_pages, - &bl->buf_nr_pages, bl->flags & IOBL_MMAP); - bl->flags &= ~IOBL_MMAP; - } + io_free_region(ctx, &bl->region); /* make sure it's seen as empty */ INIT_LIST_HEAD(&bl->buf_list); bl->flags &= ~IOBL_BUF_RING; @@ -386,12 +376,10 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, return i; } -void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl) +static void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl) { - if (atomic_dec_and_test(&bl->refs)) { - __io_remove_buffers(ctx, bl, -1U); - kfree_rcu(bl, rcu); - } + __io_remove_buffers(ctx, bl, -1U); + kfree(bl); } void io_destroy_buffers(struct io_ring_ctx *ctx) @@ -399,10 +387,17 @@ void io_destroy_buffers(struct io_ring_ctx *ctx) struct io_buffer_list *bl; struct list_head *item, *tmp; struct io_buffer *buf; - unsigned long index; - xa_for_each(&ctx->io_bl_xa, index, bl) { - xa_erase(&ctx->io_bl_xa, bl->bgid); + while (1) { + unsigned long index = 0; + + scoped_guard(mutex, &ctx->mmap_lock) { + bl = xa_find(&ctx->io_bl_xa, &index, ULONG_MAX, XA_PRESENT); + if (bl) + xa_erase(&ctx->io_bl_xa, bl->bgid); + } + if (!bl) + break; io_put_bl(ctx, bl); } @@ -422,7 +417,8 @@ void io_destroy_buffers(struct io_ring_ctx *ctx) static void io_destroy_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl) { - xa_erase(&ctx->io_bl_xa, bl->bgid); + scoped_guard(mutex, &ctx->mmap_lock) + WARN_ON_ONCE(xa_erase(&ctx->io_bl_xa, bl->bgid) != bl); io_put_bl(ctx, bl); } @@ -484,6 +480,8 @@ int io_provide_buffers_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe p->nbufs = tmp; p->addr = READ_ONCE(sqe->addr); p->len = READ_ONCE(sqe->len); + if (!p->len) + return -EINVAL; if (check_mul_overflow((unsigned long)p->len, (unsigned long)p->nbufs, &size)) @@ -597,11 +595,7 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) INIT_LIST_HEAD(&bl->buf_list); ret = io_buffer_add_list(ctx, bl, p->bgid); if (ret) { - /* - * Doesn't need rcu free as it was never visible, but - * let's keep it consistent throughout. - */ - kfree_rcu(bl, rcu); + kfree(bl); goto err; } } @@ -621,75 +615,14 @@ err: return IOU_OK; } -static int io_pin_pbuf_ring(struct io_uring_buf_reg *reg, - struct io_buffer_list *bl) -{ - struct io_uring_buf_ring *br = NULL; - struct page **pages; - int nr_pages, ret; - - pages = io_pin_pages(reg->ring_addr, - flex_array_size(br, bufs, reg->ring_entries), - &nr_pages); - if (IS_ERR(pages)) - return PTR_ERR(pages); - - br = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); - if (!br) { - ret = -ENOMEM; - goto error_unpin; - } - -#ifdef SHM_COLOUR - /* - * On platforms that have specific aliasing requirements, SHM_COLOUR - * is set and we must guarantee that the kernel and user side align - * nicely. We cannot do that if IOU_PBUF_RING_MMAP isn't set and - * the application mmap's the provided ring buffer. Fail the request - * if we, by chance, don't end up with aligned addresses. The app - * should use IOU_PBUF_RING_MMAP instead, and liburing will handle - * this transparently. - */ - if ((reg->ring_addr | (unsigned long) br) & (SHM_COLOUR - 1)) { - ret = -EINVAL; - goto error_unpin; - } -#endif - bl->buf_pages = pages; - bl->buf_nr_pages = nr_pages; - bl->buf_ring = br; - bl->flags |= IOBL_BUF_RING; - bl->flags &= ~IOBL_MMAP; - return 0; -error_unpin: - unpin_user_pages(pages, nr_pages); - kvfree(pages); - vunmap(br); - return ret; -} - -static int io_alloc_pbuf_ring(struct io_ring_ctx *ctx, - struct io_uring_buf_reg *reg, - struct io_buffer_list *bl) -{ - size_t ring_size; - - ring_size = reg->ring_entries * sizeof(struct io_uring_buf_ring); - - bl->buf_ring = io_pages_map(&bl->buf_pages, &bl->buf_nr_pages, ring_size); - if (IS_ERR(bl->buf_ring)) { - bl->buf_ring = NULL; - return -ENOMEM; - } - - bl->flags |= (IOBL_BUF_RING | IOBL_MMAP); - return 0; -} - int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) { struct io_uring_buf_reg reg; struct io_buffer_list *bl, *free_bl = NULL; + struct io_uring_region_desc rd; + struct io_uring_buf_ring *br; + unsigned long mmap_offset; + unsigned long ring_size; int ret; lockdep_assert_held(&ctx->uring_lock); @@ -701,19 +634,8 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) return -EINVAL; if (reg.flags & ~(IOU_PBUF_RING_MMAP | IOU_PBUF_RING_INC)) return -EINVAL; - if (!(reg.flags & IOU_PBUF_RING_MMAP)) { - if (!reg.ring_addr) - return -EFAULT; - if (reg.ring_addr & ~PAGE_MASK) - return -EINVAL; - } else { - if (reg.ring_addr) - return -EINVAL; - } - if (!is_power_of_2(reg.ring_entries)) return -EINVAL; - /* cannot disambiguate full vs empty due to head/tail size */ if (reg.ring_entries >= 65536) return -EINVAL; @@ -730,22 +652,48 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) if (!bl) return -ENOMEM; - if (!(reg.flags & IOU_PBUF_RING_MMAP)) - ret = io_pin_pbuf_ring(®, bl); - else - ret = io_alloc_pbuf_ring(ctx, ®, bl); + mmap_offset = (unsigned long)reg.bgid << IORING_OFF_PBUF_SHIFT; + ring_size = flex_array_size(br, bufs, reg.ring_entries); - if (!ret) { - bl->nr_entries = reg.ring_entries; - bl->mask = reg.ring_entries - 1; - if (reg.flags & IOU_PBUF_RING_INC) - bl->flags |= IOBL_INC; + memset(&rd, 0, sizeof(rd)); + rd.size = PAGE_ALIGN(ring_size); + if (!(reg.flags & IOU_PBUF_RING_MMAP)) { + rd.user_addr = reg.ring_addr; + rd.flags |= IORING_MEM_REGION_TYPE_USER; + } + ret = io_create_region_mmap_safe(ctx, &bl->region, &rd, mmap_offset); + if (ret) + goto fail; + br = io_region_get_ptr(&bl->region); - io_buffer_add_list(ctx, bl, reg.bgid); - return 0; +#ifdef SHM_COLOUR + /* + * On platforms that have specific aliasing requirements, SHM_COLOUR + * is set and we must guarantee that the kernel and user side align + * nicely. We cannot do that if IOU_PBUF_RING_MMAP isn't set and + * the application mmap's the provided ring buffer. Fail the request + * if we, by chance, don't end up with aligned addresses. The app + * should use IOU_PBUF_RING_MMAP instead, and liburing will handle + * this transparently. + */ + if (!(reg.flags & IOU_PBUF_RING_MMAP) && + ((reg.ring_addr | (unsigned long)br) & (SHM_COLOUR - 1))) { + ret = -EINVAL; + goto fail; } +#endif - kfree_rcu(free_bl, rcu); + bl->nr_entries = reg.ring_entries; + bl->mask = reg.ring_entries - 1; + bl->flags |= IOBL_BUF_RING; + bl->buf_ring = br; + if (reg.flags & IOU_PBUF_RING_INC) + bl->flags |= IOBL_INC; + io_buffer_add_list(ctx, bl, reg.bgid); + return 0; +fail: + io_free_region(ctx, &bl->region); + kfree(free_bl); return ret; } @@ -769,7 +717,9 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) if (!(bl->flags & IOBL_BUF_RING)) return -EINVAL; - xa_erase(&ctx->io_bl_xa, bl->bgid); + scoped_guard(mutex, &ctx->mmap_lock) + xa_erase(&ctx->io_bl_xa, bl->bgid); + io_put_bl(ctx, bl); return 0; } @@ -800,50 +750,15 @@ int io_register_pbuf_status(struct io_ring_ctx *ctx, void __user *arg) return 0; } -struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx, - unsigned long bgid) -{ - struct io_buffer_list *bl; - bool ret; - - /* - * We have to be a bit careful here - we're inside mmap and cannot grab - * the uring_lock. This means the buffer_list could be simultaneously - * going away, if someone is trying to be sneaky. Look it up under rcu - * so we know it's not going away, and attempt to grab a reference to - * it. If the ref is already zero, then fail the mapping. If successful, - * the caller will call io_put_bl() to drop the the reference at at the - * end. This may then safely free the buffer_list (and drop the pages) - * at that point, vm_insert_pages() would've already grabbed the - * necessary vma references. - */ - rcu_read_lock(); - bl = xa_load(&ctx->io_bl_xa, bgid); - /* must be a mmap'able buffer ring and have pages */ - ret = false; - if (bl && bl->flags & IOBL_MMAP) - ret = atomic_inc_not_zero(&bl->refs); - rcu_read_unlock(); - - if (ret) - return bl; - - return ERR_PTR(-EINVAL); -} - -int io_pbuf_mmap(struct file *file, struct vm_area_struct *vma) +struct io_mapped_region *io_pbuf_get_region(struct io_ring_ctx *ctx, + unsigned int bgid) { - struct io_ring_ctx *ctx = file->private_data; - loff_t pgoff = vma->vm_pgoff << PAGE_SHIFT; struct io_buffer_list *bl; - int bgid, ret; - bgid = (pgoff & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; - bl = io_pbuf_get_bl(ctx, bgid); - if (IS_ERR(bl)) - return PTR_ERR(bl); + lockdep_assert_held(&ctx->mmap_lock); - ret = io_uring_mmap_pages(ctx, vma, bl->buf_pages, bl->buf_nr_pages); - io_put_bl(ctx, bl); - return ret; + bl = xa_load(&ctx->io_bl_xa, bgid); + if (!bl || !(bl->flags & IOBL_BUF_RING)) + return NULL; + return &bl->region; } |