Diffstat (limited to 'io_uring/zcrx.c')
-rw-r--r-- | io_uring/zcrx.c | 407
1 file changed, 318 insertions(+), 89 deletions(-)
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index 80d4a6f71d29..1513431587a7 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -28,29 +28,215 @@
 #define IO_DMA_ATTR (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING)

-static void __io_zcrx_unmap_area(struct io_zcrx_ifq *ifq,
-				 struct io_zcrx_area *area, int nr_mapped)
+static inline struct io_zcrx_ifq *io_pp_to_ifq(struct page_pool *pp)
+{
+	return pp->mp_priv;
+}
+
+static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *niov)
+{
+	struct net_iov_area *owner = net_iov_owner(niov);
+
+	return container_of(owner, struct io_zcrx_area, nia);
+}
+
+static inline struct page *io_zcrx_iov_page(const struct net_iov *niov)
+{
+	struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
+
+	return area->mem.pages[net_iov_idx(niov)];
+}
+
+static void io_release_dmabuf(struct io_zcrx_mem *mem)
+{
+	if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER))
+		return;
+
+	if (mem->sgt)
+		dma_buf_unmap_attachment_unlocked(mem->attach, mem->sgt,
+						  DMA_FROM_DEVICE);
+	if (mem->attach)
+		dma_buf_detach(mem->dmabuf, mem->attach);
+	if (mem->dmabuf)
+		dma_buf_put(mem->dmabuf);
+
+	mem->sgt = NULL;
+	mem->attach = NULL;
+	mem->dmabuf = NULL;
+}
+
+static int io_import_dmabuf(struct io_zcrx_ifq *ifq,
+			    struct io_zcrx_mem *mem,
+			    struct io_uring_zcrx_area_reg *area_reg)
+{
+	unsigned long off = (unsigned long)area_reg->addr;
+	unsigned long len = (unsigned long)area_reg->len;
+	unsigned long total_size = 0;
+	struct scatterlist *sg;
+	int dmabuf_fd = area_reg->dmabuf_fd;
+	int i, ret;
+
+	if (WARN_ON_ONCE(!ifq->dev))
+		return -EFAULT;
+	if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER))
+		return -EINVAL;
+
+	mem->is_dmabuf = true;
+	mem->dmabuf = dma_buf_get(dmabuf_fd);
+	if (IS_ERR(mem->dmabuf)) {
+		ret = PTR_ERR(mem->dmabuf);
+		mem->dmabuf = NULL;
+		goto err;
+	}
+
+	mem->attach = dma_buf_attach(mem->dmabuf, ifq->dev);
+	if (IS_ERR(mem->attach)) {
+		ret = PTR_ERR(mem->attach);
+		mem->attach = NULL;
+		goto err;
+	}
+
+	mem->sgt = dma_buf_map_attachment_unlocked(mem->attach, DMA_FROM_DEVICE);
+	if (IS_ERR(mem->sgt)) {
+		ret = PTR_ERR(mem->sgt);
+		mem->sgt = NULL;
+		goto err;
+	}
+
+	for_each_sgtable_dma_sg(mem->sgt, sg, i)
+		total_size += sg_dma_len(sg);
+
+	if (total_size < off + len)
+		return -EINVAL;
+
+	mem->dmabuf_offset = off;
+	mem->size = len;
+	return 0;
+err:
+	io_release_dmabuf(mem);
+	return ret;
+}
+
+static int io_zcrx_map_area_dmabuf(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
+{
+	unsigned long off = area->mem.dmabuf_offset;
+	struct scatterlist *sg;
+	unsigned i, niov_idx = 0;
+
+	if (!IS_ENABLED(CONFIG_DMA_SHARED_BUFFER))
+		return -EINVAL;
+
+	for_each_sgtable_dma_sg(area->mem.sgt, sg, i) {
+		dma_addr_t dma = sg_dma_address(sg);
+		unsigned long sg_len = sg_dma_len(sg);
+		unsigned long sg_off = min(sg_len, off);
+
+		off -= sg_off;
+		sg_len -= sg_off;
+		dma += sg_off;
+
+		while (sg_len && niov_idx < area->nia.num_niovs) {
+			struct net_iov *niov = &area->nia.niovs[niov_idx];
+
+			if (net_mp_niov_set_dma_addr(niov, dma))
+				return 0;
+			sg_len -= PAGE_SIZE;
+			dma += PAGE_SIZE;
+			niov_idx++;
+		}
+	}
+	return niov_idx;
+}
+
+static int io_import_umem(struct io_zcrx_ifq *ifq,
+			  struct io_zcrx_mem *mem,
+			  struct io_uring_zcrx_area_reg *area_reg)
+{
+	struct page **pages;
+	int nr_pages;
+
+	if (area_reg->dmabuf_fd)
+		return -EINVAL;
+	if (!area_reg->addr)
+		return -EFAULT;
+	pages = io_pin_pages((unsigned long)area_reg->addr, area_reg->len,
+			     &nr_pages);
+	if (IS_ERR(pages))
+		return PTR_ERR(pages);
+
+	mem->pages = pages;
+	mem->nr_folios = nr_pages;
+	mem->size = area_reg->len;
+	return 0;
+}
+
+static void io_release_area_mem(struct io_zcrx_mem *mem)
+{
+	if (mem->is_dmabuf) {
+		io_release_dmabuf(mem);
+		return;
+	}
+	if (mem->pages) {
+		unpin_user_pages(mem->pages, mem->nr_folios);
+		kvfree(mem->pages);
+	}
+}
+
+static int io_import_area(struct io_zcrx_ifq *ifq,
+			  struct io_zcrx_mem *mem,
+			  struct io_uring_zcrx_area_reg *area_reg)
+{
+	int ret;
+
+	ret = io_validate_user_buf_range(area_reg->addr, area_reg->len);
+	if (ret)
+		return ret;
+	if (area_reg->addr & ~PAGE_MASK || area_reg->len & ~PAGE_MASK)
+		return -EINVAL;
+
+	if (area_reg->flags & IORING_ZCRX_AREA_DMABUF)
+		return io_import_dmabuf(ifq, mem, area_reg);
+	return io_import_umem(ifq, mem, area_reg);
+}
+
+static void io_zcrx_unmap_umem(struct io_zcrx_ifq *ifq,
+			       struct io_zcrx_area *area, int nr_mapped)
 {
 	int i;

 	for (i = 0; i < nr_mapped; i++) {
-		struct net_iov *niov = &area->nia.niovs[i];
-		dma_addr_t dma;
+		netmem_ref netmem = net_iov_to_netmem(&area->nia.niovs[i]);
+		dma_addr_t dma = page_pool_get_dma_addr_netmem(netmem);

-		dma = page_pool_get_dma_addr_netmem(net_iov_to_netmem(niov));
 		dma_unmap_page_attrs(ifq->dev, dma, PAGE_SIZE, DMA_FROM_DEVICE,
 				     IO_DMA_ATTR);
-		net_mp_niov_set_dma_addr(niov, 0);
 	}
 }

+static void __io_zcrx_unmap_area(struct io_zcrx_ifq *ifq,
+				 struct io_zcrx_area *area, int nr_mapped)
+{
+	int i;
+
+	if (area->mem.is_dmabuf)
+		io_release_dmabuf(&area->mem);
+	else
+		io_zcrx_unmap_umem(ifq, area, nr_mapped);
+
+	for (i = 0; i < area->nia.num_niovs; i++)
+		net_mp_niov_set_dma_addr(&area->nia.niovs[i], 0);
+}
+
 static void io_zcrx_unmap_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
 {
+	guard(mutex)(&ifq->dma_lock);
+
 	if (area->is_mapped)
 		__io_zcrx_unmap_area(ifq, area, area->nia.num_niovs);
+	area->is_mapped = false;
 }

-static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
+static int io_zcrx_map_area_umem(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
 {
 	int i;
@@ -58,8 +244,8 @@ static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
 		struct net_iov *niov = &area->nia.niovs[i];
 		dma_addr_t dma;

-		dma = dma_map_page_attrs(ifq->dev, area->pages[i], 0, PAGE_SIZE,
-					 DMA_FROM_DEVICE, IO_DMA_ATTR);
+		dma = dma_map_page_attrs(ifq->dev, area->mem.pages[i], 0,
+					 PAGE_SIZE, DMA_FROM_DEVICE, IO_DMA_ATTR);
 		if (dma_mapping_error(ifq->dev, dma))
 			break;
 		if (net_mp_niov_set_dma_addr(niov, dma)) {
@@ -68,9 +254,24 @@ static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
 			break;
 		}
 	}
+	return i;
+}

-	if (i != area->nia.num_niovs) {
-		__io_zcrx_unmap_area(ifq, area, i);
+static int io_zcrx_map_area(struct io_zcrx_ifq *ifq, struct io_zcrx_area *area)
+{
+	unsigned nr;
+
+	guard(mutex)(&ifq->dma_lock);
+	if (area->is_mapped)
+		return 0;
+
+	if (area->mem.is_dmabuf)
+		nr = io_zcrx_map_area_dmabuf(ifq, area);
+	else
+		nr = io_zcrx_map_area_umem(ifq, area);
+
+	if (nr != area->nia.num_niovs) {
+		__io_zcrx_unmap_area(ifq, area, nr);
 		return -EINVAL;
 	}
@@ -106,13 +307,6 @@ struct io_zcrx_args {

 static const struct memory_provider_ops io_uring_pp_zc_ops;

-static inline struct io_zcrx_area *io_zcrx_iov_to_area(const struct net_iov *niov)
-{
-	struct net_iov_area *owner = net_iov_owner(niov);
-
-	return container_of(owner, struct io_zcrx_area, nia);
-}
-
 static inline atomic_t *io_get_user_counter(struct net_iov *niov)
 {
 	struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
@@ -135,17 +329,12 @@ static void io_zcrx_get_niov_uref(struct net_iov *niov)
 	atomic_inc(io_get_user_counter(niov));
 }

-static inline struct page *io_zcrx_iov_page(const struct net_iov *niov)
-{
-	struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
-
-	return area->pages[net_iov_idx(niov)];
-}
-
 static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq,
 				 struct io_uring_zcrx_ifq_reg *reg,
-				 struct io_uring_region_desc *rd)
+				 struct io_uring_region_desc *rd,
+				 u32 id)
 {
+	u64 mmap_offset;
 	size_t off, size;
 	void *ptr;
 	int ret;
@@ -155,12 +344,14 @@ static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq,
 	if (size > rd->size)
 		return -EINVAL;

-	ret = io_create_region_mmap_safe(ifq->ctx, &ifq->ctx->zcrx_region, rd,
-					 IORING_MAP_OFF_ZCRX_REGION);
+	mmap_offset = IORING_MAP_OFF_ZCRX_REGION;
+	mmap_offset += id << IORING_OFF_PBUF_SHIFT;
+
+	ret = io_create_region(ifq->ctx, &ifq->region, rd, mmap_offset);
 	if (ret < 0)
 		return ret;

-	ptr = io_region_get_ptr(&ifq->ctx->zcrx_region);
+	ptr = io_region_get_ptr(&ifq->region);
 	ifq->rq_ring = (struct io_uring *)ptr;
 	ifq->rqes = (struct io_uring_zcrx_rqe *)(ptr + off);
 	return 0;
@@ -168,7 +359,7 @@ static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq,

 static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq)
 {
-	io_free_region(ifq->ctx, &ifq->ctx->zcrx_region);
+	io_free_region(ifq->ctx, &ifq->region);
 	ifq->rq_ring = NULL;
 	ifq->rqes = NULL;
 }
@@ -176,79 +367,69 @@ static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq)
 static void io_zcrx_free_area(struct io_zcrx_area *area)
 {
 	io_zcrx_unmap_area(area->ifq, area);
+	io_release_area_mem(&area->mem);

 	kvfree(area->freelist);
 	kvfree(area->nia.niovs);
 	kvfree(area->user_refs);
-	if (area->pages) {
-		unpin_user_pages(area->pages, area->nia.num_niovs);
-		kvfree(area->pages);
-	}
 	kfree(area);
 }

+#define IO_ZCRX_AREA_SUPPORTED_FLAGS	(IORING_ZCRX_AREA_DMABUF)
+
 static int io_zcrx_create_area(struct io_zcrx_ifq *ifq,
 			       struct io_zcrx_area **res,
 			       struct io_uring_zcrx_area_reg *area_reg)
 {
 	struct io_zcrx_area *area;
-	int i, ret, nr_pages;
-	struct iovec iov;
+	unsigned nr_iovs;
+	int i, ret;

-	if (area_reg->flags || area_reg->rq_area_token)
+	if (area_reg->flags & ~IO_ZCRX_AREA_SUPPORTED_FLAGS)
 		return -EINVAL;
-	if (area_reg->__resv1 || area_reg->__resv2[0] || area_reg->__resv2[1])
+	if (area_reg->rq_area_token)
 		return -EINVAL;
-	if (area_reg->addr & ~PAGE_MASK || area_reg->len & ~PAGE_MASK)
+	if (area_reg->__resv2[0] || area_reg->__resv2[1])
 		return -EINVAL;
-	iov.iov_base = u64_to_user_ptr(area_reg->addr);
-	iov.iov_len = area_reg->len;
-	ret = io_buffer_validate(&iov);
-	if (ret)
-		return ret;

 	ret = -ENOMEM;
 	area = kzalloc(sizeof(*area), GFP_KERNEL);
 	if (!area)
 		goto err;

-	area->pages = io_pin_pages((unsigned long)area_reg->addr, area_reg->len,
-				   &nr_pages);
-	if (IS_ERR(area->pages)) {
-		ret = PTR_ERR(area->pages);
-		area->pages = NULL;
+	ret = io_import_area(ifq, &area->mem, area_reg);
+	if (ret)
 		goto err;
-	}
-	area->nia.num_niovs = nr_pages;

-	area->nia.niovs = kvmalloc_array(nr_pages, sizeof(area->nia.niovs[0]),
+	nr_iovs = area->mem.size >> PAGE_SHIFT;
+	area->nia.num_niovs = nr_iovs;
+
+	ret = -ENOMEM;
+	area->nia.niovs = kvmalloc_array(nr_iovs, sizeof(area->nia.niovs[0]),
 					 GFP_KERNEL | __GFP_ZERO);
 	if (!area->nia.niovs)
 		goto err;

-	area->freelist = kvmalloc_array(nr_pages, sizeof(area->freelist[0]),
+	area->freelist = kvmalloc_array(nr_iovs, sizeof(area->freelist[0]),
 					GFP_KERNEL | __GFP_ZERO);
 	if (!area->freelist)
 		goto err;

-	for (i = 0; i < nr_pages; i++)
-		area->freelist[i] = i;
-
-	area->user_refs = kvmalloc_array(nr_pages, sizeof(area->user_refs[0]),
+	area->user_refs = kvmalloc_array(nr_iovs, sizeof(area->user_refs[0]),
 					 GFP_KERNEL | __GFP_ZERO);
 	if (!area->user_refs)
 		goto err;

-	for (i = 0; i < nr_pages; i++) {
+	for (i = 0; i < nr_iovs; i++) {
 		struct net_iov *niov = &area->nia.niovs[i];

 		niov->owner = &area->nia;
 		area->freelist[i] = i;
 		atomic_set(&area->user_refs[i], 0);
+		niov->type = NET_IOV_IOURING;
 	}

-	area->free_count = nr_pages;
+	area->free_count = nr_iovs;
 	area->ifq = ifq;
 	/* we're only supporting one area per ifq for now */
 	area->area_id = 0;
@@ -274,6 +455,7 @@ static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx)
 	ifq->ctx = ctx;
 	spin_lock_init(&ifq->lock);
 	spin_lock_init(&ifq->rq_lock);
+	mutex_init(&ifq->dma_lock);

 	return ifq;
 }
@@ -323,9 +505,20 @@ static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq)
 		put_device(ifq->dev);

 	io_free_rbuf_ring(ifq);
+	mutex_destroy(&ifq->dma_lock);
 	kfree(ifq);
 }

+struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx,
+					    unsigned int id)
+{
+	struct io_zcrx_ifq *ifq = xa_load(&ctx->zcrx_ctxs, id);
+
+	lockdep_assert_held(&ctx->mmap_lock);
+
+	return ifq ? &ifq->region : NULL;
+}
+
 int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
 			 struct io_uring_zcrx_ifq_reg __user *arg)
 {
@@ -335,6 +528,7 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
 	struct io_uring_region_desc rd;
 	struct io_zcrx_ifq *ifq;
 	int ret;
+	u32 id;

 	/*
 	 * 1. Interface queue allocation.
@@ -347,13 +541,12 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
 	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN &&
 	      ctx->flags & IORING_SETUP_CQE32))
 		return -EINVAL;
-	if (ctx->ifq)
-		return -EBUSY;
 	if (copy_from_user(&reg, arg, sizeof(reg)))
 		return -EFAULT;
 	if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd)))
 		return -EFAULT;
-	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
+	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)) ||
+	    reg.__resv2 || reg.zcrx_id)
 		return -EINVAL;
 	if (reg.if_rxq == -1 || !reg.rq_entries || reg.flags)
 		return -EINVAL;
@@ -370,30 +563,34 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
 	ifq = io_zcrx_ifq_alloc(ctx);
 	if (!ifq)
 		return -ENOMEM;
+	ifq->rq_entries = reg.rq_entries;

-	ret = io_allocate_rbuf_ring(ifq, &reg, &rd);
-	if (ret)
-		goto err;
+	scoped_guard(mutex, &ctx->mmap_lock) {
+		/* preallocate id */
+		ret = xa_alloc(&ctx->zcrx_ctxs, &id, NULL, xa_limit_31b, GFP_KERNEL);
+		if (ret)
+			goto ifq_free;
+	}

-	ret = io_zcrx_create_area(ifq, &ifq->area, &area);
+	ret = io_allocate_rbuf_ring(ifq, &reg, &rd, id);
 	if (ret)
 		goto err;

-	ifq->rq_entries = reg.rq_entries;
-
-	ret = -ENODEV;
 	ifq->netdev = netdev_get_by_index(current->nsproxy->net_ns, reg.if_idx,
 					  &ifq->netdev_tracker, GFP_KERNEL);
-	if (!ifq->netdev)
+	if (!ifq->netdev) {
+		ret = -ENODEV;
 		goto err;
+	}

 	ifq->dev = ifq->netdev->dev.parent;
-	ret = -EOPNOTSUPP;
-	if (!ifq->dev)
+	if (!ifq->dev) {
+		ret = -EOPNOTSUPP;
 		goto err;
+	}
 	get_device(ifq->dev);

-	ret = io_zcrx_map_area(ifq, ifq->area);
+	ret = io_zcrx_create_area(ifq, &ifq->area, &area);
 	if (ret)
 		goto err;
@@ -407,6 +604,14 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
 	reg.offsets.rqes = sizeof(struct io_uring);
 	reg.offsets.head = offsetof(struct io_uring, head);
 	reg.offsets.tail = offsetof(struct io_uring, tail);
+	reg.zcrx_id = id;
+
+	scoped_guard(mutex, &ctx->mmap_lock) {
+		/* publish ifq */
+		ret = -ENOMEM;
+		if (xa_store(&ctx->zcrx_ctxs, id, ifq, GFP_KERNEL))
+			goto err;
+	}

 	if (copy_to_user(arg, &reg, sizeof(reg)) ||
 	    copy_to_user(u64_to_user_ptr(reg.region_ptr), &rd, sizeof(rd)) ||
@@ -414,24 +619,34 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
 		ret = -EFAULT;
 		goto err;
 	}
-	ctx->ifq = ifq;
 	return 0;
 err:
+	scoped_guard(mutex, &ctx->mmap_lock)
+		xa_erase(&ctx->zcrx_ctxs, id);
+ifq_free:
 	io_zcrx_ifq_free(ifq);
 	return ret;
 }

 void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
 {
-	struct io_zcrx_ifq *ifq = ctx->ifq;
+	struct io_zcrx_ifq *ifq;
+	unsigned long id;

 	lockdep_assert_held(&ctx->uring_lock);

-	if (!ifq)
-		return;
+	while (1) {
+		scoped_guard(mutex, &ctx->mmap_lock) {
+			ifq = xa_find(&ctx->zcrx_ctxs, &id, ULONG_MAX, XA_PRESENT);
+			if (ifq)
+				xa_erase(&ctx->zcrx_ctxs, id);
+		}
+		if (!ifq)
+			break;
+		io_zcrx_ifq_free(ifq);
+	}

-	ctx->ifq = NULL;
-	io_zcrx_ifq_free(ifq);
+	xa_destroy(&ctx->zcrx_ctxs);
 }

 static struct net_iov *__io_zcrx_get_free_niov(struct io_zcrx_area *area)
@@ -488,12 +703,15 @@ static void io_zcrx_scrub(struct io_zcrx_ifq *ifq)

 void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx)
 {
+	struct io_zcrx_ifq *ifq;
+	unsigned long index;
+
 	lockdep_assert_held(&ctx->uring_lock);

-	if (!ctx->ifq)
-		return;
-	io_zcrx_scrub(ctx->ifq);
-	io_close_queue(ctx->ifq);
+	xa_for_each(&ctx->zcrx_ctxs, index, ifq) {
+		io_zcrx_scrub(ifq);
+		io_close_queue(ifq);
+	}
 }

 static inline u32 io_zcrx_rqring_entries(struct io_zcrx_ifq *ifq)
@@ -584,7 +802,7 @@ static void io_zcrx_refill_slow(struct page_pool *pp, struct io_zcrx_ifq *ifq)

 static netmem_ref io_pp_zc_alloc_netmems(struct page_pool *pp, gfp_t gfp)
 {
-	struct io_zcrx_ifq *ifq = pp->mp_priv;
+	struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp);

 	/* pp should already be ensuring that */
 	if (unlikely(pp->alloc.count))
@@ -616,7 +834,8 @@ static bool io_pp_zc_release_netmem(struct page_pool *pp, netmem_ref netmem)

 static int io_pp_zc_init(struct page_pool *pp)
 {
-	struct io_zcrx_ifq *ifq = pp->mp_priv;
+	struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp);
+	int ret;

 	if (WARN_ON_ONCE(!ifq))
 		return -EINVAL;
@@ -629,13 +848,17 @@ static int io_pp_zc_init(struct page_pool *pp)
 	if (pp->p.dma_dir != DMA_FROM_DEVICE)
 		return -EOPNOTSUPP;

+	ret = io_zcrx_map_area(ifq, ifq->area);
+	if (ret)
+		return ret;
+
 	percpu_ref_get(&ifq->ctx->refs);
 	return 0;
 }

 static void io_pp_zc_destroy(struct page_pool *pp)
 {
-	struct io_zcrx_ifq *ifq = pp->mp_priv;
+	struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp);
 	struct io_zcrx_area *area = ifq->area;

 	if (WARN_ON_ONCE(area->free_count != area->nia.num_niovs))
@@ -664,6 +887,9 @@ static void io_pp_uninstall(void *mp_priv, struct netdev_rx_queue *rxq)
 	struct io_zcrx_ifq *ifq = mp_priv;

 	io_zcrx_drop_netdev(ifq);
+	if (ifq->area)
+		io_zcrx_unmap_area(ifq, ifq->area);
+
 	p->mp_ops = NULL;
 	p->mp_priv = NULL;
 }
@@ -722,6 +948,9 @@ static ssize_t io_zcrx_copy_chunk(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
 	size_t copied = 0;
 	int ret = 0;

+	if (area->mem.is_dmabuf)
+		return -EFAULT;
+
 	while (len) {
 		size_t copy_size = min_t(size_t, PAGE_SIZE, len);
 		const int dst_off = 0;
@@ -789,8 +1018,8 @@ static int io_zcrx_recv_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
 		return io_zcrx_copy_frag(req, ifq, frag, off, len);

 	niov = netmem_to_net_iov(frag->netmem);
-	if (niov->pp->mp_ops != &io_uring_pp_zc_ops ||
-	    niov->pp->mp_priv != ifq)
+	if (!niov->pp || niov->pp->mp_ops != &io_uring_pp_zc_ops ||
+	    io_pp_to_ifq(niov->pp) != ifq)
 		return -EFAULT;

 	if (!io_zcrx_queue_cqe(req, niov, ifq, off + skb_frag_off(frag), len))
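With this change, an area registered with IORING_ZCRX_AREA_DMABUF takes the io_import_dmabuf() path instead of pinning user pages: addr is treated as a page-aligned offset into the dma-buf, len is the mapped length, and dmabuf_fd names the exporting fd. Below is a minimal userspace sketch of filling struct io_uring_zcrx_area_reg for that case; it only relies on the fields this patch reads (addr, len, flags, dmabuf_fd, rq_area_token, __resv2) and assumes a uapi <linux/io_uring.h> recent enough to define the zcrx structures and IORING_ZCRX_AREA_DMABUF. The surrounding IORING_REGISTER_ZCRX_IFQ registration flow is unchanged and not shown here.

/*
 * Illustrative sketch only, not part of the patch: prepare a
 * dmabuf-backed zcrx area descriptor. Field semantics are taken from
 * io_import_dmabuf()/io_import_area() above; consult the real uapi
 * header on the target kernel before relying on this layout.
 */
#include <linux/io_uring.h>
#include <string.h>

static void setup_dmabuf_area(struct io_uring_zcrx_area_reg *area,
			      int dmabuf_fd, __u64 off, __u64 len)
{
	memset(area, 0, sizeof(*area));		/* reserved fields must be zero */
	area->addr = off;			/* page-aligned offset into the dma-buf */
	area->len = len;			/* page-aligned length to expose */
	area->flags = IORING_ZCRX_AREA_DMABUF;	/* select the dmabuf import path */
	area->dmabuf_fd = dmabuf_fd;		/* fd obtained from the exporter */
	/* rq_area_token is filled in by the kernel on successful registration */
}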