From 943d0609d0571af092dc13456cbca70351e4d20e Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 29 Nov 2024 13:34:22 +0000 Subject: io_uring: rename ->resize_lock ->resize_lock is used for resizing rings, but it's a good idea to reuse it in other cases as well. Rename it to mmap_lock as it protects from races with mmap. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/68f705306f3ac4d2fb999eb80ea1615015ce9f7f.1732886067.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index fd4cdb0860a2..fafc1d779eb1 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -424,7 +424,7 @@ struct io_ring_ctx { * side will need to grab this lock, to prevent either side from * being run concurrently with the other. */ - struct mutex resize_lock; + struct mutex mmap_lock; /* * If IORING_SETUP_NO_MMAP is used, then the below holds -- cgit v1.2.3 From a730d2047d4ef822262c37f51ff7267f2f0e7167 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 29 Nov 2024 13:34:24 +0000 Subject: io_uring/memmap: flag vmap'ed regions Add internal flags for struct io_mapped_region. The first flag we need is IO_REGION_F_VMAP, which indicates that the pointer has to be unmapped on region destruction. For now all regions are vmap'ed, so it's set unconditionally. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/5a3d8046a038da97c0f8a8c8f1733fa3fc689d31.1732886067.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 5 +++-- io_uring/memmap.c | 14 ++++++++++---- io_uring/memmap.h | 2 +- 3 files changed, 14 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index fafc1d779eb1..0793a91b66a5 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -78,8 +78,9 @@ struct io_hash_table { struct io_mapped_region { struct page **pages; - void *vmap_ptr; - size_t nr_pages; + void *ptr; + unsigned nr_pages; + unsigned flags; }; /* diff --git a/io_uring/memmap.c b/io_uring/memmap.c index a0d4151d11af..31fb8c8ffe4e 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -202,14 +202,19 @@ void *__io_uaddr_map(struct page ***pages, unsigned short *npages, return ERR_PTR(-ENOMEM); } +enum { + /* memory was vmap'ed for the kernel, freeing the region vunmap's it */ + IO_REGION_F_VMAP = 1, +}; + void io_free_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr) { if (mr->pages) { unpin_user_pages(mr->pages, mr->nr_pages); kvfree(mr->pages); } - if (mr->vmap_ptr) - vunmap(mr->vmap_ptr); + if ((mr->flags & IO_REGION_F_VMAP) && mr->ptr) + vunmap(mr->ptr); if (mr->nr_pages && ctx->user) __io_unaccount_mem(ctx->user, mr->nr_pages); @@ -225,7 +230,7 @@ int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr, void *vptr; u64 end; - if (WARN_ON_ONCE(mr->pages || mr->vmap_ptr || mr->nr_pages)) + if (WARN_ON_ONCE(mr->pages || mr->ptr || mr->nr_pages)) return -EFAULT; if (memchr_inv(&reg->__resv, 0, sizeof(reg->__resv))) return -EINVAL; @@ -260,8 +265,9 @@ int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr, } mr->pages = pages; - mr->vmap_ptr = vptr; + mr->ptr = vptr; mr->nr_pages = nr_pages; + mr->flags |= IO_REGION_F_VMAP; return 0; out_free: if (pages_accounted) diff --git a/io_uring/memmap.h 
b/io_uring/memmap.h index f361a635b6c7..2096a8427277 100644 --- a/io_uring/memmap.h +++ b/io_uring/memmap.h @@ -28,7 +28,7 @@ int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr, static inline void *io_region_get_ptr(struct io_mapped_region *mr) { - return mr->vmap_ptr; + return mr->ptr; } static inline bool io_region_is_set(struct io_mapped_region *mr) -- cgit v1.2.3 From 8078486e1d53591ed946c943177339e59e3089e0 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 29 Nov 2024 13:34:34 +0000 Subject: io_uring: use region api for SQ Convert internal parts of the SQ managment to the region API. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/1fb73ced6b835cb319ab0fe1dc0b2e982a9a5650.1732886067.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 3 +-- io_uring/io_uring.c | 36 ++++++++++++++---------------------- io_uring/memmap.c | 3 +-- io_uring/register.c | 35 ++++++++++++++++------------------- 4 files changed, 32 insertions(+), 45 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 0793a91b66a5..808accf1776b 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -432,10 +432,9 @@ struct io_ring_ctx { * the gup'ed pages for the two rings, and the sqes. */ unsigned short n_ring_pages; - unsigned short n_sqe_pages; struct page **ring_pages; - struct page **sqe_pages; + struct io_mapped_region sq_region; /* used for optimised request parameter and wait argument passing */ struct io_mapped_region param_region; }; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 539fecde0244..bdd4bc984bc8 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2641,29 +2641,19 @@ static void *io_rings_map(struct io_ring_ctx *ctx, unsigned long uaddr, size); } -static void *io_sqes_map(struct io_ring_ctx *ctx, unsigned long uaddr, - size_t size) -{ - return __io_uaddr_map(&ctx->sqe_pages, &ctx->n_sqe_pages, uaddr, - size); -} - static void io_rings_free(struct io_ring_ctx *ctx) { if (!(ctx->flags & IORING_SETUP_NO_MMAP)) { io_pages_unmap(ctx->rings, &ctx->ring_pages, &ctx->n_ring_pages, true); - io_pages_unmap(ctx->sq_sqes, &ctx->sqe_pages, &ctx->n_sqe_pages, - true); } else { io_pages_free(&ctx->ring_pages, ctx->n_ring_pages); ctx->n_ring_pages = 0; - io_pages_free(&ctx->sqe_pages, ctx->n_sqe_pages); - ctx->n_sqe_pages = 0; vunmap(ctx->rings); - vunmap(ctx->sq_sqes); } + io_free_region(ctx, &ctx->sq_region); + ctx->rings = NULL; ctx->sq_sqes = NULL; } @@ -3481,9 +3471,10 @@ bool io_is_uring_fops(struct file *file) static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, struct io_uring_params *p) { + struct io_uring_region_desc rd; struct io_rings *rings; size_t size, sq_array_offset; - void *ptr; + int ret; /* make sure these are sane, as we already accounted them */ ctx->sq_entries = p->sq_entries; @@ -3519,17 +3510,18 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, return -EOVERFLOW; } - if (!(ctx->flags & IORING_SETUP_NO_MMAP)) - ptr = io_pages_map(&ctx->sqe_pages, &ctx->n_sqe_pages, size); - else - ptr = io_sqes_map(ctx, p->sq_off.user_addr, size); - - if (IS_ERR(ptr)) { + memset(&rd, 0, sizeof(rd)); + rd.size = PAGE_ALIGN(size); + if (ctx->flags & IORING_SETUP_NO_MMAP) { + rd.user_addr = p->sq_off.user_addr; + rd.flags |= IORING_MEM_REGION_TYPE_USER; + } + ret = io_create_region(ctx, &ctx->sq_region, &rd, IORING_OFF_SQES); + if (ret) { io_rings_free(ctx); - return PTR_ERR(ptr); + 
return ret; } - - ctx->sq_sqes = ptr; + ctx->sq_sqes = io_region_get_ptr(&ctx->sq_region); return 0; } diff --git a/io_uring/memmap.c b/io_uring/memmap.c index 9a182c8a4be1..b9aaa25182a5 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -474,8 +474,7 @@ __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) npages = min(ctx->n_ring_pages, (sz + PAGE_SIZE - 1) >> PAGE_SHIFT); return io_uring_mmap_pages(ctx, vma, ctx->ring_pages, npages); case IORING_OFF_SQES: - return io_uring_mmap_pages(ctx, vma, ctx->sqe_pages, - ctx->n_sqe_pages); + return io_region_mmap(ctx, &ctx->sq_region, vma); case IORING_OFF_PBUF_RING: return io_pbuf_mmap(file, vma); case IORING_MAP_OFF_PARAM_REGION: diff --git a/io_uring/register.c b/io_uring/register.c index 43dd6688a464..1cf47ad40dc1 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -368,11 +368,11 @@ static int io_register_clock(struct io_ring_ctx *ctx, */ struct io_ring_ctx_rings { unsigned short n_ring_pages; - unsigned short n_sqe_pages; struct page **ring_pages; - struct page **sqe_pages; - struct io_uring_sqe *sq_sqes; struct io_rings *rings; + + struct io_uring_sqe *sq_sqes; + struct io_mapped_region sq_region; }; static void io_register_free_rings(struct io_ring_ctx *ctx, @@ -382,14 +382,11 @@ static void io_register_free_rings(struct io_ring_ctx *ctx, if (!(p->flags & IORING_SETUP_NO_MMAP)) { io_pages_unmap(r->rings, &r->ring_pages, &r->n_ring_pages, true); - io_pages_unmap(r->sq_sqes, &r->sqe_pages, &r->n_sqe_pages, - true); } else { io_pages_free(&r->ring_pages, r->n_ring_pages); - io_pages_free(&r->sqe_pages, r->n_sqe_pages); vunmap(r->rings); - vunmap(r->sq_sqes); } + io_free_region(ctx, &r->sq_region); } #define swap_old(ctx, o, n, field) \ @@ -404,11 +401,11 @@ static void io_register_free_rings(struct io_ring_ctx *ctx, static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) { + struct io_uring_region_desc rd; struct io_ring_ctx_rings o = { }, n = { }, *to_free = NULL; size_t size, sq_array_offset; struct io_uring_params p; unsigned i, tail; - void *ptr; int ret; /* for single issuer, must be owner resizing */ @@ -469,16 +466,18 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) return -EOVERFLOW; } - if (!(p.flags & IORING_SETUP_NO_MMAP)) - ptr = io_pages_map(&n.sqe_pages, &n.n_sqe_pages, size); - else - ptr = __io_uaddr_map(&n.sqe_pages, &n.n_sqe_pages, - p.sq_off.user_addr, - size); - if (IS_ERR(ptr)) { + memset(&rd, 0, sizeof(rd)); + rd.size = PAGE_ALIGN(size); + if (p.flags & IORING_SETUP_NO_MMAP) { + rd.user_addr = p.sq_off.user_addr; + rd.flags |= IORING_MEM_REGION_TYPE_USER; + } + ret = io_create_region_mmap_safe(ctx, &n.sq_region, &rd, IORING_OFF_SQES); + if (ret) { io_register_free_rings(ctx, &p, &n); - return PTR_ERR(ptr); + return ret; } + n.sq_sqes = io_region_get_ptr(&n.sq_region); /* * If using SQPOLL, park the thread @@ -509,7 +508,6 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) * Now copy SQ and CQ entries, if any. If either of the destination * rings can't hold what is already there, then fail the operation. 
*/ - n.sq_sqes = ptr; tail = o.rings->sq.tail; if (tail - o.rings->sq.head > p.sq_entries) goto overflow; @@ -558,9 +556,8 @@ overflow: ctx->rings = n.rings; ctx->sq_sqes = n.sq_sqes; swap_old(ctx, o, n, n_ring_pages); - swap_old(ctx, o, n, n_sqe_pages); swap_old(ctx, o, n, ring_pages); - swap_old(ctx, o, n, sqe_pages); + swap_old(ctx, o, n, sq_region); to_free = &o; ret = 0; out: -- cgit v1.2.3 From 81a4058e0cd0f07139f088fbeb65bc488f687829 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 29 Nov 2024 13:34:35 +0000 Subject: io_uring: use region api for CQ Convert internal parts of the CQ/SQ array managment to the region API. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/46fc3c801290d6b1ac16023d78f6b8e685c87fd6.1732886067.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 8 +----- io_uring/io_uring.c | 36 +++++++++------------------ io_uring/memmap.c | 55 ++++++------------------------------------ io_uring/memmap.h | 4 --- io_uring/register.c | 35 ++++++++++++--------------- 5 files changed, 36 insertions(+), 102 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 808accf1776b..b63f44220e8b 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -427,14 +427,8 @@ struct io_ring_ctx { */ struct mutex mmap_lock; - /* - * If IORING_SETUP_NO_MMAP is used, then the below holds - * the gup'ed pages for the two rings, and the sqes. - */ - unsigned short n_ring_pages; - struct page **ring_pages; - struct io_mapped_region sq_region; + struct io_mapped_region ring_region; /* used for optimised request parameter and wait argument passing */ struct io_mapped_region param_region; }; diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index bdd4bc984bc8..8dd3fa9be993 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2634,26 +2634,10 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags, return READ_ONCE(rings->cq.head) == READ_ONCE(rings->cq.tail) ? 
ret : 0; } -static void *io_rings_map(struct io_ring_ctx *ctx, unsigned long uaddr, - size_t size) -{ - return __io_uaddr_map(&ctx->ring_pages, &ctx->n_ring_pages, uaddr, - size); -} - static void io_rings_free(struct io_ring_ctx *ctx) { - if (!(ctx->flags & IORING_SETUP_NO_MMAP)) { - io_pages_unmap(ctx->rings, &ctx->ring_pages, &ctx->n_ring_pages, - true); - } else { - io_pages_free(&ctx->ring_pages, ctx->n_ring_pages); - ctx->n_ring_pages = 0; - vunmap(ctx->rings); - } - io_free_region(ctx, &ctx->sq_region); - + io_free_region(ctx, &ctx->ring_region); ctx->rings = NULL; ctx->sq_sqes = NULL; } @@ -3485,15 +3469,17 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, if (size == SIZE_MAX) return -EOVERFLOW; - if (!(ctx->flags & IORING_SETUP_NO_MMAP)) - rings = io_pages_map(&ctx->ring_pages, &ctx->n_ring_pages, size); - else - rings = io_rings_map(ctx, p->cq_off.user_addr, size); - - if (IS_ERR(rings)) - return PTR_ERR(rings); + memset(&rd, 0, sizeof(rd)); + rd.size = PAGE_ALIGN(size); + if (ctx->flags & IORING_SETUP_NO_MMAP) { + rd.user_addr = p->cq_off.user_addr; + rd.flags |= IORING_MEM_REGION_TYPE_USER; + } + ret = io_create_region(ctx, &ctx->ring_region, &rd, IORING_OFF_CQ_RING); + if (ret) + return ret; + ctx->rings = rings = io_region_get_ptr(&ctx->ring_region); - ctx->rings = rings; if (!(ctx->flags & IORING_SETUP_NO_SQARRAY)) ctx->sq_array = (u32 *)((char *)rings + sq_array_offset); rings->sq_ring_mask = p->sq_entries - 1; diff --git a/io_uring/memmap.c b/io_uring/memmap.c index b9aaa25182a5..668b1c3579a2 100644 --- a/io_uring/memmap.c +++ b/io_uring/memmap.c @@ -120,18 +120,6 @@ void io_pages_unmap(void *ptr, struct page ***pages, unsigned short *npages, *npages = 0; } -void io_pages_free(struct page ***pages, int npages) -{ - struct page **page_array = *pages; - - if (!page_array) - return; - - unpin_user_pages(page_array, npages); - kvfree(page_array); - *pages = NULL; -} - struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages) { unsigned long start, end, nr_pages; @@ -174,34 +162,6 @@ struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages) return ERR_PTR(ret); } -void *__io_uaddr_map(struct page ***pages, unsigned short *npages, - unsigned long uaddr, size_t size) -{ - struct page **page_array; - unsigned int nr_pages; - void *page_addr; - - *npages = 0; - - if (uaddr & (PAGE_SIZE - 1) || !size) - return ERR_PTR(-EINVAL); - - nr_pages = 0; - page_array = io_pin_pages(uaddr, size, &nr_pages); - if (IS_ERR(page_array)) - return page_array; - - page_addr = vmap(page_array, nr_pages, VM_MAP, PAGE_KERNEL); - if (page_addr) { - *pages = page_array; - *npages = nr_pages; - return page_addr; - } - - io_pages_free(&page_array, nr_pages); - return ERR_PTR(-ENOMEM); -} - enum { /* memory was vmap'ed for the kernel, freeing the region vunmap's it */ IO_REGION_F_VMAP = 1, @@ -446,9 +406,10 @@ int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma, static int io_region_mmap(struct io_ring_ctx *ctx, struct io_mapped_region *mr, - struct vm_area_struct *vma) + struct vm_area_struct *vma, + unsigned max_pages) { - unsigned long nr_pages = mr->nr_pages; + unsigned long nr_pages = min(mr->nr_pages, max_pages); vm_flags_set(vma, VM_DONTEXPAND); return vm_insert_pages(vma, vma->vm_start, mr->pages, &nr_pages); @@ -459,7 +420,7 @@ __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) struct io_ring_ctx *ctx = file->private_data; size_t sz = vma->vm_end - vma->vm_start; long offset = vma->vm_pgoff 
<< PAGE_SHIFT; - unsigned int npages; + unsigned int page_limit; void *ptr; guard(mutex)(&ctx->mmap_lock); @@ -471,14 +432,14 @@ __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) switch (offset & IORING_OFF_MMAP_MASK) { case IORING_OFF_SQ_RING: case IORING_OFF_CQ_RING: - npages = min(ctx->n_ring_pages, (sz + PAGE_SIZE - 1) >> PAGE_SHIFT); - return io_uring_mmap_pages(ctx, vma, ctx->ring_pages, npages); + page_limit = (sz + PAGE_SIZE - 1) >> PAGE_SHIFT; + return io_region_mmap(ctx, &ctx->ring_region, vma, page_limit); case IORING_OFF_SQES: - return io_region_mmap(ctx, &ctx->sq_region, vma); + return io_region_mmap(ctx, &ctx->sq_region, vma, UINT_MAX); case IORING_OFF_PBUF_RING: return io_pbuf_mmap(file, vma); case IORING_MAP_OFF_PARAM_REGION: - return io_region_mmap(ctx, &ctx->param_region, vma); + return io_region_mmap(ctx, &ctx->param_region, vma, UINT_MAX); } return -EINVAL; diff --git a/io_uring/memmap.h b/io_uring/memmap.h index 2402bca3d700..7395996eb353 100644 --- a/io_uring/memmap.h +++ b/io_uring/memmap.h @@ -4,7 +4,6 @@ #define IORING_MAP_OFF_PARAM_REGION 0x20000000ULL struct page **io_pin_pages(unsigned long ubuf, unsigned long len, int *npages); -void io_pages_free(struct page ***pages, int npages); int io_uring_mmap_pages(struct io_ring_ctx *ctx, struct vm_area_struct *vma, struct page **pages, int npages); @@ -13,9 +12,6 @@ void *io_pages_map(struct page ***out_pages, unsigned short *npages, void io_pages_unmap(void *ptr, struct page ***pages, unsigned short *npages, bool put_pages); -void *__io_uaddr_map(struct page ***pages, unsigned short *npages, - unsigned long uaddr, size_t size); - #ifndef CONFIG_MMU unsigned int io_uring_nommu_mmap_capabilities(struct file *file); #endif diff --git a/io_uring/register.c b/io_uring/register.c index 1cf47ad40dc1..5e48413706ac 100644 --- a/io_uring/register.c +++ b/io_uring/register.c @@ -367,26 +367,19 @@ static int io_register_clock(struct io_ring_ctx *ctx, * either mapping or freeing. 
*/ struct io_ring_ctx_rings { - unsigned short n_ring_pages; - struct page **ring_pages; struct io_rings *rings; - struct io_uring_sqe *sq_sqes; + struct io_mapped_region sq_region; + struct io_mapped_region ring_region; }; static void io_register_free_rings(struct io_ring_ctx *ctx, struct io_uring_params *p, struct io_ring_ctx_rings *r) { - if (!(p->flags & IORING_SETUP_NO_MMAP)) { - io_pages_unmap(r->rings, &r->ring_pages, &r->n_ring_pages, - true); - } else { - io_pages_free(&r->ring_pages, r->n_ring_pages); - vunmap(r->rings); - } io_free_region(ctx, &r->sq_region); + io_free_region(ctx, &r->ring_region); } #define swap_old(ctx, o, n, field) \ @@ -439,13 +432,18 @@ static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg) if (size == SIZE_MAX) return -EOVERFLOW; - if (!(p.flags & IORING_SETUP_NO_MMAP)) - n.rings = io_pages_map(&n.ring_pages, &n.n_ring_pages, size); - else - n.rings = __io_uaddr_map(&n.ring_pages, &n.n_ring_pages, - p.cq_off.user_addr, size); - if (IS_ERR(n.rings)) - return PTR_ERR(n.rings); + memset(&rd, 0, sizeof(rd)); + rd.size = PAGE_ALIGN(size); + if (p.flags & IORING_SETUP_NO_MMAP) { + rd.user_addr = p.cq_off.user_addr; + rd.flags |= IORING_MEM_REGION_TYPE_USER; + } + ret = io_create_region_mmap_safe(ctx, &n.ring_region, &rd, IORING_OFF_CQ_RING); + if (ret) { + io_register_free_rings(ctx, &p, &n); + return ret; + } + n.rings = io_region_get_ptr(&n.ring_region); n.rings->sq_ring_mask = p.sq_entries - 1; n.rings->cq_ring_mask = p.cq_entries - 1; @@ -555,8 +553,7 @@ overflow: ctx->rings = n.rings; ctx->sq_sqes = n.sq_sqes; - swap_old(ctx, o, n, n_ring_pages); - swap_old(ctx, o, n, ring_pages); + swap_old(ctx, o, n, ring_region); swap_old(ctx, o, n, sq_region); to_free = &o; ret = 0; -- cgit v1.2.3 From 78fda3d056417ccb9921663383b12f771aa0dd43 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Fri, 29 Nov 2024 13:34:36 +0000 Subject: io_uring/kbuf: use mmap_lock to sync with mmap A preparation / cleanup patch simplifying the buf ring - mmap synchronisation. Instead of relying on RCU, which is trickier, do it by grabbing the mmap_lock when anyone tries to publish or remove a registered buffer to / from ->io_bl_xa. Modifications of the xarray should always be protected by both ->uring_lock and ->mmap_lock, while lookups should hold either of them. While a struct io_buffer_list is in the xarray, the mmap related fields like ->flags and ->buf_pages should stay stable. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/af13bde56ee1a26bcaefaa9aad37a9ea318a590e.1732886067.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 5 ++++ io_uring/kbuf.c | 56 ++++++++++++++++++------------------------ io_uring/kbuf.h | 1 - 3 files changed, 29 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index b63f44220e8b..73575d545d3c 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -294,6 +294,11 @@ struct io_ring_ctx { struct io_submit_state submit_state; + /* + * Modifications are protected by ->uring_lock and ->mmap_lock. + * The flags, buf_pages and buf_nr_pages fields should be stable + * once published. 
+ */ struct xarray io_bl_xa; struct io_hash_table cancel_table; diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index d407576ddfb7..662e928cc3b0 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -45,10 +45,11 @@ static int io_buffer_add_list(struct io_ring_ctx *ctx, /* * Store buffer group ID and finally mark the list as visible. * The normal lookup doesn't care about the visibility as we're - * always under the ->uring_lock, but the RCU lookup from mmap does. + * always under the ->uring_lock, but lookups from mmap do. */ bl->bgid = bgid; atomic_set(&bl->refs, 1); + guard(mutex)(&ctx->mmap_lock); return xa_err(xa_store(&ctx->io_bl_xa, bgid, bl, GFP_KERNEL)); } @@ -388,7 +389,7 @@ void io_put_bl(struct io_ring_ctx *ctx, struct io_buffer_list *bl) { if (atomic_dec_and_test(&bl->refs)) { __io_remove_buffers(ctx, bl, -1U); - kfree_rcu(bl, rcu); + kfree(bl); } } @@ -397,10 +398,17 @@ void io_destroy_buffers(struct io_ring_ctx *ctx) struct io_buffer_list *bl; struct list_head *item, *tmp; struct io_buffer *buf; - unsigned long index; - xa_for_each(&ctx->io_bl_xa, index, bl) { - xa_erase(&ctx->io_bl_xa, bl->bgid); + while (1) { + unsigned long index = 0; + + scoped_guard(mutex, &ctx->mmap_lock) { + bl = xa_find(&ctx->io_bl_xa, &index, ULONG_MAX, XA_PRESENT); + if (bl) + xa_erase(&ctx->io_bl_xa, bl->bgid); + } + if (!bl) + break; io_put_bl(ctx, bl); } @@ -589,11 +597,7 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) INIT_LIST_HEAD(&bl->buf_list); ret = io_buffer_add_list(ctx, bl, p->bgid); if (ret) { - /* - * Doesn't need rcu free as it was never visible, but - * let's keep it consistent throughout. - */ - kfree_rcu(bl, rcu); + kfree(bl); goto err; } } @@ -736,7 +740,7 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) return 0; } - kfree_rcu(free_bl, rcu); + kfree(free_bl); return ret; } @@ -760,7 +764,9 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) if (!(bl->flags & IOBL_BUF_RING)) return -EINVAL; - xa_erase(&ctx->io_bl_xa, bl->bgid); + scoped_guard(mutex, &ctx->mmap_lock) + xa_erase(&ctx->io_bl_xa, bl->bgid); + io_put_bl(ctx, bl); return 0; } @@ -795,29 +801,13 @@ struct io_buffer_list *io_pbuf_get_bl(struct io_ring_ctx *ctx, unsigned long bgid) { struct io_buffer_list *bl; - bool ret; - /* - * We have to be a bit careful here - we're inside mmap and cannot grab - * the uring_lock. This means the buffer_list could be simultaneously - * going away, if someone is trying to be sneaky. Look it up under rcu - * so we know it's not going away, and attempt to grab a reference to - * it. If the ref is already zero, then fail the mapping. If successful, - * the caller will call io_put_bl() to drop the the reference at at the - * end. This may then safely free the buffer_list (and drop the pages) - * at that point, vm_insert_pages() would've already grabbed the - * necessary vma references. 
- */ - rcu_read_lock(); bl = xa_load(&ctx->io_bl_xa, bgid); /* must be a mmap'able buffer ring and have pages */ - ret = false; - if (bl && bl->flags & IOBL_MMAP) - ret = atomic_inc_not_zero(&bl->refs); - rcu_read_unlock(); - - if (ret) - return bl; + if (bl && bl->flags & IOBL_MMAP) { + if (atomic_inc_not_zero(&bl->refs)) + return bl; + } return ERR_PTR(-EINVAL); } @@ -829,6 +819,8 @@ int io_pbuf_mmap(struct file *file, struct vm_area_struct *vma) struct io_buffer_list *bl; int bgid, ret; + lockdep_assert_held(&ctx->mmap_lock); + bgid = (pgoff & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; bl = io_pbuf_get_bl(ctx, bgid); if (IS_ERR(bl)) diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index 36aadfe5ac00..d5e4afcbfbb3 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -25,7 +25,6 @@ struct io_buffer_list { struct page **buf_pages; struct io_uring_buf_ring *buf_ring; }; - struct rcu_head rcu; }; __u16 bgid; -- cgit v1.2.3 From 5dbb3cbd060aa86a722d7d44278e537ae3f63081 Mon Sep 17 00:00:00 2001 From: Anuj Gupta Date: Thu, 28 Nov 2024 16:52:31 +0530 Subject: block: define set of integrity flags to be inherited by cloned bip Introduce BIP_CLONE_FLAGS describing integrity flags that should be inherited in the cloned bip from the parent. Suggested-by: Christoph Hellwig Signed-off-by: Anuj Gupta Reviewed-by: Christoph Hellwig Reviewed-by: Martin K. Petersen Reviewed-by: Keith Busch Link: https://lore.kernel.org/r/20241128112240.8867-2-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- block/bio-integrity.c | 2 +- include/linux/bio-integrity.h | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 2a4bd6611692..a448a25d13de 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -559,7 +559,7 @@ int bio_integrity_clone(struct bio *bio, struct bio *bio_src, bip->bip_vec = bip_src->bip_vec; bip->bip_iter = bip_src->bip_iter; - bip->bip_flags = bip_src->bip_flags & ~BIP_BLOCK_INTEGRITY; + bip->bip_flags = bip_src->bip_flags & BIP_CLONE_FLAGS; return 0; } diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h index dbf0f74c1529..0f0cf10222e8 100644 --- a/include/linux/bio-integrity.h +++ b/include/linux/bio-integrity.h @@ -30,6 +30,9 @@ struct bio_integrity_payload { struct bio_vec bip_inline_vecs[];/* embedded bvec array */ }; +#define BIP_CLONE_FLAGS (BIP_MAPPED_INTEGRITY | BIP_CTRL_NOCHECK | \ + BIP_DISK_NOCHECK | BIP_IP_CHECKSUM) + #ifdef CONFIG_BLK_DEV_INTEGRITY #define bip_for_each_vec(bvl, bip, iter) \ -- cgit v1.2.3 From fe8f4ca7107e968b0eb7328155c8811f2a19424a Mon Sep 17 00:00:00 2001 From: Anuj Gupta Date: Thu, 28 Nov 2024 16:52:33 +0530 Subject: block: modify bio_integrity_map_user to accept iov_iter as argument This patch refactors bio_integrity_map_user to accept iov_iter as argument. This is a prep patch. 
Signed-off-by: Anuj Gupta Signed-off-by: Kanchan Joshi Reviewed-by: Christoph Hellwig Reviewed-by: Keith Busch Link: https://lore.kernel.org/r/20241128112240.8867-4-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- block/bio-integrity.c | 12 +++++------- block/blk-integrity.c | 10 +++++++++- include/linux/bio-integrity.h | 5 ++--- 3 files changed, 16 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 4341b0d4efa1..f56d01cec689 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -302,16 +302,15 @@ static unsigned int bvec_from_pages(struct bio_vec *bvec, struct page **pages, return nr_bvecs; } -int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes) +int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); unsigned int align = blk_lim_dma_alignment_and_pad(&q->limits); struct page *stack_pages[UIO_FASTIOV], **pages = stack_pages; struct bio_vec stack_vec[UIO_FASTIOV], *bvec = stack_vec; + size_t offset, bytes = iter->count; unsigned int direction, nr_bvecs; - struct iov_iter iter; int ret, nr_vecs; - size_t offset; bool copy; if (bio_integrity(bio)) @@ -324,8 +323,7 @@ int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes) else direction = ITER_SOURCE; - iov_iter_ubuf(&iter, direction, ubuf, bytes); - nr_vecs = iov_iter_npages(&iter, BIO_MAX_VECS + 1); + nr_vecs = iov_iter_npages(iter, BIO_MAX_VECS + 1); if (nr_vecs > BIO_MAX_VECS) return -E2BIG; if (nr_vecs > UIO_FASTIOV) { @@ -335,8 +333,8 @@ int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes) pages = NULL; } - copy = !iov_iter_is_aligned(&iter, align, align); - ret = iov_iter_extract_pages(&iter, &pages, bytes, nr_vecs, 0, &offset); + copy = !iov_iter_is_aligned(iter, align, align); + ret = iov_iter_extract_pages(iter, &pages, bytes, nr_vecs, 0, &offset); if (unlikely(ret < 0)) goto free_bvec; diff --git a/block/blk-integrity.c b/block/blk-integrity.c index b180cac61a9d..4a29754f1bc2 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -115,8 +115,16 @@ EXPORT_SYMBOL(blk_rq_map_integrity_sg); int blk_rq_integrity_map_user(struct request *rq, void __user *ubuf, ssize_t bytes) { - int ret = bio_integrity_map_user(rq->bio, ubuf, bytes); + int ret; + struct iov_iter iter; + unsigned int direction; + if (op_is_write(req_op(rq))) + direction = ITER_DEST; + else + direction = ITER_SOURCE; + iov_iter_ubuf(&iter, direction, ubuf, bytes); + ret = bio_integrity_map_user(rq->bio, &iter); if (ret) return ret; diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h index 0f0cf10222e8..58ff9988433a 100644 --- a/include/linux/bio-integrity.h +++ b/include/linux/bio-integrity.h @@ -75,7 +75,7 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, gfp_t gfp, unsigned int nr); int bio_integrity_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset); -int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t len); +int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter); void bio_integrity_unmap_user(struct bio *bio); bool bio_integrity_prep(struct bio *bio); void bio_integrity_advance(struct bio *bio, unsigned int bytes_done); @@ -101,8 +101,7 @@ static inline void bioset_integrity_free(struct bio_set *bs) { } -static inline int bio_integrity_map_user(struct bio *bio, void __user *ubuf, - ssize_t len) +static int 
bio_integrity_map_user(struct bio *bio, struct iov_iter *iter) { return -EINVAL; } -- cgit v1.2.3 From 10783d0ba0d7731ec81d88c54f83cf0ff89d1c2a Mon Sep 17 00:00:00 2001 From: Anuj Gupta Date: Thu, 28 Nov 2024 16:52:34 +0530 Subject: fs, iov_iter: define meta io descriptor Add flags to describe checks for integrity meta buffer. Also, introduce a new 'uio_meta' structure that upper layer can use to pass the meta/integrity information. Signed-off-by: Kanchan Joshi Signed-off-by: Anuj Gupta Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20241128112240.8867-5-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- include/linux/uio.h | 9 +++++++++ include/uapi/linux/fs.h | 9 +++++++++ 2 files changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/uio.h b/include/linux/uio.h index 853f9de5aa05..8ada84e85447 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -82,6 +82,15 @@ struct iov_iter { }; }; +typedef __u16 uio_meta_flags_t; + +struct uio_meta { + uio_meta_flags_t flags; + u16 app_tag; + u64 seed; + struct iov_iter iter; +}; + static inline const struct iovec *iter_iov(const struct iov_iter *iter) { if (iter->iter_type == ITER_UBUF) diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 753971770733..9070ef19f0a3 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -40,6 +40,15 @@ #define BLOCK_SIZE_BITS 10 #define BLOCK_SIZE (1< Date: Thu, 28 Nov 2024 16:52:35 +0530 Subject: fs: introduce IOCB_HAS_METADATA for metadata Introduce an IOCB_HAS_METADATA flag for the kiocb struct, for handling requests containing meta payload. Signed-off-by: Anuj Gupta Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20241128112240.8867-6-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 7e29433c5ecc..2cc3d45da7b0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -348,6 +348,7 @@ struct readahead_control; #define IOCB_DIO_CALLER_COMP (1 << 22) /* kiocb is a read or write operation submitted by fs/aio.c. */ #define IOCB_AIO_RW (1 << 23) +#define IOCB_HAS_METADATA (1 << 24) /* for use in trace events */ #define TRACE_IOCB_STRINGS \ -- cgit v1.2.3 From 2c0487d8b1f1351d48a13b77b254a2bb6de49eb3 Mon Sep 17 00:00:00 2001 From: Anuj Gupta Date: Thu, 28 Nov 2024 16:52:37 +0530 Subject: block: introduce BIP_CHECK_GUARD/REFTAG/APPTAG bip_flags This patch introduces BIP_CHECK_GUARD/REFTAG/APPTAG bip_flags which indicate how the hardware should check the integrity payload. BIP_CHECK_GUARD/REFTAG are conversion of existing semantics, while BIP_CHECK_APPTAG is a new flag. The driver can now just rely on block layer flags, and doesn't need to know the integrity source. Submitter of PI decides which tags to check. This would also give us a unified interface for user and kernel generated integrity. 
Signed-off-by: Anuj Gupta Signed-off-by: Kanchan Joshi Reviewed-by: Christoph Hellwig Reviewed-by: Keith Busch Link: https://lore.kernel.org/r/20241128112240.8867-8-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- block/bio-integrity.c | 5 +++++ drivers/nvme/host/core.c | 11 +++-------- include/linux/bio-integrity.h | 6 +++++- 3 files changed, 13 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/block/bio-integrity.c b/block/bio-integrity.c index f56d01cec689..3bee43b87001 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -434,6 +434,11 @@ bool bio_integrity_prep(struct bio *bio) if (bi->csum_type == BLK_INTEGRITY_CSUM_IP) bip->bip_flags |= BIP_IP_CHECKSUM; + /* describe what tags to check in payload */ + if (bi->csum_type) + bip->bip_flags |= BIP_CHECK_GUARD; + if (bi->flags & BLK_INTEGRITY_REF_TAG) + bip->bip_flags |= BIP_CHECK_REFTAG; if (bio_integrity_add_page(bio, virt_to_page(buf), len, offset_in_page(buf)) < len) { printk(KERN_ERR "could not attach integrity payload\n"); diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index a970168a3014..766df130cd82 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1017,18 +1017,13 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, control |= NVME_RW_PRINFO_PRACT; } - switch (ns->head->pi_type) { - case NVME_NS_DPS_PI_TYPE3: + if (bio_integrity_flagged(req->bio, BIP_CHECK_GUARD)) control |= NVME_RW_PRINFO_PRCHK_GUARD; - break; - case NVME_NS_DPS_PI_TYPE1: - case NVME_NS_DPS_PI_TYPE2: - control |= NVME_RW_PRINFO_PRCHK_GUARD | - NVME_RW_PRINFO_PRCHK_REF; + if (bio_integrity_flagged(req->bio, BIP_CHECK_REFTAG)) { + control |= NVME_RW_PRINFO_PRCHK_REF; if (op == nvme_cmd_zone_append) control |= NVME_RW_APPEND_PIREMAP; nvme_set_ref_tag(ns, cmnd, req); - break; } } diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h index 58ff9988433a..fe2bfe122db2 100644 --- a/include/linux/bio-integrity.h +++ b/include/linux/bio-integrity.h @@ -11,6 +11,9 @@ enum bip_flags { BIP_DISK_NOCHECK = 1 << 3, /* disable disk integrity checking */ BIP_IP_CHECKSUM = 1 << 4, /* IP checksum */ BIP_COPY_USER = 1 << 5, /* Kernel bounce buffer in use */ + BIP_CHECK_GUARD = 1 << 6, /* guard check */ + BIP_CHECK_REFTAG = 1 << 7, /* reftag check */ + BIP_CHECK_APPTAG = 1 << 8, /* apptag check */ }; struct bio_integrity_payload { @@ -31,7 +34,8 @@ struct bio_integrity_payload { }; #define BIP_CLONE_FLAGS (BIP_MAPPED_INTEGRITY | BIP_CTRL_NOCHECK | \ - BIP_DISK_NOCHECK | BIP_IP_CHECKSUM) + BIP_DISK_NOCHECK | BIP_IP_CHECKSUM | \ + BIP_CHECK_GUARD | BIP_CHECK_REFTAG | BIP_CHECK_APPTAG) #ifdef CONFIG_BLK_DEV_INTEGRITY -- cgit v1.2.3 From 18623503a3a514780214850bf8ba8b03ea0f3a4b Mon Sep 17 00:00:00 2001 From: Anuj Gupta Date: Thu, 28 Nov 2024 16:52:39 +0530 Subject: scsi: add support for user-meta interface Add support for sending user-meta buffer. Set tags to be checked using flags specified by user/block-layer. With this change, BIP_CTRL_NOCHECK becomes unused. Remove it. 
Signed-off-by: Anuj Gupta Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20241128112240.8867-10-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- drivers/scsi/sd.c | 4 ++-- include/linux/bio-integrity.h | 16 +++++++--------- 2 files changed, 9 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 8947dab132d7..cb7ac8736b91 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -814,14 +814,14 @@ static unsigned char sd_setup_protect_cmnd(struct scsi_cmnd *scmd, if (bio_integrity_flagged(bio, BIP_IP_CHECKSUM)) scmd->prot_flags |= SCSI_PROT_IP_CHECKSUM; - if (bio_integrity_flagged(bio, BIP_CTRL_NOCHECK) == false) + if (bio_integrity_flagged(bio, BIP_CHECK_GUARD)) scmd->prot_flags |= SCSI_PROT_GUARD_CHECK; } if (dif != T10_PI_TYPE3_PROTECTION) { /* DIX/DIF Type 0, 1, 2 */ scmd->prot_flags |= SCSI_PROT_REF_INCREMENT; - if (bio_integrity_flagged(bio, BIP_CTRL_NOCHECK) == false) + if (bio_integrity_flagged(bio, BIP_CHECK_REFTAG)) scmd->prot_flags |= SCSI_PROT_REF_CHECK; } diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h index fe2bfe122db2..2195bc06dcde 100644 --- a/include/linux/bio-integrity.h +++ b/include/linux/bio-integrity.h @@ -7,13 +7,12 @@ enum bip_flags { BIP_BLOCK_INTEGRITY = 1 << 0, /* block layer owns integrity data */ BIP_MAPPED_INTEGRITY = 1 << 1, /* ref tag has been remapped */ - BIP_CTRL_NOCHECK = 1 << 2, /* disable HBA integrity checking */ - BIP_DISK_NOCHECK = 1 << 3, /* disable disk integrity checking */ - BIP_IP_CHECKSUM = 1 << 4, /* IP checksum */ - BIP_COPY_USER = 1 << 5, /* Kernel bounce buffer in use */ - BIP_CHECK_GUARD = 1 << 6, /* guard check */ - BIP_CHECK_REFTAG = 1 << 7, /* reftag check */ - BIP_CHECK_APPTAG = 1 << 8, /* apptag check */ + BIP_DISK_NOCHECK = 1 << 2, /* disable disk integrity checking */ + BIP_IP_CHECKSUM = 1 << 3, /* IP checksum */ + BIP_COPY_USER = 1 << 4, /* Kernel bounce buffer in use */ + BIP_CHECK_GUARD = 1 << 5, /* guard check */ + BIP_CHECK_REFTAG = 1 << 6, /* reftag check */ + BIP_CHECK_APPTAG = 1 << 7, /* apptag check */ }; struct bio_integrity_payload { @@ -33,8 +32,7 @@ struct bio_integrity_payload { struct bio_vec bip_inline_vecs[];/* embedded bvec array */ }; -#define BIP_CLONE_FLAGS (BIP_MAPPED_INTEGRITY | BIP_CTRL_NOCHECK | \ - BIP_DISK_NOCHECK | BIP_IP_CHECKSUM | \ +#define BIP_CLONE_FLAGS (BIP_MAPPED_INTEGRITY | BIP_IP_CHECKSUM | \ BIP_CHECK_GUARD | BIP_CHECK_REFTAG | BIP_CHECK_APPTAG) #ifdef CONFIG_BLK_DEV_INTEGRITY -- cgit v1.2.3 From 3d8b5a22d40435b4a7e58f06ae2cd3506b222898 Mon Sep 17 00:00:00 2001 From: Kanchan Joshi Date: Thu, 28 Nov 2024 16:52:40 +0530 Subject: block: add support to pass user meta buffer If an iocb contains metadata, extract that and prepare the bip. Based on flags specified by the user, set corresponding guard/app/ref tags to be checked in bip. 
Reviewed-by: Christoph Hellwig Signed-off-by: Anuj Gupta Signed-off-by: Kanchan Joshi Reviewed-by: Keith Busch Link: https://lore.kernel.org/r/20241128112240.8867-11-anuj20.g@samsung.com Signed-off-by: Jens Axboe --- block/bio-integrity.c | 50 +++++++++++++++++++++++++++++++++++++++++++ block/fops.c | 45 +++++++++++++++++++++++++++++--------- include/linux/bio-integrity.h | 7 ++++++ 3 files changed, 92 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 3bee43b87001..5d81ad9a3d20 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -364,6 +364,55 @@ free_bvec: return ret; } +static void bio_uio_meta_to_bip(struct bio *bio, struct uio_meta *meta) +{ + struct bio_integrity_payload *bip = bio_integrity(bio); + + if (meta->flags & IO_INTEGRITY_CHK_GUARD) + bip->bip_flags |= BIP_CHECK_GUARD; + if (meta->flags & IO_INTEGRITY_CHK_APPTAG) + bip->bip_flags |= BIP_CHECK_APPTAG; + if (meta->flags & IO_INTEGRITY_CHK_REFTAG) + bip->bip_flags |= BIP_CHECK_REFTAG; + + bip->app_tag = meta->app_tag; +} + +int bio_integrity_map_iter(struct bio *bio, struct uio_meta *meta) +{ + struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); + unsigned int integrity_bytes; + int ret; + struct iov_iter it; + + if (!bi) + return -EINVAL; + /* + * original meta iterator can be bigger. + * process integrity info corresponding to current data buffer only. + */ + it = meta->iter; + integrity_bytes = bio_integrity_bytes(bi, bio_sectors(bio)); + if (it.count < integrity_bytes) + return -EINVAL; + + /* should fit into two bytes */ + BUILD_BUG_ON(IO_INTEGRITY_VALID_FLAGS >= (1 << 16)); + + if (meta->flags && (meta->flags & ~IO_INTEGRITY_VALID_FLAGS)) + return -EINVAL; + + it.count = integrity_bytes; + ret = bio_integrity_map_user(bio, &it); + if (!ret) { + bio_uio_meta_to_bip(bio, meta); + bip_set_seed(bio_integrity(bio), meta->seed); + iov_iter_advance(&meta->iter, integrity_bytes); + meta->seed += bio_integrity_intervals(bi, bio_sectors(bio)); + } + return ret; +} + /** * bio_integrity_prep - Prepare bio for integrity I/O * @bio: bio to prepare @@ -564,6 +613,7 @@ int bio_integrity_clone(struct bio *bio, struct bio *bio_src, bip->bip_vec = bip_src->bip_vec; bip->bip_iter = bip_src->bip_iter; bip->bip_flags = bip_src->bip_flags & BIP_CLONE_FLAGS; + bip->app_tag = bip_src->app_tag; return 0; } diff --git a/block/fops.c b/block/fops.c index 13a67940d040..6d5c4fc5a216 100644 --- a/block/fops.c +++ b/block/fops.c @@ -54,6 +54,7 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, struct bio bio; ssize_t ret; + WARN_ON_ONCE(iocb->ki_flags & IOCB_HAS_METADATA); if (nr_pages <= DIO_INLINE_BIO_VECS) vecs = inline_vecs; else { @@ -124,12 +125,16 @@ static void blkdev_bio_end_io(struct bio *bio) { struct blkdev_dio *dio = bio->bi_private; bool should_dirty = dio->flags & DIO_SHOULD_DIRTY; + bool is_sync = dio->flags & DIO_IS_SYNC; if (bio->bi_status && !dio->bio.bi_status) dio->bio.bi_status = bio->bi_status; + if (!is_sync && (dio->iocb->ki_flags & IOCB_HAS_METADATA)) + bio_integrity_unmap_user(bio); + if (atomic_dec_and_test(&dio->ref)) { - if (!(dio->flags & DIO_IS_SYNC)) { + if (!is_sync) { struct kiocb *iocb = dio->iocb; ssize_t ret; @@ -221,14 +226,16 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, * a retry of this from blocking context. 
*/ if (unlikely(iov_iter_count(iter))) { - bio_release_pages(bio, false); - bio_clear_flag(bio, BIO_REFFED); - bio_put(bio); - blk_finish_plug(&plug); - return -EAGAIN; + ret = -EAGAIN; + goto fail; } bio->bi_opf |= REQ_NOWAIT; } + if (!is_sync && (iocb->ki_flags & IOCB_HAS_METADATA)) { + ret = bio_integrity_map_iter(bio, iocb->private); + if (unlikely(ret)) + goto fail; + } if (is_read) { if (dio->flags & DIO_SHOULD_DIRTY) @@ -269,6 +276,12 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, bio_put(&dio->bio); return ret; +fail: + bio_release_pages(bio, false); + bio_clear_flag(bio, BIO_REFFED); + bio_put(bio); + blk_finish_plug(&plug); + return ret; } static void blkdev_bio_end_io_async(struct bio *bio) @@ -286,6 +299,9 @@ static void blkdev_bio_end_io_async(struct bio *bio) ret = blk_status_to_errno(bio->bi_status); } + if (iocb->ki_flags & IOCB_HAS_METADATA) + bio_integrity_unmap_user(bio); + iocb->ki_complete(iocb, ret); if (dio->flags & DIO_SHOULD_DIRTY) { @@ -330,10 +346,8 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, bio_iov_bvec_set(bio, iter); } else { ret = bio_iov_iter_get_pages(bio, iter); - if (unlikely(ret)) { - bio_put(bio); - return ret; - } + if (unlikely(ret)) + goto out_bio_put; } dio->size = bio->bi_iter.bi_size; @@ -346,6 +360,13 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, task_io_account_write(bio->bi_iter.bi_size); } + if (iocb->ki_flags & IOCB_HAS_METADATA) { + ret = bio_integrity_map_iter(bio, iocb->private); + WRITE_ONCE(iocb->private, NULL); + if (unlikely(ret)) + goto out_bio_put; + } + if (iocb->ki_flags & IOCB_ATOMIC) bio->bi_opf |= REQ_ATOMIC; @@ -360,6 +381,10 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, submit_bio(bio); } return -EIOCBQUEUED; + +out_bio_put: + bio_put(bio); + return ret; } static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h index 2195bc06dcde..de0a6c9de4d1 100644 --- a/include/linux/bio-integrity.h +++ b/include/linux/bio-integrity.h @@ -23,6 +23,7 @@ struct bio_integrity_payload { unsigned short bip_vcnt; /* # of integrity bio_vecs */ unsigned short bip_max_vcnt; /* integrity bio_vec slots */ unsigned short bip_flags; /* control flags */ + u16 app_tag; /* application tag value */ struct bvec_iter bio_iter; /* for rewinding parent bio */ @@ -78,6 +79,7 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, gfp_t gfp, int bio_integrity_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset); int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter); +int bio_integrity_map_iter(struct bio *bio, struct uio_meta *meta); void bio_integrity_unmap_user(struct bio *bio); bool bio_integrity_prep(struct bio *bio); void bio_integrity_advance(struct bio *bio, unsigned int bytes_done); @@ -108,6 +110,11 @@ static int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter) return -EINVAL; } +static inline int bio_integrity_map_iter(struct bio *bio, struct uio_meta *meta) +{ + return -EINVAL; +} + static inline void bio_integrity_unmap_user(struct bio *bio) { } -- cgit v1.2.3 From 546d191427cf5cf3215529744c2ea8558f0279db Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 29 Nov 2024 15:53:58 -0700 Subject: block: make bio_integrity_map_user() static inline If CONFIG_BLK_DEV_INTEGRITY isn't set, then the dummy helper must be static inline to avoid complaints about the function being unused. 
Fixes: fe8f4ca7107e ("block: modify bio_integrity_map_user to accept iov_iter as argument") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202411300229.y7h60mDg-lkp@intel.com/ Signed-off-by: Jens Axboe --- include/linux/bio-integrity.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bio-integrity.h b/include/linux/bio-integrity.h index de0a6c9de4d1..802f52e38efd 100644 --- a/include/linux/bio-integrity.h +++ b/include/linux/bio-integrity.h @@ -105,7 +105,7 @@ static inline void bioset_integrity_free(struct bio_set *bs) { } -static int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter) +static inline int bio_integrity_map_user(struct bio *bio, struct iov_iter *iter) { return -EINVAL; } -- cgit v1.2.3 From 1143be17d7acb02a7c4dba6169a33534983f4960 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 17 Dec 2024 07:44:25 -0700 Subject: io_uring/rw: don't mask in f_iocb_flags A previous commit changed from overwriting kiocb->ki_flags with ->f_iocb_flags to masking it in. This breaks for retry situations, where we don't necessarily want to retain previously set flags, like IOCB_NOWAIT. The use case needs IOCB_HAS_METADATA to be persistent, but the change makes all flags persistent, which is an issue. Add a request flag to track whether the request has metadata or not, as that is persistent across issues. Fixes: 59a7d12a7fb5 ("io_uring: introduce attributes for read/write and PI support") Signed-off-by: Jens Axboe --- include/linux/io_uring_types.h | 3 +++ io_uring/rw.c | 18 +++++++++--------- 2 files changed, 12 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h index 73575d545d3c..623d8e798a11 100644 --- a/include/linux/io_uring_types.h +++ b/include/linux/io_uring_types.h @@ -480,6 +480,7 @@ enum { REQ_F_BL_NO_RECYCLE_BIT, REQ_F_BUFFERS_COMMIT_BIT, REQ_F_BUF_NODE_BIT, + REQ_F_HAS_METADATA_BIT, /* not a real bit, just to check we're not overflowing the space */ __REQ_F_LAST_BIT, @@ -560,6 +561,8 @@ enum { REQ_F_BUFFERS_COMMIT = IO_REQ_FLAG(REQ_F_BUFFERS_COMMIT_BIT), /* buf node is valid */ REQ_F_BUF_NODE = IO_REQ_FLAG(REQ_F_BUF_NODE_BIT), + /* request has read/write metadata assigned */ + REQ_F_HAS_METADATA = IO_REQ_FLAG(REQ_F_HAS_METADATA_BIT), }; typedef void (*io_req_tw_func_t)(struct io_kiocb *req, struct io_tw_state *ts); diff --git a/io_uring/rw.c b/io_uring/rw.c index bdfc3faef85d..d0ac4a51420e 100644 --- a/io_uring/rw.c +++ b/io_uring/rw.c @@ -283,7 +283,7 @@ static int io_prep_rw_pi(struct io_kiocb *req, struct io_rw *rw, int ddir, pi_attr.len, &io->meta.iter); if (unlikely(ret < 0)) return ret; - rw->kiocb.ki_flags |= IOCB_HAS_METADATA; + req->flags |= REQ_F_HAS_METADATA; io_meta_save_state(io); return ret; } @@ -787,18 +787,17 @@ static bool io_rw_should_retry(struct io_kiocb *req) struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); struct kiocb *kiocb = &rw->kiocb; - /* never retry for NOWAIT, we just complete with -EAGAIN */ - if (req->flags & REQ_F_NOWAIT) + /* + * Never retry for NOWAIT or a request with metadata, we just complete + * with -EAGAIN. 
+ */ + if (req->flags & (REQ_F_NOWAIT | REQ_F_HAS_METADATA)) return false; /* Only for buffered IO */ if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_HIPRI)) return false; - /* never retry for meta io */ - if (kiocb->ki_flags & IOCB_HAS_METADATA) - return false; - /* * just use poll if we can, and don't attempt if the fs doesn't * support callback based unlocks @@ -849,7 +848,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type) if (!(req->flags & REQ_F_FIXED_FILE)) req->flags |= io_file_get_flags(file); - kiocb->ki_flags |= file->f_iocb_flags; + kiocb->ki_flags = file->f_iocb_flags; ret = kiocb_set_rw_flags(kiocb, rw->flags, rw_type); if (unlikely(ret)) return ret; @@ -883,7 +882,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type) kiocb->ki_complete = io_complete_rw; } - if (kiocb->ki_flags & IOCB_HAS_METADATA) { + if (req->flags & REQ_F_HAS_METADATA) { struct io_async_rw *io = req->async_data; /* @@ -892,6 +891,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type) */ if (!(req->file->f_flags & O_DIRECT)) return -EOPNOTSUPP; + kiocb->ki_flags |= IOCB_HAS_METADATA; kiocb->private = &io->meta; } -- cgit v1.2.3
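As a minimal usage sketch (not part of the series above; the function name and the particular tag checks are illustrative assumptions), an in-kernel O_DIRECT submitter could attach a user metadata buffer roughly as follows, using the struct uio_meta, IO_INTEGRITY_CHK_* and IOCB_HAS_METADATA pieces introduced in these patches. The blkdev fops then pick the descriptor up from kiocb->private and map it with bio_integrity_map_iter(), as in the fops.c hunks; the io_uring path additionally restricts this to O_DIRECT files.

/*
 * Minimal sketch, assuming the uio_meta / IOCB_HAS_METADATA pieces above.
 * example_attach_meta() and the selected check flags are illustrative only.
 */
#include <linux/fs.h>
#include <linux/uio.h>

static int example_attach_meta(struct kiocb *kiocb, struct uio_meta *meta,
			       void __user *ubuf, size_t len, u64 seed)
{
	/* describe the user integrity buffer (here it supplies PI for a write) */
	iov_iter_ubuf(&meta->iter, ITER_SOURCE, ubuf, len);

	/* ask the hardware to verify guard and reference tags, skip the app tag */
	meta->flags = IO_INTEGRITY_CHK_GUARD | IO_INTEGRITY_CHK_REFTAG;
	meta->app_tag = 0;
	meta->seed = seed;

	/* blkdev direct I/O expects the descriptor via kiocb->private */
	kiocb->private = meta;
	kiocb->ki_flags |= IOCB_HAS_METADATA;
	return 0;
}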