From e71e00127110dedc6a9e746178282b4dac97ed96 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 27 Feb 2026 11:25:36 -0400 Subject: iommupt: Add the RISC-V page table format The RISC-V format is a fairly simple 5-level page table not unlike the x86 one. It has optional support for a single contiguous page size of 64k (16 x 4k). The specification also describes a 32-bit format; the general code can support it via a #define, but the iommu-side implementation has been left out until a user appears. Tested-by: Vincent Chen Acked-by: Paul Walmsley # arch/riscv Reviewed-by: Tomasz Jeznach Tested-by: Tomasz Jeznach Signed-off-by: Jason Gunthorpe Signed-off-by: Joerg Roedel --- include/linux/generic_pt/common.h | 16 ++++++++++++++++ include/linux/generic_pt/iommu.h | 11 +++++++++++ 2 files changed, 27 insertions(+) (limited to 'include') diff --git a/include/linux/generic_pt/common.h b/include/linux/generic_pt/common.h index 6a9a1acb5aad..fc5d0b5edadc 100644 --- a/include/linux/generic_pt/common.h +++ b/include/linux/generic_pt/common.h @@ -175,6 +175,22 @@ enum { PT_FEAT_VTDSS_FORCE_WRITEABLE, }; +struct pt_riscv_32 { + struct pt_common common; +}; + +struct pt_riscv_64 { + struct pt_common common; +}; + +enum { + /* + * Support the 64k contiguous page size following the Svnapot extension. + */ + PT_FEAT_RISCV_SVNAPOT_64K = PT_FEAT_FMT_START, + +}; + struct pt_x86_64 { struct pt_common common; }; diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h index 9eefbb74efd0..49d9addb98c5 100644 --- a/include/linux/generic_pt/iommu.h +++ b/include/linux/generic_pt/iommu.h @@ -275,6 +275,17 @@ struct pt_iommu_vtdss_hw_info { IOMMU_FORMAT(vtdss, vtdss_pt); +struct pt_iommu_riscv_64_cfg { + struct pt_iommu_cfg common; +}; + +struct pt_iommu_riscv_64_hw_info { + u64 ppn; + u8 fsc_iosatp_mode; +}; + +IOMMU_FORMAT(riscv_64, riscv_64pt); + struct pt_iommu_x86_64_cfg { struct pt_iommu_cfg common; /* 4 is a 57 bit 5 level table */ -- cgit v1.2.3 From 99fb8afa16add85ed016baee9735231bca0c32b4 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 27 Feb 2026 15:30:10 -0400 Subject: iommupt: Directly call iommupt's unmap_range() The common algorithm in iommupt does not require the iommu_pgsize() calculations; it can directly unmap any arbitrary range. Add a new function pointer to directly call an iommupt unmap_range op and make __iommu_unmap() call it directly. This gives about a 5% gain on single-page unmappings. The function pointer is run through pt_iommu_ops instead of iommu_domain_ops to discourage using it outside iommupt. All drivers with their own page tables should continue to use the simplified map/unmap_pages() style interfaces.
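As a rough sketch of the difference (sizes and surrounding variables illustrative only; unmap_range is the op added here), unmapping a region mapped as 2M + 4K used to take two unmap_pages() calls, each walking from the table root, while the new op does a single walk:

	struct iommu_iotlb_gather gather;
	size_t unmapped;

	iommu_iotlb_gather_init(&gather);
	/* old: two ops->unmap_pages() calls, one per page-size chunk
	 * new: one call, iommupt tracks its walk position internally
	 */
	unmapped = pt->ops->unmap_range(pt, iova, SZ_2M + SZ_4K, &gather);
	iommu_iotlb_sync(domain, &gather);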
Reviewed-by: Samiullah Khawaja Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Reviewed-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/generic_pt/iommu_pt.h | 29 ++++------------------------- drivers/iommu/iommu.c | 27 +++++++++++++++++++++------ include/linux/generic_pt/iommu.h | 37 +++++++++++++++++++++++++++++++------ include/linux/iommu.h | 1 + 4 files changed, 57 insertions(+), 37 deletions(-) (limited to 'include') diff --git a/drivers/iommu/generic_pt/iommu_pt.h b/drivers/iommu/generic_pt/iommu_pt.h index 9c08bb594e41..a627c26fa62d 100644 --- a/drivers/iommu/generic_pt/iommu_pt.h +++ b/drivers/iommu/generic_pt/iommu_pt.h @@ -1031,34 +1031,12 @@ start_oa: return ret; } -/** - * unmap_pages() - Make a range of IOVA empty/not present - * @domain: Domain to manipulate - * @iova: IO virtual address to start - * @pgsize: Length of each page - * @pgcount: Length of the range in pgsize units starting from @iova - * @iotlb_gather: Gather struct that must be flushed on return - * - * unmap_pages() will remove a translation created by map_pages(). It cannot - * subdivide a mapping created by map_pages(), so it should be called with IOVA - * ranges that match those passed to map_pages(). The IOVA range can aggregate - * contiguous map_pages() calls so long as no individual range is split. - * - * Context: The caller must hold a write range lock that includes - * the whole range. - * - * Returns: Number of bytes of VA unmapped. iova + res will be the point - * unmapping stopped. - */ -size_t DOMAIN_NS(unmap_pages)(struct iommu_domain *domain, unsigned long iova, - size_t pgsize, size_t pgcount, +static size_t NS(unmap_range)(struct pt_iommu *iommu_table, dma_addr_t iova, + dma_addr_t len, struct iommu_iotlb_gather *iotlb_gather) { - struct pt_iommu *iommu_table = - container_of(domain, struct pt_iommu, domain); struct pt_unmap_args unmap = { .free_list = IOMMU_PAGES_LIST_INIT( unmap.free_list) }; - pt_vaddr_t len = pgsize * pgcount; struct pt_range range; int ret; @@ -1073,7 +1051,6 @@ size_t DOMAIN_NS(unmap_pages)(struct iommu_domain *domain, unsigned long iova, return unmap.unmapped; } -EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(unmap_pages), "GENERIC_PT_IOMMU"); static void NS(get_info)(struct pt_iommu *iommu_table, struct pt_iommu_info *info) @@ -1121,6 +1098,7 @@ static void NS(deinit)(struct pt_iommu *iommu_table) } static const struct pt_iommu_ops NS(ops) = { + .unmap_range = NS(unmap_range), #if IS_ENABLED(CONFIG_IOMMUFD_DRIVER) && defined(pt_entry_is_write_dirty) && \ IS_ENABLED(CONFIG_IOMMUFD_TEST) && defined(pt_entry_make_write_dirty) .set_dirty = NS(set_dirty), @@ -1183,6 +1161,7 @@ static int pt_iommu_init_domain(struct pt_iommu *iommu_table, domain->type = __IOMMU_DOMAIN_PAGING; domain->pgsize_bitmap = info.pgsize_bitmap; + domain->is_iommupt = true; if (pt_feature(common, PT_FEAT_DYNAMIC_TOP)) range = _pt_top_range(common, diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 35db51780954..f68269707101 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -34,6 +34,7 @@ #include #include #include +#include #include "dma-iommu.h" #include "iommu-priv.h" @@ -2666,13 +2667,12 @@ int iommu_map(struct iommu_domain *domain, unsigned long iova, } EXPORT_SYMBOL_GPL(iommu_map); -static size_t __iommu_unmap(struct iommu_domain *domain, - unsigned long iova, size_t size, - struct iommu_iotlb_gather *iotlb_gather) +static size_t +__iommu_unmap_domain_pgtbl(struct iommu_domain *domain, unsigned long iova, + size_t size, struct iommu_iotlb_gather *iotlb_gather) { 
const struct iommu_domain_ops *ops = domain->ops; size_t unmapped_page, unmapped = 0; - unsigned long orig_iova = iova; unsigned int min_pagesz; if (unlikely(!(domain->type & __IOMMU_DOMAIN_PAGING))) @@ -2718,8 +2718,23 @@ static size_t __iommu_unmap(struct iommu_domain *domain, unmapped += unmapped_page; } - trace_unmap(orig_iova, size, unmapped); - iommu_debug_unmap_end(domain, orig_iova, size, unmapped); + return unmapped; +} + +static size_t __iommu_unmap(struct iommu_domain *domain, unsigned long iova, + size_t size, + struct iommu_iotlb_gather *iotlb_gather) +{ + struct pt_iommu *pt = iommupt_from_domain(domain); + size_t unmapped; + + if (pt) + unmapped = pt->ops->unmap_range(pt, iova, size, iotlb_gather); + else + unmapped = __iommu_unmap_domain_pgtbl(domain, iova, size, + iotlb_gather); + trace_unmap(iova, size, unmapped); + iommu_debug_unmap_end(domain, iova, size, unmapped); return unmapped; } diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h index 49d9addb98c5..0da971134a37 100644 --- a/include/linux/generic_pt/iommu.h +++ b/include/linux/generic_pt/iommu.h @@ -66,6 +66,13 @@ struct pt_iommu { struct device *iommu_device; }; +static inline struct pt_iommu *iommupt_from_domain(struct iommu_domain *domain) +{ + if (!IS_ENABLED(CONFIG_IOMMU_PT) || !domain->is_iommupt) + return NULL; + return container_of(domain, struct pt_iommu, domain); +} + /** * struct pt_iommu_info - Details about the IOMMU page table * * @@ -80,6 +87,29 @@ struct pt_iommu_info { }; struct pt_iommu_ops { + /** + * @unmap_range: Make a range of IOVA empty/not present + * @iommu_table: Table to manipulate + * @iova: IO virtual address to start + * @len: Length of the range starting from @iova + * @iotlb_gather: Gather struct that must be flushed on return + * + * unmap_range() will remove a translation created by map_range(). It + * cannot subdivide a mapping created by map_range(), so it should be + * called with IOVA ranges that match those passed to map_range(). The + * IOVA range can aggregate contiguous map_range() calls so long as no + * individual range is split. + * + * Context: The caller must hold a write range lock that includes + * the whole range. + * + * Returns: Number of bytes of VA unmapped. iova + res will be the + * point where unmapping stopped.
+ */ + size_t (*unmap_range)(struct pt_iommu *iommu_table, dma_addr_t iova, + dma_addr_t len, + struct iommu_iotlb_gather *iotlb_gather); + /** * @set_dirty: Make the iova write dirty * @iommu_table: Table to manipulate @@ -198,10 +228,6 @@ struct pt_iommu_cfg { unsigned long iova, phys_addr_t paddr, \ size_t pgsize, size_t pgcount, \ int prot, gfp_t gfp, size_t *mapped); \ - size_t pt_iommu_##fmt##_unmap_pages( \ - struct iommu_domain *domain, unsigned long iova, \ - size_t pgsize, size_t pgcount, \ - struct iommu_iotlb_gather *iotlb_gather); \ int pt_iommu_##fmt##_read_and_clear_dirty( \ struct iommu_domain *domain, unsigned long iova, size_t size, \ unsigned long flags, struct iommu_dirty_bitmap *dirty); \ @@ -223,8 +249,7 @@ struct pt_iommu_cfg { */ #define IOMMU_PT_DOMAIN_OPS(fmt) \ .iova_to_phys = &pt_iommu_##fmt##_iova_to_phys, \ - .map_pages = &pt_iommu_##fmt##_map_pages, \ - .unmap_pages = &pt_iommu_##fmt##_unmap_pages + .map_pages = &pt_iommu_##fmt##_map_pages #define IOMMU_PT_DIRTY_OPS(fmt) \ .read_and_clear_dirty = &pt_iommu_##fmt##_read_and_clear_dirty diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 54b8b48c762e..7ca648c01336 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -223,6 +223,7 @@ enum iommu_domain_cookie_type { struct iommu_domain { unsigned type; enum iommu_domain_cookie_type cookie_type; + bool is_iommupt; const struct iommu_domain_ops *ops; const struct iommu_dirty_ops *dirty_ops; const struct iommu_ops *owner; /* Whose domain_alloc we came from */ -- cgit v1.2.3 From d6c65b0fd6218bd21ed0be7a8d3218e8f6dc91de Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 27 Feb 2026 15:30:11 -0400 Subject: iommupt: Avoid rewalking during map Currently the core code provides a simplified interface to drivers where it fragments a requested multi-page map into single-page-size steps after doing all the calculations to figure out what page size is appropriate. Each step rewalks the page tables from the start. Since iommupt has a single implementation of the mapping algorithm it can internally compute each step as it goes while retaining its current position in the walk. Add a new function, pt_pgsz_count(), which computes the same page-size fragment of a large mapping operation. Compute the next fragment when all the leaf entries of the current fragment have been written, then continue walking from the current point. The function pointer is run through pt_iommu_ops instead of iommu_domain_ops to discourage using it outside iommupt. All drivers with their own page tables should continue to use the simplified map_pages() style interfaces.
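As a worked example of what pt_pgsz_count() returns (the numbers match the kunit case added below; illustrative only):

	/*
	 * pgsize bitmap = {4K, 2M}, va = oa = 4K, last_va = 1G - 1: the
	 * walk must start with 4K leaves, and the 4K fragment is clipped
	 * at the first 2M boundary where it can switch to 2M leaves:
	 *
	 *   pt_pgsz_count(SZ_2M | SZ_4K, SZ_4K, SZ_1G - 1, SZ_4K,
	 *                 ilog2(SZ_4K)) == (SZ_2M - SZ_4K) / SZ_4K == 511
	 *
	 * With only 4K available the whole range is a single fragment:
	 *
	 *   pt_pgsz_count(SZ_4K, 0, SZ_1G - 1, 0, ilog2(SZ_4K))
	 *     == SZ_1G / SZ_4K == 262144
	 */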
Reviewed-by: Samiullah Khawaja Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe Reviewed-by: Lu Baolu Signed-off-by: Joerg Roedel --- drivers/iommu/generic_pt/iommu_pt.h | 133 +++++++++++++++++----------- drivers/iommu/generic_pt/kunit_generic_pt.h | 12 +++ drivers/iommu/generic_pt/pt_iter.h | 22 +++++ drivers/iommu/iommu.c | 39 ++++++-- include/linux/generic_pt/iommu.h | 34 +++++-- 5 files changed, 175 insertions(+), 65 deletions(-) (limited to 'include') diff --git a/drivers/iommu/generic_pt/iommu_pt.h b/drivers/iommu/generic_pt/iommu_pt.h index a627c26fa62d..17b72dbd7d51 100644 --- a/drivers/iommu/generic_pt/iommu_pt.h +++ b/drivers/iommu/generic_pt/iommu_pt.h @@ -477,6 +477,7 @@ struct pt_iommu_map_args { pt_oaddr_t oa; unsigned int leaf_pgsize_lg2; unsigned int leaf_level; + pt_vaddr_t num_leaves; }; /* @@ -529,11 +530,15 @@ static int clear_contig(const struct pt_state *start_pts, static int __map_range_leaf(struct pt_range *range, void *arg, unsigned int level, struct pt_table_p *table) { + struct pt_iommu *iommu_table = iommu_from_common(range->common); struct pt_state pts = pt_init(range, level, table); struct pt_iommu_map_args *map = arg; unsigned int leaf_pgsize_lg2 = map->leaf_pgsize_lg2; unsigned int start_index; pt_oaddr_t oa = map->oa; + unsigned int num_leaves; + unsigned int orig_end; + pt_vaddr_t last_va; unsigned int step; bool need_contig; int ret = 0; @@ -547,6 +552,15 @@ static int __map_range_leaf(struct pt_range *range, void *arg, _pt_iter_first(&pts); start_index = pts.index; + orig_end = pts.end_index; + if (pts.index + map->num_leaves < pts.end_index) { + /* Need to stop in the middle of the table to change sizes */ + pts.end_index = pts.index + map->num_leaves; + num_leaves = 0; + } else { + num_leaves = map->num_leaves - (pts.end_index - pts.index); + } + do { pts.type = pt_load_entry_raw(&pts); if (pts.type != PT_ENTRY_EMPTY || need_contig) { @@ -572,7 +586,40 @@ static int __map_range_leaf(struct pt_range *range, void *arg, flush_writes_range(&pts, start_index, pts.index); map->oa = oa; - return ret; + map->num_leaves = num_leaves; + if (ret || num_leaves) + return ret; + + /* range->va is not valid if we reached the end of the table */ + pts.index -= step; + pt_index_to_va(&pts); + pts.index += step; + last_va = range->va + log2_to_int(leaf_pgsize_lg2); + + if (last_va - 1 == range->last_va) { + PT_WARN_ON(pts.index != orig_end); + return 0; + } + + /* + * Reached a point where the page size changed, compute the new + * parameters. 
+ */ + map->leaf_pgsize_lg2 = pt_compute_best_pgsize( + iommu_table->domain.pgsize_bitmap, last_va, range->last_va, oa); + map->leaf_level = + pt_pgsz_lg2_to_level(range->common, map->leaf_pgsize_lg2); + map->num_leaves = pt_pgsz_count(iommu_table->domain.pgsize_bitmap, + last_va, range->last_va, oa, + map->leaf_pgsize_lg2); + + /* Didn't finish this table level, caller will repeat it */ + if (pts.index != orig_end) { + if (pts.index != start_index) + pt_index_to_va(&pts); + return -EAGAIN; + } + return 0; } static int __map_range(struct pt_range *range, void *arg, unsigned int level, @@ -595,14 +642,9 @@ static int __map_range(struct pt_range *range, void *arg, unsigned int level, if (pts.type != PT_ENTRY_EMPTY) return -EADDRINUSE; ret = pt_iommu_new_table(&pts, &map->attrs); - if (ret) { - /* - * Racing with another thread installing a table - */ - if (ret == -EAGAIN) - continue; + /* EAGAIN on a race will loop again */ + if (ret) return ret; - } } else { pts.table_lower = pt_table_ptr(&pts); /* @@ -626,10 +668,12 @@ static int __map_range(struct pt_range *range, void *arg, unsigned int level, * The already present table can possibly be shared with another * concurrent map. */ - if (map->leaf_level == level - 1) - ret = pt_descend(&pts, arg, __map_range_leaf); - else - ret = pt_descend(&pts, arg, __map_range); + do { + if (map->leaf_level == level - 1) + ret = pt_descend(&pts, arg, __map_range_leaf); + else + ret = pt_descend(&pts, arg, __map_range); + } while (ret == -EAGAIN); if (ret) return ret; @@ -637,6 +681,14 @@ static int __map_range(struct pt_range *range, void *arg, unsigned int level, pt_index_to_va(&pts); if (pts.index >= pts.end_index) break; + + /* + * This level is currently running __map_range_leaf() which is + * not correct if the target level has been updated to this + * level. Have the caller invoke __map_range_leaf. + */ + if (map->leaf_level == level) + return -EAGAIN; } while (true); return 0; } @@ -808,12 +860,13 @@ static int check_map_range(struct pt_iommu *iommu_table, struct pt_range *range, static int do_map(struct pt_range *range, struct pt_common *common, bool single_page, struct pt_iommu_map_args *map) { + int ret; + /* * The __map_single_page() fast path does not support DMA_INCOHERENT * flushing to keep its .text small. */ if (single_page && !pt_feature(common, PT_FEAT_DMA_INCOHERENT)) { - int ret; ret = pt_walk_range(range, __map_single_page, map); if (ret != -EAGAIN) @@ -821,50 +874,25 @@ static int do_map(struct pt_range *range, struct pt_common *common, /* EAGAIN falls through to the full path */ } - if (map->leaf_level == range->top_level) - return pt_walk_range(range, __map_range_leaf, map); - return pt_walk_range(range, __map_range, map); + do { + if (map->leaf_level == range->top_level) + ret = pt_walk_range(range, __map_range_leaf, map); + else + ret = pt_walk_range(range, __map_range, map); + } while (ret == -EAGAIN); + return ret; } -/** - * map_pages() - Install translation for an IOVA range - * @domain: Domain to manipulate - * @iova: IO virtual address to start - * @paddr: Physical/Output address to start - * @pgsize: Length of each page - * @pgcount: Length of the range in pgsize units starting from @iova - * @prot: A bitmap of IOMMU_READ/WRITE/CACHE/NOEXEC/MMIO - * @gfp: GFP flags for any memory allocations - * @mapped: Total bytes successfully mapped - * - * The range starting at IOVA will have paddr installed into it. The caller - * must specify a valid pgsize and pgcount to segment the range into compatible - * blocks. 
- * - * On error the caller will probably want to invoke unmap on the range from iova - * up to the amount indicated by @mapped to return the table back to an - * unchanged state. - * - * Context: The caller must hold a write range lock that includes the whole - * range. - * - * Returns: -ERRNO on failure, 0 on success. The number of bytes of VA that were - * mapped are added to @mapped, @mapped is not zerod first. - */ -int DOMAIN_NS(map_pages)(struct iommu_domain *domain, unsigned long iova, - phys_addr_t paddr, size_t pgsize, size_t pgcount, - int prot, gfp_t gfp, size_t *mapped) +static int NS(map_range)(struct pt_iommu *iommu_table, dma_addr_t iova, + phys_addr_t paddr, dma_addr_t len, unsigned int prot, + gfp_t gfp, size_t *mapped) { - struct pt_iommu *iommu_table = - container_of(domain, struct pt_iommu, domain); pt_vaddr_t pgsize_bitmap = iommu_table->domain.pgsize_bitmap; struct pt_common *common = common_from_iommu(iommu_table); struct iommu_iotlb_gather iotlb_gather; - pt_vaddr_t len = pgsize * pgcount; struct pt_iommu_map_args map = { .iotlb_gather = &iotlb_gather, .oa = paddr, - .leaf_pgsize_lg2 = vaffs(pgsize), }; bool single_page = false; struct pt_range range; @@ -892,13 +920,13 @@ int DOMAIN_NS(map_pages)(struct iommu_domain *domain, unsigned long iova, return ret; /* Calculate target page size and level for the leaves */ - if (pt_has_system_page_size(common) && pgsize == PAGE_SIZE && - pgcount == 1) { + if (pt_has_system_page_size(common) && len == PAGE_SIZE) { PT_WARN_ON(!(pgsize_bitmap & PAGE_SIZE)); if (log2_mod(iova | paddr, PAGE_SHIFT)) return -ENXIO; map.leaf_pgsize_lg2 = PAGE_SHIFT; map.leaf_level = 0; + map.num_leaves = 1; single_page = true; } else { map.leaf_pgsize_lg2 = pt_compute_best_pgsize( @@ -907,6 +935,9 @@ int DOMAIN_NS(map_pages)(struct iommu_domain *domain, unsigned long iova, return -ENXIO; map.leaf_level = pt_pgsz_lg2_to_level(common, map.leaf_pgsize_lg2); + map.num_leaves = pt_pgsz_count(pgsize_bitmap, range.va, + range.last_va, paddr, + map.leaf_pgsize_lg2); } ret = check_map_range(iommu_table, &range, &map); @@ -929,7 +960,6 @@ int DOMAIN_NS(map_pages)(struct iommu_domain *domain, unsigned long iova, *mapped += map.oa - paddr; return ret; } -EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(map_pages), "GENERIC_PT_IOMMU"); struct pt_unmap_args { struct iommu_pages_list free_list; @@ -1098,6 +1128,7 @@ static void NS(deinit)(struct pt_iommu *iommu_table) } static const struct pt_iommu_ops NS(ops) = { + .map_range = NS(map_range), .unmap_range = NS(unmap_range), #if IS_ENABLED(CONFIG_IOMMUFD_DRIVER) && defined(pt_entry_is_write_dirty) && \ IS_ENABLED(CONFIG_IOMMUFD_TEST) && defined(pt_entry_make_write_dirty) diff --git a/drivers/iommu/generic_pt/kunit_generic_pt.h b/drivers/iommu/generic_pt/kunit_generic_pt.h index 68278bf15cfe..374e475f591e 100644 --- a/drivers/iommu/generic_pt/kunit_generic_pt.h +++ b/drivers/iommu/generic_pt/kunit_generic_pt.h @@ -312,6 +312,17 @@ static void test_best_pgsize(struct kunit *test) } } +static void test_pgsz_count(struct kunit *test) +{ + KUNIT_EXPECT_EQ(test, + pt_pgsz_count(SZ_4K, 0, SZ_1G - 1, 0, ilog2(SZ_4K)), + SZ_1G / SZ_4K); + KUNIT_EXPECT_EQ(test, + pt_pgsz_count(SZ_2M | SZ_4K, SZ_4K, SZ_1G - 1, SZ_4K, + ilog2(SZ_4K)), + (SZ_2M - SZ_4K) / SZ_4K); +} + /* * Check that pt_install_table() and pt_table_pa() match */ @@ -770,6 +781,7 @@ static struct kunit_case generic_pt_test_cases[] = { KUNIT_CASE_FMT(test_init), KUNIT_CASE_FMT(test_bitops), KUNIT_CASE_FMT(test_best_pgsize), + KUNIT_CASE_FMT(test_pgsz_count), 
KUNIT_CASE_FMT(test_table_ptr), KUNIT_CASE_FMT(test_max_va), KUNIT_CASE_FMT(test_table_radix), diff --git a/drivers/iommu/generic_pt/pt_iter.h b/drivers/iommu/generic_pt/pt_iter.h index c0d8617cce29..3e45dbde6b83 100644 --- a/drivers/iommu/generic_pt/pt_iter.h +++ b/drivers/iommu/generic_pt/pt_iter.h @@ -569,6 +569,28 @@ static inline unsigned int pt_compute_best_pgsize(pt_vaddr_t pgsz_bitmap, return pgsz_lg2; } +/* + * Return the number of pgsize_lg2 leaf entries that can be mapped for + * va to oa. This accounts for any requirement to reduce or increase the page + * size across the VA range. + */ +static inline pt_vaddr_t pt_pgsz_count(pt_vaddr_t pgsz_bitmap, pt_vaddr_t va, + pt_vaddr_t last_va, pt_oaddr_t oa, + unsigned int pgsize_lg2) +{ + pt_vaddr_t len = last_va - va + 1; + pt_vaddr_t next_pgsizes = log2_set_mod(pgsz_bitmap, 0, pgsize_lg2 + 1); + + if (next_pgsizes) { + unsigned int next_pgsize_lg2 = vaffs(next_pgsizes); + + if (log2_mod(va ^ oa, next_pgsize_lg2) == 0) + len = min(len, log2_set_mod_max(va, next_pgsize_lg2) - + va + 1); + } + return log2_div(len, pgsize_lg2); +} + #define _PT_MAKE_CALL_LEVEL(fn) \ static __always_inline int fn(struct pt_range *range, void *arg, \ unsigned int level, \ diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index f68269707101..33cee64686e3 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2569,14 +2569,14 @@ out_set_count: return pgsize; } -int iommu_map_nosync(struct iommu_domain *domain, unsigned long iova, - phys_addr_t paddr, size_t size, int prot, gfp_t gfp) +static int __iommu_map_domain_pgtbl(struct iommu_domain *domain, + unsigned long iova, phys_addr_t paddr, + size_t size, int prot, gfp_t gfp) { const struct iommu_domain_ops *ops = domain->ops; unsigned long orig_iova = iova; unsigned int min_pagesz; size_t orig_size = size; - phys_addr_t orig_paddr = paddr; int ret = 0; might_sleep_if(gfpflags_allow_blocking(gfp)); @@ -2633,12 +2633,9 @@ int iommu_map_nosync(struct iommu_domain *domain, unsigned long iova, /* unroll mapping in case something went wrong */ if (ret) { iommu_unmap(domain, orig_iova, orig_size - size); - } else { - trace_map(orig_iova, orig_paddr, orig_size); - iommu_debug_map(domain, orig_paddr, orig_size); + return ret; } - - return ret; + return 0; } int iommu_sync_map(struct iommu_domain *domain, unsigned long iova, size_t size) @@ -2650,6 +2647,32 @@ int iommu_sync_map(struct iommu_domain *domain, unsigned long iova, size_t size) return ops->iotlb_sync_map(domain, iova, size); } +int iommu_map_nosync(struct iommu_domain *domain, unsigned long iova, + phys_addr_t paddr, size_t size, int prot, gfp_t gfp) +{ + struct pt_iommu *pt = iommupt_from_domain(domain); + int ret; + + if (pt) { + size_t mapped = 0; + + ret = pt->ops->map_range(pt, iova, paddr, size, prot, gfp, + &mapped); + if (ret) { + iommu_unmap(domain, iova, mapped); + return ret; + } + return 0; + } + ret = __iommu_map_domain_pgtbl(domain, iova, paddr, size, prot, gfp); + if (ret) + return ret; + + trace_map(iova, paddr, size); + iommu_debug_map(domain, paddr, size); + return 0; +} + int iommu_map(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t size, int prot, gfp_t gfp) { diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h index 0da971134a37..dd0edd02a48a 100644 --- a/include/linux/generic_pt/iommu.h +++ b/include/linux/generic_pt/iommu.h @@ -87,6 +87,33 @@ struct pt_iommu_info { }; struct pt_iommu_ops { + /** + * @map_range: Install translation for an IOVA range + *
@iommu_table: Table to manipulate + * @iova: IO virtual address to start + * @paddr: Physical/Output address to start + * @len: Length of the range starting from @iova + * @prot: A bitmap of IOMMU_READ/WRITE/CACHE/NOEXEC/MMIO + * @gfp: GFP flags for any memory allocations + * @mapped: Total bytes successfully mapped + * + * The range starting at IOVA will have paddr installed into it. The + * range is automatically segmented into optimally sized table entries, + * and can have any valid alignment. + * + * On error the caller will probably want to invoke unmap on the range + * from iova up to the amount indicated by @mapped to return the table + * back to an unchanged state. + * + * Context: The caller must hold a write range lock that includes + * the whole range. + * + * Returns: -ERRNO on failure, 0 on success. The number of bytes of VA + * that were mapped are added to @mapped; @mapped is not zeroed first. + */ + int (*map_range)(struct pt_iommu *iommu_table, dma_addr_t iova, + phys_addr_t paddr, dma_addr_t len, unsigned int prot, + gfp_t gfp, size_t *mapped); + /** * @unmap_range: Make a range of IOVA empty/not present * @iommu_table: Table to manipulate @@ -224,10 +251,6 @@ struct pt_iommu_cfg { #define IOMMU_PROTOTYPES(fmt) \ phys_addr_t pt_iommu_##fmt##_iova_to_phys(struct iommu_domain *domain, \ dma_addr_t iova); \ - int pt_iommu_##fmt##_map_pages(struct iommu_domain *domain, \ - unsigned long iova, phys_addr_t paddr, \ - size_t pgsize, size_t pgcount, \ - int prot, gfp_t gfp, size_t *mapped); \ int pt_iommu_##fmt##_read_and_clear_dirty( \ struct iommu_domain *domain, unsigned long iova, size_t size, \ unsigned long flags, struct iommu_dirty_bitmap *dirty); \ @@ -248,8 +271,7 @@ struct pt_iommu_cfg { * iommu_pt */ #define IOMMU_PT_DOMAIN_OPS(fmt) \ - .iova_to_phys = &pt_iommu_##fmt##_iova_to_phys, \ - .map_pages = &pt_iommu_##fmt##_map_pages + .iova_to_phys = &pt_iommu_##fmt##_iova_to_phys #define IOMMU_PT_DIRTY_OPS(fmt) \ .read_and_clear_dirty = &pt_iommu_##fmt##_read_and_clear_dirty -- cgit v1.2.3 From a82efb8747d1b8a7c0a377dc79c2aac204eae788 Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Tue, 17 Mar 2026 11:16:02 +0000 Subject: iommu: Add device ATS supported capability PCIe ATS may be disabled by platform firmware, root complex limitations, or kernel policy even when a device advertises the ATS capability in its PCI configuration space. Add a new IOMMU_CAP_PCI_ATS_SUPPORTED capability to allow IOMMU drivers to report the effective ATS decision for a device. When this capability is true for a device, ATS may be enabled for that device, but it does not imply that ATS is currently enabled. A subsequent patch will extend iommufd to expose the effective ATS status to userspace.
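In-kernel, the query goes through the existing capability API; a minimal sketch (the wrapper function is hypothetical, device_iommu_capable() is the existing interface):

	#include <linux/iommu.h>

	/* True means ATS may be enabled for dev, not that it currently is */
	static bool example_ats_may_be_enabled(struct device *dev)
	{
		return device_iommu_capable(dev, IOMMU_CAP_PCI_ATS_SUPPORTED);
	}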
Suggested-by: Jason Gunthorpe Reviewed-by: Jason Gunthorpe Signed-off-by: Shameer Kolothum Signed-off-by: Joerg Roedel --- drivers/iommu/amd/iommu.c | 6 ++++++ drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 3 +++ drivers/iommu/intel/iommu.c | 2 ++ include/linux/iommu.h | 2 ++ 4 files changed, 13 insertions(+) (limited to 'include') diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 81c4d7733872..f1814fee5182 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -2985,6 +2985,12 @@ static bool amd_iommu_capable(struct device *dev, enum iommu_cap cap) return amd_iommu_hd_support(iommu); } + case IOMMU_CAP_PCI_ATS_SUPPORTED: { + struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); + + return amd_iommu_iotlb_sup && + (dev_data->flags & AMD_IOMMU_DEVICE_FLAG_ATS_SUP); + } default: break; } diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 4d00d796f078..dec5cac98f7c 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -107,6 +107,7 @@ static const char * const event_class_str[] = { }; static int arm_smmu_alloc_cd_tables(struct arm_smmu_master *master); +static bool arm_smmu_ats_supported(struct arm_smmu_master *master); static void parse_driver_options(struct arm_smmu_device *smmu) { @@ -2494,6 +2495,8 @@ static bool arm_smmu_capable(struct device *dev, enum iommu_cap cap) return true; case IOMMU_CAP_DIRTY_TRACKING: return arm_smmu_dbm_capable(master->smmu); + case IOMMU_CAP_PCI_ATS_SUPPORTED: + return arm_smmu_ats_supported(master); default: return false; } diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index ef7613b177b9..5dca8e525c73 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -3220,6 +3220,8 @@ static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap) return ecap_sc_support(info->iommu->ecap); case IOMMU_CAP_DIRTY_TRACKING: return ssads_supported(info->iommu); + case IOMMU_CAP_PCI_ATS_SUPPORTED: + return info->ats_supported; default: return false; } diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 7ca648c01336..a904821ed169 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -272,6 +272,8 @@ enum iommu_cap { */ IOMMU_CAP_DEFERRED_FLUSH, IOMMU_CAP_DIRTY_TRACKING, /* IOMMU supports dirty tracking */ + /* ATS is supported and may be enabled for this device */ + IOMMU_CAP_PCI_ATS_SUPPORTED, }; /* These are the possible reserved region types */ -- cgit v1.2.3 From a11661a58c06f7fdfef03a368ef20d05a4ea4ed0 Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Tue, 17 Mar 2026 11:16:03 +0000 Subject: iommufd: Report ATS not supported status via IOMMU_GET_HW_INFO If the IOMMU driver reports that ATS is not supported for a device, set the IOMMU_HW_CAP_PCI_ATS_NOT_SUPPORTED flag in the returned hardware capabilities. This uses a negative flag for UAPI compatibility. Existing userspace assumes ATS is supported if no flag is present. This also ensures that new userspace works correctly on both old and new kernels, where a zero value implies ATS support. When this flag is set, ATS cannot be used for the device. When it is clear, ATS may be enabled when an appropriate HWPT is attached. 
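The inverted polarity keeps a zero flags word meaning "ATS may work", so the same userspace check is correct on both old and new kernels. A hedged sketch (helper name hypothetical; assumes an iommufd fd and device ID set up elsewhere):

	#include <stdbool.h>
	#include <sys/ioctl.h>
	#include <linux/iommufd.h>

	static bool example_ats_usable(int iommufd, __u32 dev_id)
	{
		struct iommu_hw_info info = {
			.size = sizeof(info),
			.dev_id = dev_id,
		};

		if (ioctl(iommufd, IOMMU_GET_HW_INFO, &info))
			return false;
		/* Old kernels never set this bit, so zero still reads as
		 * "ATS may be enabled" - the compatible default.
		 */
		return !(info.out_capabilities &
			 IOMMU_HW_CAP_PCI_ATS_NOT_SUPPORTED);
	}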
Reviewed-by: Samiullah Khawaja Reviewed-by: Jason Gunthorpe Signed-off-by: Shameer Kolothum Signed-off-by: Joerg Roedel --- drivers/iommu/iommufd/device.c | 4 ++++ include/uapi/linux/iommufd.h | 4 ++++ 2 files changed, 8 insertions(+) (limited to 'include') diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index 344d620cdecc..92c5d5ef8d00 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -1624,6 +1624,10 @@ int iommufd_get_hw_info(struct iommufd_ucmd *ucmd) if (device_iommu_capable(idev->dev, IOMMU_CAP_DIRTY_TRACKING)) cmd->out_capabilities |= IOMMU_HW_CAP_DIRTY_TRACKING; + /* Report when ATS cannot be used for this device */ + if (!device_iommu_capable(idev->dev, IOMMU_CAP_PCI_ATS_SUPPORTED)) + cmd->out_capabilities |= IOMMU_HW_CAP_PCI_ATS_NOT_SUPPORTED; + cmd->out_max_pasid_log2 = 0; /* * Currently, all iommu drivers enable PASID in the probe_device() diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 1dafbc552d37..507ee9bcba01 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -695,11 +695,15 @@ enum iommu_hw_info_type { * @IOMMU_HW_CAP_PCI_PASID_PRIV: Privileged Mode Supported, user ignores it * when the struct * iommu_hw_info::out_max_pasid_log2 is zero. + * @IOMMU_HW_CAP_PCI_ATS_NOT_SUPPORTED: ATS is not supported or cannot be used + * on this device (absence implies ATS + * may be enabled) */ enum iommufd_hw_capabilities { IOMMU_HW_CAP_DIRTY_TRACKING = 1 << 0, IOMMU_HW_CAP_PCI_PASID_EXEC = 1 << 1, IOMMU_HW_CAP_PCI_PASID_PRIV = 1 << 2, + IOMMU_HW_CAP_PCI_ATS_NOT_SUPPORTED = 1 << 3, }; /** -- cgit v1.2.3 From 9dcef98dbee35b8ae784df04c041efffdd42a69c Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Thu, 12 Mar 2026 17:36:35 -0700 Subject: iommu/tegra241-cmdqv: Update uAPI to clarify HYP_OWN requirement From a hardware implementation perspective, the guest tegra241-cmdqv hardware is different from the host hardware: - Host HW is backed by a VINTF (HYP_OWN=1) - Guest HW is backed by a VINTF (HYP_OWN=0) The kernel driver has an implementation requirement on the HYP_OWN bit in the VM, so the VMM must follow it to allow the same copy of Linux to work. Add this requirement, which is currently missing, to the uAPI. Fixes: 4dc0d12474f9 ("iommu/tegra241-cmdqv: Add user-space use support") Signed-off-by: Nicolin Chen Reviewed-by: Eric Auger Reviewed-by: Jason Gunthorpe Signed-off-by: Will Deacon --- include/uapi/linux/iommufd.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 1dafbc552d37..f63edbe71d54 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -1052,6 +1052,11 @@ struct iommu_fault_alloc { enum iommu_viommu_type { IOMMU_VIOMMU_TYPE_DEFAULT = 0, IOMMU_VIOMMU_TYPE_ARM_SMMUV3 = 1, + /* + * TEGRA241_CMDQV requirements (otherwise, VCMDQs will not work) + * - Kernel will allocate a VINTF (HYP_OWN=0) to back this VIOMMU. So, + * VMM must wire the HYP_OWN bit to 0 in guest VINTF_CONFIG register + */ IOMMU_VIOMMU_TYPE_TEGRA241_CMDQV = 2, }; -- cgit v1.2.3 From 90c5def10bea574b101b7a520c015ca81742183f Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 2 Mar 2026 18:22:52 -0400 Subject: iommu: Do not call drivers for empty gathers An empty gather is coded with start=U64_MAX, end=0 and several drivers go on to convert that to a size with: end - start + 1 which gives 2 for an empty gather.
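A minimal demonstration of the wraparound (illustrative only, not driver code):

	u64 start = U64_MAX;	/* as set by iommu_iotlb_gather_init() */
	u64 end = 0;
	u64 size = end - start + 1;	/* wraps around to 2 */

so a driver that trusts an empty gather computes a bogus 2-byte flush at IOVA U64_MAX.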
This then causes Weird Stuff to happen (for example, an UBSAN splat in VT-d) that is hopefully harmless, but maybe not. Prevent drivers from being called for empty gathers by filtering them out right in iommu_iotlb_sync(). Auditing shows that the AMD, Intel, Mediatek and RISC-V drivers all do things on these empty gathers. Further, there are several callers that can trigger empty gathers, especially in unusual conditions. For example, iommu_map_nosync() will call a 0-size unmap on some error paths, as will VFIO, iommupt and other places. Cc: stable@vger.kernel.org Reported-by: Janusz Krzysztofik Closes: https://lore.kernel.org/r/11145826.aFP6jjVeTY@jkrzyszt-mobl2.ger.corp.intel.com Signed-off-by: Jason Gunthorpe Reviewed-by: Lu Baolu Reviewed-by: Samiullah Khawaja Reviewed-by: Robin Murphy Reviewed-by: Vasant Hegde Signed-off-by: Joerg Roedel --- include/linux/iommu.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 54b8b48c762e..555597b54083 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -980,7 +980,8 @@ static inline void iommu_flush_iotlb_all(struct iommu_domain *domain) static inline void iommu_iotlb_sync(struct iommu_domain *domain, struct iommu_iotlb_gather *iotlb_gather) { - if (domain->ops->iotlb_sync) + if (domain->ops->iotlb_sync && + likely(iotlb_gather->start < iotlb_gather->end)) domain->ops->iotlb_sync(domain, iotlb_gather); iommu_iotlb_gather_init(iotlb_gather); -- cgit v1.2.3