diff options
36 files changed, 2722 insertions, 218 deletions
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index 7f04491ca5f0..ee9e2a2edbf5 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -7,6 +7,10 @@ config IOMMU_IOVA config IOMMU_API bool +config IOMMUFD_DRIVER + bool + default n + menuconfig IOMMU_SUPPORT bool "IOMMU Hardware Support" depends on MMU diff --git a/drivers/iommu/amd/Kconfig b/drivers/iommu/amd/Kconfig index 9b5fc3356bf2..8bd4c3b183ec 100644 --- a/drivers/iommu/amd/Kconfig +++ b/drivers/iommu/amd/Kconfig @@ -10,6 +10,7 @@ config AMD_IOMMU select IOMMU_API select IOMMU_IOVA select IOMMU_IO_PGTABLE + select IOMMUFD_DRIVER if IOMMUFD depends on X86_64 && PCI && ACPI && HAVE_CMPXCHG_DOUBLE help With this option you can enable support for AMD IOMMU hardware in diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index 7dc30c2b56b3..dec4e5c2b66b 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -97,7 +97,9 @@ #define FEATURE_GATS_MASK (3ULL) #define FEATURE_GAM_VAPIC BIT_ULL(21) #define FEATURE_GIOSUP BIT_ULL(48) +#define FEATURE_HASUP BIT_ULL(49) #define FEATURE_EPHSUP BIT_ULL(50) +#define FEATURE_HDSUP BIT_ULL(52) #define FEATURE_SNP BIT_ULL(63) #define FEATURE_PASID_SHIFT 32 @@ -212,6 +214,7 @@ /* macros and definitions for device table entries */ #define DEV_ENTRY_VALID 0x00 #define DEV_ENTRY_TRANSLATION 0x01 +#define DEV_ENTRY_HAD 0x07 #define DEV_ENTRY_PPR 0x34 #define DEV_ENTRY_IR 0x3d #define DEV_ENTRY_IW 0x3e @@ -371,9 +374,15 @@ (1ULL << (12 + (9 * (level)))) /* + * The IOPTE dirty bit + */ +#define IOMMU_PTE_HD_BIT (6) + +/* * Bit value definition for I/O PTE fields */ #define IOMMU_PTE_PR BIT_ULL(0) +#define IOMMU_PTE_HD BIT_ULL(IOMMU_PTE_HD_BIT) #define IOMMU_PTE_U BIT_ULL(59) #define IOMMU_PTE_FC BIT_ULL(60) #define IOMMU_PTE_IR BIT_ULL(61) @@ -384,6 +393,7 @@ */ #define DTE_FLAG_V BIT_ULL(0) #define DTE_FLAG_TV BIT_ULL(1) +#define DTE_FLAG_HAD (3ULL << 7) #define DTE_FLAG_GIOV BIT_ULL(54) #define DTE_FLAG_GV BIT_ULL(55) #define DTE_GLX_SHIFT (56) @@ -413,6 +423,7 @@ #define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL) #define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_PR) +#define IOMMU_PTE_DIRTY(pte) ((pte) & IOMMU_PTE_HD) #define IOMMU_PTE_PAGE(pte) (iommu_phys_to_virt((pte) & IOMMU_PAGE_MASK)) #define IOMMU_PTE_MODE(pte) (((pte) >> 9) & 0x07) @@ -563,6 +574,7 @@ struct protection_domain { int nid; /* Node ID */ u64 *gcr3_tbl; /* Guest CR3 table */ unsigned long flags; /* flags to find out type of domain */ + bool dirty_tracking; /* dirty tracking is enabled in the domain */ unsigned dev_cnt; /* devices assigned to this domain */ unsigned dev_iommu[MAX_IOMMUS]; /* per-IOMMU reference count */ }; diff --git a/drivers/iommu/amd/io_pgtable.c b/drivers/iommu/amd/io_pgtable.c index 2892aa1b4dc1..6c0621f6f572 100644 --- a/drivers/iommu/amd/io_pgtable.c +++ b/drivers/iommu/amd/io_pgtable.c @@ -486,6 +486,73 @@ static phys_addr_t iommu_v1_iova_to_phys(struct io_pgtable_ops *ops, unsigned lo return (__pte & ~offset_mask) | (iova & offset_mask); } +static bool pte_test_and_clear_dirty(u64 *ptep, unsigned long size, + unsigned long flags) +{ + bool test_only = flags & IOMMU_DIRTY_NO_CLEAR; + bool dirty = false; + int i, count; + + /* + * 2.2.3.2 Host Dirty Support + * When a non-default page size is used , software must OR the + * Dirty bits in all of the replicated host PTEs used to map + * the page. The IOMMU does not guarantee the Dirty bits are + * set in all of the replicated PTEs. Any portion of the page + * may have been written even if the Dirty bit is set in only + * one of the replicated PTEs. + */ + count = PAGE_SIZE_PTE_COUNT(size); + for (i = 0; i < count && test_only; i++) { + if (test_bit(IOMMU_PTE_HD_BIT, (unsigned long *)&ptep[i])) { + dirty = true; + break; + } + } + + for (i = 0; i < count && !test_only; i++) { + if (test_and_clear_bit(IOMMU_PTE_HD_BIT, + (unsigned long *)&ptep[i])) { + dirty = true; + } + } + + return dirty; +} + +static int iommu_v1_read_and_clear_dirty(struct io_pgtable_ops *ops, + unsigned long iova, size_t size, + unsigned long flags, + struct iommu_dirty_bitmap *dirty) +{ + struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops); + unsigned long end = iova + size - 1; + + do { + unsigned long pgsize = 0; + u64 *ptep, pte; + + ptep = fetch_pte(pgtable, iova, &pgsize); + if (ptep) + pte = READ_ONCE(*ptep); + if (!ptep || !IOMMU_PTE_PRESENT(pte)) { + pgsize = pgsize ?: PTE_LEVEL_PAGE_SIZE(0); + iova += pgsize; + continue; + } + + /* + * Mark the whole IOVA range as dirty even if only one of + * the replicated PTEs were marked dirty. + */ + if (pte_test_and_clear_dirty(ptep, pgsize, flags)) + iommu_dirty_bitmap_record(dirty, iova, pgsize); + iova += pgsize; + } while (iova < end); + + return 0; +} + /* * ---------------------------------------------------- */ @@ -527,6 +594,7 @@ static struct io_pgtable *v1_alloc_pgtable(struct io_pgtable_cfg *cfg, void *coo pgtable->iop.ops.map_pages = iommu_v1_map_pages; pgtable->iop.ops.unmap_pages = iommu_v1_unmap_pages; pgtable->iop.ops.iova_to_phys = iommu_v1_iova_to_phys; + pgtable->iop.ops.read_and_clear_dirty = iommu_v1_read_and_clear_dirty; return &pgtable->iop; } diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 95bd7c25ba6f..b399c5741378 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -37,6 +37,7 @@ #include <asm/iommu.h> #include <asm/gart.h> #include <asm/dma.h> +#include <uapi/linux/iommufd.h> #include "amd_iommu.h" #include "../dma-iommu.h" @@ -65,6 +66,7 @@ LIST_HEAD(hpet_map); LIST_HEAD(acpihid_map); const struct iommu_ops amd_iommu_ops; +const struct iommu_dirty_ops amd_dirty_ops; static ATOMIC_NOTIFIER_HEAD(ppr_notifier); int amd_iommu_max_glx_val = -1; @@ -1610,6 +1612,9 @@ static void set_dte_entry(struct amd_iommu *iommu, u16 devid, pte_root |= 1ULL << DEV_ENTRY_PPR; } + if (domain->dirty_tracking) + pte_root |= DTE_FLAG_HAD; + if (domain->flags & PD_IOMMUV2_MASK) { u64 gcr3 = iommu_virt_to_phys(domain->gcr3_tbl); u64 glx = domain->glx; @@ -2155,28 +2160,79 @@ static inline u64 dma_max_address(void) return ((1ULL << PM_LEVEL_SHIFT(amd_iommu_gpt_level)) - 1); } -static struct iommu_domain *amd_iommu_domain_alloc(unsigned type) +static bool amd_iommu_hd_support(struct amd_iommu *iommu) { + return iommu && (iommu->features & FEATURE_HDSUP); +} + +static struct iommu_domain *do_iommu_domain_alloc(unsigned int type, + struct device *dev, u32 flags) +{ + bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; struct protection_domain *domain; + struct amd_iommu *iommu = NULL; + + if (dev) { + iommu = rlookup_amd_iommu(dev); + if (!iommu) + return ERR_PTR(-ENODEV); + } /* * Since DTE[Mode]=0 is prohibited on SNP-enabled system, * default to use IOMMU_DOMAIN_DMA[_FQ]. */ if (amd_iommu_snp_en && (type == IOMMU_DOMAIN_IDENTITY)) - return NULL; + return ERR_PTR(-EINVAL); + + if (dirty_tracking && !amd_iommu_hd_support(iommu)) + return ERR_PTR(-EOPNOTSUPP); domain = protection_domain_alloc(type); if (!domain) - return NULL; + return ERR_PTR(-ENOMEM); domain->domain.geometry.aperture_start = 0; domain->domain.geometry.aperture_end = dma_max_address(); domain->domain.geometry.force_aperture = true; + if (iommu) { + domain->domain.type = type; + domain->domain.pgsize_bitmap = iommu->iommu.ops->pgsize_bitmap; + domain->domain.ops = iommu->iommu.ops->default_domain_ops; + + if (dirty_tracking) + domain->domain.dirty_ops = &amd_dirty_ops; + } + return &domain->domain; } +static struct iommu_domain *amd_iommu_domain_alloc(unsigned int type) +{ + struct iommu_domain *domain; + + domain = do_iommu_domain_alloc(type, NULL, 0); + if (IS_ERR(domain)) + return NULL; + + return domain; +} + +static struct iommu_domain * +amd_iommu_domain_alloc_user(struct device *dev, u32 flags, + struct iommu_domain *parent, + const struct iommu_user_data *user_data) + +{ + unsigned int type = IOMMU_DOMAIN_UNMANAGED; + + if ((flags & ~IOMMU_HWPT_ALLOC_DIRTY_TRACKING) || parent || user_data) + return ERR_PTR(-EOPNOTSUPP); + + return do_iommu_domain_alloc(type, dev, flags); +} + static void amd_iommu_domain_free(struct iommu_domain *dom) { struct protection_domain *domain; @@ -2214,6 +2270,13 @@ static int amd_iommu_attach_device(struct iommu_domain *dom, dev_data->defer_attach = false; + /* + * Restrict to devices with compatible IOMMU hardware support + * when enforcement of dirty tracking is enabled. + */ + if (dom->dirty_ops && !amd_iommu_hd_support(iommu)) + return -EINVAL; + if (dev_data->domain) detach_device(dev); @@ -2332,6 +2395,11 @@ static bool amd_iommu_capable(struct device *dev, enum iommu_cap cap) return true; case IOMMU_CAP_DEFERRED_FLUSH: return true; + case IOMMU_CAP_DIRTY_TRACKING: { + struct amd_iommu *iommu = rlookup_amd_iommu(dev); + + return amd_iommu_hd_support(iommu); + } default: break; } @@ -2339,6 +2407,73 @@ static bool amd_iommu_capable(struct device *dev, enum iommu_cap cap) return false; } +static int amd_iommu_set_dirty_tracking(struct iommu_domain *domain, + bool enable) +{ + struct protection_domain *pdomain = to_pdomain(domain); + struct dev_table_entry *dev_table; + struct iommu_dev_data *dev_data; + bool domain_flush = false; + struct amd_iommu *iommu; + unsigned long flags; + u64 pte_root; + + spin_lock_irqsave(&pdomain->lock, flags); + if (!(pdomain->dirty_tracking ^ enable)) { + spin_unlock_irqrestore(&pdomain->lock, flags); + return 0; + } + + list_for_each_entry(dev_data, &pdomain->dev_list, list) { + iommu = rlookup_amd_iommu(dev_data->dev); + if (!iommu) + continue; + + dev_table = get_dev_table(iommu); + pte_root = dev_table[dev_data->devid].data[0]; + + pte_root = (enable ? pte_root | DTE_FLAG_HAD : + pte_root & ~DTE_FLAG_HAD); + + /* Flush device DTE */ + dev_table[dev_data->devid].data[0] = pte_root; + device_flush_dte(dev_data); + domain_flush = true; + } + + /* Flush IOTLB to mark IOPTE dirty on the next translation(s) */ + if (domain_flush) { + amd_iommu_domain_flush_tlb_pde(pdomain); + amd_iommu_domain_flush_complete(pdomain); + } + pdomain->dirty_tracking = enable; + spin_unlock_irqrestore(&pdomain->lock, flags); + + return 0; +} + +static int amd_iommu_read_and_clear_dirty(struct iommu_domain *domain, + unsigned long iova, size_t size, + unsigned long flags, + struct iommu_dirty_bitmap *dirty) +{ + struct protection_domain *pdomain = to_pdomain(domain); + struct io_pgtable_ops *ops = &pdomain->iop.iop.ops; + unsigned long lflags; + + if (!ops || !ops->read_and_clear_dirty) + return -EOPNOTSUPP; + + spin_lock_irqsave(&pdomain->lock, lflags); + if (!pdomain->dirty_tracking && dirty->bitmap) { + spin_unlock_irqrestore(&pdomain->lock, lflags); + return -EINVAL; + } + spin_unlock_irqrestore(&pdomain->lock, lflags); + + return ops->read_and_clear_dirty(ops, iova, size, flags, dirty); +} + static void amd_iommu_get_resv_regions(struct device *dev, struct list_head *head) { @@ -2461,9 +2596,15 @@ static bool amd_iommu_enforce_cache_coherency(struct iommu_domain *domain) return true; } +const struct iommu_dirty_ops amd_dirty_ops = { + .set_dirty_tracking = amd_iommu_set_dirty_tracking, + .read_and_clear_dirty = amd_iommu_read_and_clear_dirty, +}; + const struct iommu_ops amd_iommu_ops = { .capable = amd_iommu_capable, .domain_alloc = amd_iommu_domain_alloc, + .domain_alloc_user = amd_iommu_domain_alloc_user, .probe_device = amd_iommu_probe_device, .release_device = amd_iommu_release_device, .probe_finalize = amd_iommu_probe_finalize, diff --git a/drivers/iommu/intel/Kconfig b/drivers/iommu/intel/Kconfig index 119d2c57a48e..012cd2541a68 100644 --- a/drivers/iommu/intel/Kconfig +++ b/drivers/iommu/intel/Kconfig @@ -15,6 +15,7 @@ config INTEL_IOMMU select DMA_OPS select IOMMU_API select IOMMU_IOVA + select IOMMUFD_DRIVER if IOMMUFD select NEED_DMA_MAP_STATE select DMAR_TABLE select SWIOTLB diff --git a/drivers/iommu/intel/Makefile b/drivers/iommu/intel/Makefile index 7af3b8a4f2a0..5dabf081a779 100644 --- a/drivers/iommu/intel/Makefile +++ b/drivers/iommu/intel/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_DMAR_TABLE) += dmar.o -obj-$(CONFIG_INTEL_IOMMU) += iommu.o pasid.o +obj-$(CONFIG_INTEL_IOMMU) += iommu.o pasid.o nested.o obj-$(CONFIG_DMAR_TABLE) += trace.o cap_audit.o obj-$(CONFIG_DMAR_PERF) += perf.o obj-$(CONFIG_INTEL_IOMMU_DEBUGFS) += debugfs.o diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 3685ba90ec88..d1037280abf7 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -282,7 +282,6 @@ static LIST_HEAD(dmar_satc_units); #define for_each_rmrr_units(rmrr) \ list_for_each_entry(rmrr, &dmar_rmrr_units, list) -static void device_block_translation(struct device *dev); static void intel_iommu_domain_free(struct iommu_domain *domain); int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON); @@ -300,6 +299,7 @@ static int iommu_skip_te_disable; #define IDENTMAP_AZALIA 4 const struct iommu_ops intel_iommu_ops; +const struct iommu_dirty_ops intel_dirty_ops; static bool translation_pre_enabled(struct intel_iommu *iommu) { @@ -560,7 +560,7 @@ static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain) } /* Some capabilities may be different across iommus */ -static void domain_update_iommu_cap(struct dmar_domain *domain) +void domain_update_iommu_cap(struct dmar_domain *domain) { domain_update_iommu_coherency(domain); domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL); @@ -1778,8 +1778,7 @@ static struct dmar_domain *alloc_domain(unsigned int type) return domain; } -static int domain_attach_iommu(struct dmar_domain *domain, - struct intel_iommu *iommu) +int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) { struct iommu_domain_info *info, *curr; unsigned long ndomains; @@ -1828,8 +1827,7 @@ err_unlock: return ret; } -static void domain_detach_iommu(struct dmar_domain *domain, - struct intel_iommu *iommu) +void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) { struct iommu_domain_info *info; @@ -2196,6 +2194,11 @@ __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0) return -EINVAL; + if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) { + pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n"); + return -EINVAL; + } + attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP); attr |= DMA_FL_PTE_PRESENT; if (domain->use_first_level) { @@ -3958,7 +3961,7 @@ static void dmar_remove_one_dev_info(struct device *dev) * all DMA requests without PASID from the device are blocked. If the page * table has been set, clean up the data structures. */ -static void device_block_translation(struct device *dev) +void device_block_translation(struct device *dev) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; @@ -4058,14 +4061,62 @@ static struct iommu_domain *intel_iommu_domain_alloc(unsigned type) return NULL; } +static struct iommu_domain * +intel_iommu_domain_alloc_user(struct device *dev, u32 flags, + struct iommu_domain *parent, + const struct iommu_user_data *user_data) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; + bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT; + struct intel_iommu *iommu = info->iommu; + struct iommu_domain *domain; + + /* Must be NESTING domain */ + if (parent) { + if (!nested_supported(iommu) || flags) + return ERR_PTR(-EOPNOTSUPP); + return intel_nested_domain_alloc(parent, user_data); + } + + if (flags & + (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING))) + return ERR_PTR(-EOPNOTSUPP); + if (nested_parent && !nested_supported(iommu)) + return ERR_PTR(-EOPNOTSUPP); + if (user_data || (dirty_tracking && !ssads_supported(iommu))) + return ERR_PTR(-EOPNOTSUPP); + + /* + * domain_alloc_user op needs to fully initialize a domain before + * return, so uses iommu_domain_alloc() here for simple. + */ + domain = iommu_domain_alloc(dev->bus); + if (!domain) + return ERR_PTR(-ENOMEM); + + if (nested_parent) + to_dmar_domain(domain)->nested_parent = true; + + if (dirty_tracking) { + if (to_dmar_domain(domain)->use_first_level) { + iommu_domain_free(domain); + return ERR_PTR(-EOPNOTSUPP); + } + domain->dirty_ops = &intel_dirty_ops; + } + + return domain; +} + static void intel_iommu_domain_free(struct iommu_domain *domain) { if (domain != &si_domain->domain && domain != &blocking_domain) domain_exit(to_dmar_domain(domain)); } -static int prepare_domain_attach_device(struct iommu_domain *domain, - struct device *dev) +int prepare_domain_attach_device(struct iommu_domain *domain, + struct device *dev) { struct dmar_domain *dmar_domain = to_dmar_domain(domain); struct intel_iommu *iommu; @@ -4078,6 +4129,9 @@ static int prepare_domain_attach_device(struct iommu_domain *domain, if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap)) return -EINVAL; + if (domain->dirty_ops && !ssads_supported(iommu)) + return -EINVAL; + /* check if this iommu agaw is sufficient for max mapped address */ addr_width = agaw_to_width(iommu->agaw); if (addr_width > cap_mgaw(iommu->cap)) @@ -4332,6 +4386,8 @@ static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap) return dmar_platform_optin(); case IOMMU_CAP_ENFORCE_CACHE_COHERENCY: return ecap_sc_support(info->iommu->ecap); + case IOMMU_CAP_DIRTY_TRACKING: + return ssads_supported(info->iommu); default: return false; } @@ -4729,6 +4785,9 @@ static int intel_iommu_set_dev_pasid(struct iommu_domain *domain, if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev)) return -EOPNOTSUPP; + if (domain->dirty_ops) + return -EINVAL; + if (context_copied(iommu, info->bus, info->devfn)) return -EBUSY; @@ -4780,6 +4839,7 @@ static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type) if (!vtd) return ERR_PTR(-ENOMEM); + vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17; vtd->cap_reg = iommu->cap; vtd->ecap_reg = iommu->ecap; *length = sizeof(*vtd); @@ -4787,10 +4847,88 @@ static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type) return vtd; } +static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain, + bool enable) +{ + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + struct device_domain_info *info; + int ret; + + spin_lock(&dmar_domain->lock); + if (dmar_domain->dirty_tracking == enable) + goto out_unlock; + + list_for_each_entry(info, &dmar_domain->devices, link) { + ret = intel_pasid_setup_dirty_tracking(info->iommu, + info->domain, info->dev, + IOMMU_NO_PASID, enable); + if (ret) + goto err_unwind; + } + + dmar_domain->dirty_tracking = enable; +out_unlock: + spin_unlock(&dmar_domain->lock); + + return 0; + +err_unwind: + list_for_each_entry(info, &dmar_domain->devices, link) + intel_pasid_setup_dirty_tracking(info->iommu, dmar_domain, + info->dev, IOMMU_NO_PASID, + dmar_domain->dirty_tracking); + spin_unlock(&dmar_domain->lock); + return ret; +} + +static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain, + unsigned long iova, size_t size, + unsigned long flags, + struct iommu_dirty_bitmap *dirty) +{ + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + unsigned long end = iova + size - 1; + unsigned long pgsize; + + /* + * IOMMUFD core calls into a dirty tracking disabled domain without an + * IOVA bitmap set in order to clean dirty bits in all PTEs that might + * have occurred when we stopped dirty tracking. This ensures that we + * never inherit dirtied bits from a previous cycle. + */ + if (!dmar_domain->dirty_tracking && dirty->bitmap) + return -EINVAL; + + do { + struct dma_pte *pte; + int lvl = 0; + + pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl, + GFP_ATOMIC); + pgsize = level_size(lvl) << VTD_PAGE_SHIFT; + if (!pte || !dma_pte_present(pte)) { + iova += pgsize; + continue; + } + + if (dma_sl_pte_test_and_clear_dirty(pte, flags)) + iommu_dirty_bitmap_record(dirty, iova, pgsize); + iova += pgsize; + } while (iova < end); + + return 0; +} + +const struct iommu_dirty_ops intel_dirty_ops = { + .set_dirty_tracking = intel_iommu_set_dirty_tracking, + .read_and_clear_dirty = intel_iommu_read_and_clear_dirty, +}; + const struct iommu_ops intel_iommu_ops = { .capable = intel_iommu_capable, .hw_info = intel_iommu_hw_info, .domain_alloc = intel_iommu_domain_alloc, + .domain_alloc_user = intel_iommu_domain_alloc_user, .probe_device = intel_iommu_probe_device, .probe_finalize = intel_iommu_probe_finalize, .release_device = intel_iommu_release_device, diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index 7dac94f62b4e..d796d0d9b114 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -25,6 +25,7 @@ #include <asm/cacheflush.h> #include <asm/iommu.h> +#include <uapi/linux/iommufd.h> /* * VT-d hardware uses 4KiB page size regardless of host page size. @@ -48,6 +49,9 @@ #define DMA_FL_PTE_DIRTY BIT_ULL(6) #define DMA_FL_PTE_XD BIT_ULL(63) +#define DMA_SL_PTE_DIRTY_BIT 9 +#define DMA_SL_PTE_DIRTY BIT_ULL(DMA_SL_PTE_DIRTY_BIT) + #define ADDR_WIDTH_5LEVEL (57) #define ADDR_WIDTH_4LEVEL (48) @@ -539,6 +543,10 @@ enum { #define sm_supported(iommu) (intel_iommu_sm && ecap_smts((iommu)->ecap)) #define pasid_supported(iommu) (sm_supported(iommu) && \ ecap_pasid((iommu)->ecap)) +#define ssads_supported(iommu) (sm_supported(iommu) && \ + ecap_slads((iommu)->ecap)) +#define nested_supported(iommu) (sm_supported(iommu) && \ + ecap_nest((iommu)->ecap)) struct pasid_entry; struct pasid_state_entry; @@ -592,20 +600,45 @@ struct dmar_domain { * otherwise, goes through the second * level. */ + u8 dirty_tracking:1; /* Dirty tracking is enabled */ + u8 nested_parent:1; /* Has other domains nested on it */ spinlock_t lock; /* Protect device tracking lists */ struct list_head devices; /* all devices' list */ struct list_head dev_pasids; /* all attached pasids */ - struct dma_pte *pgd; /* virtual address */ - int gaw; /* max guest address width */ - - /* adjusted guest address width, 0 is level 2 30-bit */ - int agaw; int iommu_superpage;/* Level of superpages supported: 0 == 4KiB (no superpages), 1 == 2MiB, 2 == 1GiB, 3 == 512GiB, 4 == 1TiB */ - u64 max_addr; /* maximum mapped address */ + union { + /* DMA remapping domain */ + struct { + /* virtual address */ + struct dma_pte *pgd; + /* max guest address width */ + int gaw; + /* + * adjusted guest address width: + * 0: level 2 30-bit + * 1: level 3 39-bit + * 2: level 4 48-bit + * 3: level 5 57-bit + */ + int agaw; + /* maximum mapped address */ + u64 max_addr; + }; + + /* Nested user domain */ + struct { + /* parent page table which the user domain is nested on */ + struct dmar_domain *s2_domain; + /* user page table pointer (in GPA) */ + unsigned long s1_pgtbl; + /* page table attributes */ + struct iommu_hwpt_vtd_s1 s1_cfg; + }; + }; struct iommu_domain domain; /* generic domain data structure for iommu core */ @@ -781,6 +814,16 @@ static inline bool dma_pte_present(struct dma_pte *pte) return (pte->val & 3) != 0; } +static inline bool dma_sl_pte_test_and_clear_dirty(struct dma_pte *pte, + unsigned long flags) +{ + if (flags & IOMMU_DIRTY_NO_CLEAR) + return (pte->val & DMA_SL_PTE_DIRTY) != 0; + + return test_and_clear_bit(DMA_SL_PTE_DIRTY_BIT, + (unsigned long *)&pte->val); +} + static inline bool dma_pte_superpage(struct dma_pte *pte) { return (pte->val & DMA_PTE_LARGE_PAGE); @@ -836,12 +879,21 @@ int qi_submit_sync(struct intel_iommu *iommu, struct qi_desc *desc, */ #define QI_OPT_WAIT_DRAIN BIT(0) +int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu); +void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu); +void device_block_translation(struct device *dev); +int prepare_domain_attach_device(struct iommu_domain *domain, + struct device *dev); +void domain_update_iommu_cap(struct dmar_domain *domain); + int dmar_ir_support(void); void *alloc_pgtable_page(int node, gfp_t gfp); void free_pgtable_page(void *vaddr); void iommu_flush_write_buffer(struct intel_iommu *iommu); struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn); +struct iommu_domain *intel_nested_domain_alloc(struct iommu_domain *parent, + const struct iommu_user_data *user_data); #ifdef CONFIG_INTEL_IOMMU_SVM void intel_svm_check(struct intel_iommu *iommu); diff --git a/drivers/iommu/intel/nested.c b/drivers/iommu/intel/nested.c new file mode 100644 index 000000000000..b5a5563ab32c --- /dev/null +++ b/drivers/iommu/intel/nested.c @@ -0,0 +1,117 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * nested.c - nested mode translation support + * + * Copyright (C) 2023 Intel Corporation + * + * Author: Lu Baolu <baolu.lu@linux.intel.com> + * Jacob Pan <jacob.jun.pan@linux.intel.com> + * Yi Liu <yi.l.liu@intel.com> + */ + +#define pr_fmt(fmt) "DMAR: " fmt + +#include <linux/iommu.h> +#include <linux/pci.h> +#include <linux/pci-ats.h> + +#include "iommu.h" +#include "pasid.h" + +static int intel_nested_attach_dev(struct iommu_domain *domain, + struct device *dev) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct dmar_domain *dmar_domain = to_dmar_domain(domain); + struct intel_iommu *iommu = info->iommu; + unsigned long flags; + int ret = 0; + + if (info->domain) + device_block_translation(dev); + + if (iommu->agaw < dmar_domain->s2_domain->agaw) { + dev_err_ratelimited(dev, "Adjusted guest address width not compatible\n"); + return -ENODEV; + } + + /* + * Stage-1 domain cannot work alone, it is nested on a s2_domain. + * The s2_domain will be used in nested translation, hence needs + * to ensure the s2_domain is compatible with this IOMMU. + */ + ret = prepare_domain_attach_device(&dmar_domain->s2_domain->domain, dev); + if (ret) { + dev_err_ratelimited(dev, "s2 domain is not compatible\n"); + return ret; + } + + ret = domain_attach_iommu(dmar_domain, iommu); + if (ret) { + dev_err_ratelimited(dev, "Failed to attach domain to iommu\n"); + return ret; + } + + ret = intel_pasid_setup_nested(iommu, dev, + IOMMU_NO_PASID, dmar_domain); + if (ret) { + domain_detach_iommu(dmar_domain, iommu); + dev_err_ratelimited(dev, "Failed to setup pasid entry\n"); + return ret; + } + + info->domain = dmar_domain; + spin_lock_irqsave(&dmar_domain->lock, flags); + list_add(&info->link, &dmar_domain->devices); + spin_unlock_irqrestore(&dmar_domain->lock, flags); + + return 0; +} + +static void intel_nested_domain_free(struct iommu_domain *domain) +{ + kfree(to_dmar_domain(domain)); +} + +static const struct iommu_domain_ops intel_nested_domain_ops = { + .attach_dev = intel_nested_attach_dev, + .free = intel_nested_domain_free, +}; + +struct iommu_domain *intel_nested_domain_alloc(struct iommu_domain *parent, + const struct iommu_user_data *user_data) +{ + struct dmar_domain *s2_domain = to_dmar_domain(parent); + struct iommu_hwpt_vtd_s1 vtd; + struct dmar_domain *domain; + int ret; + + /* Must be nested domain */ + if (user_data->type != IOMMU_HWPT_DATA_VTD_S1) + return ERR_PTR(-EOPNOTSUPP); + if (parent->ops != intel_iommu_ops.default_domain_ops || + !s2_domain->nested_parent) + return ERR_PTR(-EINVAL); + + ret = iommu_copy_struct_from_user(&vtd, user_data, + IOMMU_HWPT_DATA_VTD_S1, __reserved); + if (ret) + return ERR_PTR(ret); + + domain = kzalloc(sizeof(*domain), GFP_KERNEL_ACCOUNT); + if (!domain) + return ERR_PTR(-ENOMEM); + + domain->use_first_level = true; + domain->s2_domain = s2_domain; + domain->s1_pgtbl = vtd.pgtbl_addr; + domain->s1_cfg = vtd; + domain->domain.ops = &intel_nested_domain_ops; + domain->domain.type = IOMMU_DOMAIN_NESTED; + INIT_LIST_HEAD(&domain->devices); + INIT_LIST_HEAD(&domain->dev_pasids); + spin_lock_init(&domain->lock); + xa_init(&domain->iommu_array); + + return &domain->domain; +} diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index 8f92b92f3d2a..74e8e4c17e81 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -277,6 +277,11 @@ static inline void pasid_set_bits(u64 *ptr, u64 mask, u64 bits) WRITE_ONCE(*ptr, (old & ~mask) | bits); } +static inline u64 pasid_get_bits(u64 *ptr) +{ + return READ_ONCE(*ptr); +} + /* * Setup the DID(Domain Identifier) field (Bit 64~79) of scalable mode * PASID entry. @@ -336,6 +341,45 @@ static inline void pasid_set_fault_enable(struct pasid_entry *pe) } /* + * Enable second level A/D bits by setting the SLADE (Second Level + * Access Dirty Enable) field (Bit 9) of a scalable mode PASID + * entry. + */ +static inline void pasid_set_ssade(struct pasid_entry *pe) +{ + pasid_set_bits(&pe->val[0], 1 << 9, 1 << 9); +} + +/* + * Disable second level A/D bits by clearing the SLADE (Second Level + * Access Dirty Enable) field (Bit 9) of a scalable mode PASID + * entry. + */ +static inline void pasid_clear_ssade(struct pasid_entry *pe) +{ + pasid_set_bits(&pe->val[0], 1 << 9, 0); +} + +/* + * Checks if second level A/D bits specifically the SLADE (Second Level + * Access Dirty Enable) field (Bit 9) of a scalable mode PASID + * entry is set. + */ +static inline bool pasid_get_ssade(struct pasid_entry *pe) +{ + return pasid_get_bits(&pe->val[0]) & (1 << 9); +} + +/* + * Setup the SRE(Supervisor Request Enable) field (Bit 128) of a + * scalable mode PASID entry. + */ +static inline void pasid_set_sre(struct pasid_entry *pe) +{ + pasid_set_bits(&pe->val[2], 1 << 0, 1); +} + +/* * Setup the WPE(Write Protect Enable) field (Bit 132) of a * scalable mode PASID entry. */ @@ -402,6 +446,15 @@ pasid_set_flpm(struct pasid_entry *pe, u64 value) pasid_set_bits(&pe->val[2], GENMASK_ULL(3, 2), value << 2); } +/* + * Setup the Extended Access Flag Enable (EAFE) field (Bit 135) + * of a scalable mode PASID entry. + */ +static inline void pasid_set_eafe(struct pasid_entry *pe) +{ + pasid_set_bits(&pe->val[2], 1 << 7, 1 << 7); +} + static void pasid_cache_invalidation_with_pasid(struct intel_iommu *iommu, u16 did, u32 pasid) @@ -627,6 +680,8 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu, pasid_set_translation_type(pte, PASID_ENTRY_PGTT_SL_ONLY); pasid_set_fault_enable(pte); pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); + if (domain->dirty_tracking) + pasid_set_ssade(pte); pasid_set_present(pte); spin_unlock(&iommu->lock); @@ -637,6 +692,78 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu, } /* + * Set up dirty tracking on a second only or nested translation type. + */ +int intel_pasid_setup_dirty_tracking(struct intel_iommu *iommu, + struct dmar_domain *domain, + struct device *dev, u32 pasid, + bool enabled) +{ + struct pasid_entry *pte; + u16 did, pgtt; + + spin_lock(&iommu->lock); + + pte = intel_pasid_get_entry(dev, pasid); + if (!pte) { + spin_unlock(&iommu->lock); + dev_err_ratelimited( + dev, "Failed to get pasid entry of PASID %d\n", pasid); + return -ENODEV; + } + + did = domain_id_iommu(domain, iommu); + pgtt = pasid_pte_get_pgtt(pte); + if (pgtt != PASID_ENTRY_PGTT_SL_ONLY && + pgtt != PASID_ENTRY_PGTT_NESTED) { + spin_unlock(&iommu->lock); + dev_err_ratelimited( + dev, + "Dirty tracking not supported on translation type %d\n", + pgtt); + return -EOPNOTSUPP; + } + + if (pasid_get_ssade(pte) == enabled) { + spin_unlock(&iommu->lock); + return 0; + } + + if (enabled) + pasid_set_ssade(pte); + else + pasid_clear_ssade(pte); + spin_unlock(&iommu->lock); + + if (!ecap_coherent(iommu->ecap)) + clflush_cache_range(pte, sizeof(*pte)); + + /* + * From VT-d spec table 25 "Guidance to Software for Invalidations": + * + * - PASID-selective-within-Domain PASID-cache invalidation + * If (PGTT=SS or Nested) + * - Domain-selective IOTLB invalidation + * Else + * - PASID-selective PASID-based IOTLB invalidation + * - If (pasid is RID_PASID) + * - Global Device-TLB invalidation to affected functions + * Else + * - PASID-based Device-TLB invalidation (with S=1 and + * Addr[63:12]=0x7FFFFFFF_FFFFF) to affected functions + */ + pasid_cache_invalidation_with_pasid(iommu, did, pasid); + + iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH); + + /* Device IOTLB doesn't need to be flushed in caching mode. */ + if (!cap_caching_mode(iommu->cap)) + devtlb_invalidation_with_pasid(iommu, dev, pasid); + + return 0; +} + +/* * Set up the scalable mode pasid entry for passthrough translation type. */ int intel_pasid_setup_pass_through(struct intel_iommu *iommu, @@ -713,3 +840,97 @@ void intel_pasid_setup_page_snoop_control(struct intel_iommu *iommu, if (!cap_caching_mode(iommu->cap)) devtlb_invalidation_with_pasid(iommu, dev, pasid); } + +/** + * intel_pasid_setup_nested() - Set up PASID entry for nested translation. + * @iommu: IOMMU which the device belong to + * @dev: Device to be set up for translation + * @pasid: PASID to be programmed in the device PASID table + * @domain: User stage-1 domain nested on a stage-2 domain + * + * This is used for nested translation. The input domain should be + * nested type and nested on a parent with 'is_nested_parent' flag + * set. + */ +int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev, + u32 pasid, struct dmar_domain *domain) +{ + struct iommu_hwpt_vtd_s1 *s1_cfg = &domain->s1_cfg; + pgd_t *s1_gpgd = (pgd_t *)(uintptr_t)domain->s1_pgtbl; + struct dmar_domain *s2_domain = domain->s2_domain; + u16 did = domain_id_iommu(domain, iommu); + struct dma_pte *pgd = s2_domain->pgd; + struct pasid_entry *pte; + + /* Address width should match the address width supported by hardware */ + switch (s1_cfg->addr_width) { + case ADDR_WIDTH_4LEVEL: + break; + case ADDR_WIDTH_5LEVEL: + if (!cap_fl5lp_support(iommu->cap)) { + dev_err_ratelimited(dev, + "5-level paging not supported\n"); + return -EINVAL; + } + break; + default: + dev_err_ratelimited(dev, "Invalid stage-1 address width %d\n", + s1_cfg->addr_width); + return -EINVAL; + } + + if ((s1_cfg->flags & IOMMU_VTD_S1_SRE) && !ecap_srs(iommu->ecap)) { + pr_err_ratelimited("No supervisor request support on %s\n", + iommu->name); + return -EINVAL; + } + + if ((s1_cfg->flags & IOMMU_VTD_S1_EAFE) && !ecap_eafs(iommu->ecap)) { + pr_err_ratelimited("No extended access flag support on %s\n", + iommu->name); + return -EINVAL; + } + + spin_lock(&iommu->lock); + pte = intel_pasid_get_entry(dev, pasid); + if (!pte) { + spin_unlock(&iommu->lock); + return -ENODEV; + } + if (pasid_pte_is_present(pte)) { + spin_unlock(&iommu->lock); + return -EBUSY; + } + + pasid_clear_entry(pte); + + if (s1_cfg->addr_width == ADDR_WIDTH_5LEVEL) + pasid_set_flpm(pte, 1); + + pasid_set_flptr(pte, (uintptr_t)s1_gpgd); + + if (s1_cfg->flags & IOMMU_VTD_S1_SRE) { + pasid_set_sre(pte); + if (s1_cfg->flags & IOMMU_VTD_S1_WPE) + pasid_set_wpe(pte); + } + + if (s1_cfg->flags & IOMMU_VTD_S1_EAFE) + pasid_set_eafe(pte); + + if (s2_domain->force_snooping) + pasid_set_pgsnp(pte); + + pasid_set_slptr(pte, virt_to_phys(pgd)); + pasid_set_fault_enable(pte); + pasid_set_domain_id(pte, did); + pasid_set_address_width(pte, s2_domain->agaw); + pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); + pasid_set_translation_type(pte, PASID_ENTRY_PGTT_NESTED); + pasid_set_present(pte); + spin_unlock(&iommu->lock); + + pasid_flush_caches(iommu, pte, pasid, did); + + return 0; +} diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h index 4e9e68c3c388..dd37611175cc 100644 --- a/drivers/iommu/intel/pasid.h +++ b/drivers/iommu/intel/pasid.h @@ -106,9 +106,15 @@ int intel_pasid_setup_first_level(struct intel_iommu *iommu, int intel_pasid_setup_second_level(struct intel_iommu *iommu, struct dmar_domain *domain, struct device *dev, u32 pasid); +int intel_pasid_setup_dirty_tracking(struct intel_iommu *iommu, + struct dmar_domain *domain, + struct device *dev, u32 pasid, + bool enabled); int intel_pasid_setup_pass_through(struct intel_iommu *iommu, struct dmar_domain *domain, struct device *dev, u32 pasid); +int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev, + u32 pasid, struct dmar_domain *domain); void intel_pasid_tear_down_entry(struct intel_iommu *iommu, struct device *dev, u32 pasid, bool fault_ignore); diff --git a/drivers/iommu/iommufd/Makefile b/drivers/iommu/iommufd/Makefile index 8aeba81800c5..34b446146961 100644 --- a/drivers/iommu/iommufd/Makefile +++ b/drivers/iommu/iommufd/Makefile @@ -11,3 +11,4 @@ iommufd-y := \ iommufd-$(CONFIG_IOMMUFD_TEST) += selftest.o obj-$(CONFIG_IOMMUFD) += iommufd.o +obj-$(CONFIG_IOMMUFD_DRIVER) += iova_bitmap.o diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index ce78c3671539..59d3a07300d9 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -293,7 +293,7 @@ u32 iommufd_device_to_id(struct iommufd_device *idev) EXPORT_SYMBOL_NS_GPL(iommufd_device_to_id, IOMMUFD); static int iommufd_group_setup_msi(struct iommufd_group *igroup, - struct iommufd_hw_pagetable *hwpt) + struct iommufd_hwpt_paging *hwpt_paging) { phys_addr_t sw_msi_start = igroup->sw_msi_start; int rc; @@ -311,8 +311,9 @@ static int iommufd_group_setup_msi(struct iommufd_group *igroup, * matches what the IRQ layer actually expects in a newly created * domain. */ - if (sw_msi_start != PHYS_ADDR_MAX && !hwpt->msi_cookie) { - rc = iommu_get_msi_cookie(hwpt->domain, sw_msi_start); + if (sw_msi_start != PHYS_ADDR_MAX && !hwpt_paging->msi_cookie) { + rc = iommu_get_msi_cookie(hwpt_paging->common.domain, + sw_msi_start); if (rc) return rc; @@ -320,7 +321,31 @@ static int iommufd_group_setup_msi(struct iommufd_group *igroup, * iommu_get_msi_cookie() can only be called once per domain, * it returns -EBUSY on later calls. */ - hwpt->msi_cookie = true; + hwpt_paging->msi_cookie = true; + } + return 0; +} + +static int iommufd_hwpt_paging_attach(struct iommufd_hwpt_paging *hwpt_paging, + struct iommufd_device *idev) +{ + int rc; + + lockdep_assert_held(&idev->igroup->lock); + + rc = iopt_table_enforce_dev_resv_regions(&hwpt_paging->ioas->iopt, + idev->dev, + &idev->igroup->sw_msi_start); + if (rc) + return rc; + + if (list_empty(&idev->igroup->device_list)) { + rc = iommufd_group_setup_msi(idev->igroup, hwpt_paging); + if (rc) { + iopt_remove_reserved_iova(&hwpt_paging->ioas->iopt, + idev->dev); + return rc; + } } return 0; } @@ -337,18 +362,12 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, goto err_unlock; } - /* Try to upgrade the domain we have */ - if (idev->enforce_cache_coherency) { - rc = iommufd_hw_pagetable_enforce_cc(hwpt); + if (hwpt_is_paging(hwpt)) { + rc = iommufd_hwpt_paging_attach(to_hwpt_paging(hwpt), idev); if (rc) goto err_unlock; } - rc = iopt_table_enforce_dev_resv_regions(&hwpt->ioas->iopt, idev->dev, - &idev->igroup->sw_msi_start); - if (rc) - goto err_unlock; - /* * Only attach to the group once for the first device that is in the * group. All the other devices will follow this attachment. The user @@ -357,10 +376,6 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, * attachment. */ if (list_empty(&idev->igroup->device_list)) { - rc = iommufd_group_setup_msi(idev->igroup, hwpt); - if (rc) - goto err_unresv; - rc = iommu_attach_group(hwpt->domain, idev->igroup->group); if (rc) goto err_unresv; @@ -371,7 +386,9 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, mutex_unlock(&idev->igroup->lock); return 0; err_unresv: - iopt_remove_reserved_iova(&hwpt->ioas->iopt, idev->dev); + if (hwpt_is_paging(hwpt)) + iopt_remove_reserved_iova(&to_hwpt_paging(hwpt)->ioas->iopt, + idev->dev); err_unlock: mutex_unlock(&idev->igroup->lock); return rc; @@ -388,7 +405,9 @@ iommufd_hw_pagetable_detach(struct iommufd_device *idev) iommu_detach_group(hwpt->domain, idev->igroup->group); idev->igroup->hwpt = NULL; } - iopt_remove_reserved_iova(&hwpt->ioas->iopt, idev->dev); + if (hwpt_is_paging(hwpt)) + iopt_remove_reserved_iova(&to_hwpt_paging(hwpt)->ioas->iopt, + idev->dev); mutex_unlock(&idev->igroup->lock); /* Caller must destroy hwpt */ @@ -407,14 +426,55 @@ iommufd_device_do_attach(struct iommufd_device *idev, return NULL; } +static void +iommufd_group_remove_reserved_iova(struct iommufd_group *igroup, + struct iommufd_hwpt_paging *hwpt_paging) +{ + struct iommufd_device *cur; + + lockdep_assert_held(&igroup->lock); + + list_for_each_entry(cur, &igroup->device_list, group_item) + iopt_remove_reserved_iova(&hwpt_paging->ioas->iopt, cur->dev); +} + +static int +iommufd_group_do_replace_paging(struct iommufd_group *igroup, + struct iommufd_hwpt_paging *hwpt_paging) +{ + struct iommufd_hw_pagetable *old_hwpt = igroup->hwpt; + struct iommufd_device *cur; + int rc; + + lockdep_assert_held(&igroup->lock); + + if (!hwpt_is_paging(old_hwpt) || + hwpt_paging->ioas != to_hwpt_paging(old_hwpt)->ioas) { + list_for_each_entry(cur, &igroup->device_list, group_item) { + rc = iopt_table_enforce_dev_resv_regions( + &hwpt_paging->ioas->iopt, cur->dev, NULL); + if (rc) + goto err_unresv; + } + } + + rc = iommufd_group_setup_msi(igroup, hwpt_paging); + if (rc) + goto err_unresv; + return 0; + +err_unresv: + iommufd_group_remove_reserved_iova(igroup, hwpt_paging); + return rc; +} + static struct iommufd_hw_pagetable * iommufd_device_do_replace(struct iommufd_device *idev, struct iommufd_hw_pagetable *hwpt) { struct iommufd_group *igroup = idev->igroup; struct iommufd_hw_pagetable *old_hwpt; - unsigned int num_devices = 0; - struct iommufd_device *cur; + unsigned int num_devices; int rc; mutex_lock(&idev->igroup->lock); @@ -429,42 +489,27 @@ iommufd_device_do_replace(struct iommufd_device *idev, return NULL; } - /* Try to upgrade the domain we have */ - list_for_each_entry(cur, &igroup->device_list, group_item) { - num_devices++; - if (cur->enforce_cache_coherency) { - rc = iommufd_hw_pagetable_enforce_cc(hwpt); - if (rc) - goto err_unlock; - } - } - old_hwpt = igroup->hwpt; - if (hwpt->ioas != old_hwpt->ioas) { - list_for_each_entry(cur, &igroup->device_list, group_item) { - rc = iopt_table_enforce_dev_resv_regions( - &hwpt->ioas->iopt, cur->dev, NULL); - if (rc) - goto err_unresv; - } + if (hwpt_is_paging(hwpt)) { + rc = iommufd_group_do_replace_paging(igroup, + to_hwpt_paging(hwpt)); + if (rc) + goto err_unlock; } - rc = iommufd_group_setup_msi(idev->igroup, hwpt); - if (rc) - goto err_unresv; - rc = iommu_group_replace_domain(igroup->group, hwpt->domain); if (rc) goto err_unresv; - if (hwpt->ioas != old_hwpt->ioas) { - list_for_each_entry(cur, &igroup->device_list, group_item) - iopt_remove_reserved_iova(&old_hwpt->ioas->iopt, - cur->dev); - } + if (hwpt_is_paging(old_hwpt) && + (!hwpt_is_paging(hwpt) || + to_hwpt_paging(hwpt)->ioas != to_hwpt_paging(old_hwpt)->ioas)) + iommufd_group_remove_reserved_iova(igroup, + to_hwpt_paging(old_hwpt)); igroup->hwpt = hwpt; + num_devices = list_count_nodes(&igroup->device_list); /* * Move the refcounts held by the device_list to the new hwpt. Retain a * refcount for this thread as the caller will free it. @@ -478,8 +523,9 @@ iommufd_device_do_replace(struct iommufd_device *idev, /* Caller must destroy old_hwpt */ return old_hwpt; err_unresv: - list_for_each_entry(cur, &igroup->device_list, group_item) - iopt_remove_reserved_iova(&hwpt->ioas->iopt, cur->dev); + if (hwpt_is_paging(hwpt)) + iommufd_group_remove_reserved_iova(igroup, + to_hwpt_paging(old_hwpt)); err_unlock: mutex_unlock(&idev->igroup->lock); return ERR_PTR(rc); @@ -507,6 +553,7 @@ iommufd_device_auto_get_domain(struct iommufd_device *idev, */ bool immediate_attach = do_attach == iommufd_device_do_attach; struct iommufd_hw_pagetable *destroy_hwpt; + struct iommufd_hwpt_paging *hwpt_paging; struct iommufd_hw_pagetable *hwpt; /* @@ -515,10 +562,11 @@ iommufd_device_auto_get_domain(struct iommufd_device *idev, * other. */ mutex_lock(&ioas->mutex); - list_for_each_entry(hwpt, &ioas->hwpt_list, hwpt_item) { - if (!hwpt->auto_domain) + list_for_each_entry(hwpt_paging, &ioas->hwpt_list, hwpt_item) { + if (!hwpt_paging->auto_domain) continue; + hwpt = &hwpt_paging->common; if (!iommufd_lock_obj(&hwpt->obj)) continue; destroy_hwpt = (*do_attach)(idev, hwpt); @@ -539,12 +587,13 @@ iommufd_device_auto_get_domain(struct iommufd_device *idev, goto out_unlock; } - hwpt = iommufd_hw_pagetable_alloc(idev->ictx, ioas, idev, - immediate_attach); - if (IS_ERR(hwpt)) { - destroy_hwpt = ERR_CAST(hwpt); + hwpt_paging = iommufd_hwpt_paging_alloc(idev->ictx, ioas, idev, 0, + immediate_attach, NULL); + if (IS_ERR(hwpt_paging)) { + destroy_hwpt = ERR_CAST(hwpt_paging); goto out_unlock; } + hwpt = &hwpt_paging->common; if (!immediate_attach) { destroy_hwpt = (*do_attach)(idev, hwpt); @@ -554,7 +603,7 @@ iommufd_device_auto_get_domain(struct iommufd_device *idev, destroy_hwpt = NULL; } - hwpt->auto_domain = true; + hwpt_paging->auto_domain = true; *pt_id = hwpt->obj.id; iommufd_object_finalize(idev->ictx, &hwpt->obj); @@ -579,7 +628,8 @@ static int iommufd_device_change_pt(struct iommufd_device *idev, u32 *pt_id, return PTR_ERR(pt_obj); switch (pt_obj->type) { - case IOMMUFD_OBJ_HW_PAGETABLE: { + case IOMMUFD_OBJ_HWPT_NESTED: + case IOMMUFD_OBJ_HWPT_PAGING: { struct iommufd_hw_pagetable *hwpt = container_of(pt_obj, struct iommufd_hw_pagetable, obj); @@ -617,8 +667,8 @@ out_put_pt_obj: /** * iommufd_device_attach - Connect a device to an iommu_domain * @idev: device to attach - * @pt_id: Input a IOMMUFD_OBJ_IOAS, or IOMMUFD_OBJ_HW_PAGETABLE - * Output the IOMMUFD_OBJ_HW_PAGETABLE ID + * @pt_id: Input a IOMMUFD_OBJ_IOAS, or IOMMUFD_OBJ_HWPT_PAGING + * Output the IOMMUFD_OBJ_HWPT_PAGING ID * * This connects the device to an iommu_domain, either automatically or manually * selected. Once this completes the device could do DMA. @@ -646,8 +696,8 @@ EXPORT_SYMBOL_NS_GPL(iommufd_device_attach, IOMMUFD); /** * iommufd_device_replace - Change the device's iommu_domain * @idev: device to change - * @pt_id: Input a IOMMUFD_OBJ_IOAS, or IOMMUFD_OBJ_HW_PAGETABLE - * Output the IOMMUFD_OBJ_HW_PAGETABLE ID + * @pt_id: Input a IOMMUFD_OBJ_IOAS, or IOMMUFD_OBJ_HWPT_PAGING + * Output the IOMMUFD_OBJ_HWPT_PAGING ID * * This is the same as:: * @@ -1185,6 +1235,10 @@ int iommufd_get_hw_info(struct iommufd_ucmd *ucmd) */ cmd->data_len = data_len; + cmd->out_capabilities = 0; + if (device_iommu_capable(idev->dev, IOMMU_CAP_DIRTY_TRACKING)) + cmd->out_capabilities |= IOMMU_HW_CAP_DIRTY_TRACKING; + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); out_free: kfree(data); diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index cf2c1504e20d..2abbeafdbd22 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -5,62 +5,87 @@ #include <linux/iommu.h> #include <uapi/linux/iommufd.h> +#include "../iommu-priv.h" #include "iommufd_private.h" -void iommufd_hw_pagetable_destroy(struct iommufd_object *obj) +void iommufd_hwpt_paging_destroy(struct iommufd_object *obj) { - struct iommufd_hw_pagetable *hwpt = - container_of(obj, struct iommufd_hw_pagetable, obj); + struct iommufd_hwpt_paging *hwpt_paging = + container_of(obj, struct iommufd_hwpt_paging, common.obj); - if (!list_empty(&hwpt->hwpt_item)) { - mutex_lock(&hwpt->ioas->mutex); - list_del(&hwpt->hwpt_item); - mutex_unlock(&hwpt->ioas->mutex); + if (!list_empty(&hwpt_paging->hwpt_item)) { + mutex_lock(&hwpt_paging->ioas->mutex); + list_del(&hwpt_paging->hwpt_item); + mutex_unlock(&hwpt_paging->ioas->mutex); - iopt_table_remove_domain(&hwpt->ioas->iopt, hwpt->domain); + iopt_table_remove_domain(&hwpt_paging->ioas->iopt, + hwpt_paging->common.domain); } - if (hwpt->domain) - iommu_domain_free(hwpt->domain); + if (hwpt_paging->common.domain) + iommu_domain_free(hwpt_paging->common.domain); - refcount_dec(&hwpt->ioas->obj.users); + refcount_dec(&hwpt_paging->ioas->obj.users); } -void iommufd_hw_pagetable_abort(struct iommufd_object *obj) +void iommufd_hwpt_paging_abort(struct iommufd_object *obj) { - struct iommufd_hw_pagetable *hwpt = - container_of(obj, struct iommufd_hw_pagetable, obj); + struct iommufd_hwpt_paging *hwpt_paging = + container_of(obj, struct iommufd_hwpt_paging, common.obj); /* The ioas->mutex must be held until finalize is called. */ - lockdep_assert_held(&hwpt->ioas->mutex); + lockdep_assert_held(&hwpt_paging->ioas->mutex); - if (!list_empty(&hwpt->hwpt_item)) { - list_del_init(&hwpt->hwpt_item); - iopt_table_remove_domain(&hwpt->ioas->iopt, hwpt->domain); + if (!list_empty(&hwpt_paging->hwpt_item)) { + list_del_init(&hwpt_paging->hwpt_item); + iopt_table_remove_domain(&hwpt_paging->ioas->iopt, + hwpt_paging->common.domain); } - iommufd_hw_pagetable_destroy(obj); + iommufd_hwpt_paging_destroy(obj); } -int iommufd_hw_pagetable_enforce_cc(struct iommufd_hw_pagetable *hwpt) +void iommufd_hwpt_nested_destroy(struct iommufd_object *obj) { - if (hwpt->enforce_cache_coherency) + struct iommufd_hwpt_nested *hwpt_nested = + container_of(obj, struct iommufd_hwpt_nested, common.obj); + + if (hwpt_nested->common.domain) + iommu_domain_free(hwpt_nested->common.domain); + + refcount_dec(&hwpt_nested->parent->common.obj.users); +} + +void iommufd_hwpt_nested_abort(struct iommufd_object *obj) +{ + iommufd_hwpt_nested_destroy(obj); +} + +static int +iommufd_hwpt_paging_enforce_cc(struct iommufd_hwpt_paging *hwpt_paging) +{ + struct iommu_domain *paging_domain = hwpt_paging->common.domain; + + if (hwpt_paging->enforce_cache_coherency) return 0; - if (hwpt->domain->ops->enforce_cache_coherency) - hwpt->enforce_cache_coherency = - hwpt->domain->ops->enforce_cache_coherency( - hwpt->domain); - if (!hwpt->enforce_cache_coherency) + if (paging_domain->ops->enforce_cache_coherency) + hwpt_paging->enforce_cache_coherency = + paging_domain->ops->enforce_cache_coherency( + paging_domain); + if (!hwpt_paging->enforce_cache_coherency) return -EINVAL; return 0; } /** - * iommufd_hw_pagetable_alloc() - Get an iommu_domain for a device + * iommufd_hwpt_paging_alloc() - Get a PAGING iommu_domain for a device * @ictx: iommufd context * @ioas: IOAS to associate the domain with * @idev: Device to get an iommu_domain for + * @flags: Flags from userspace * @immediate_attach: True if idev should be attached to the hwpt + * @user_data: The user provided driver specific data describing the domain to + * create * * Allocate a new iommu_domain and return it as a hw_pagetable. The HWPT * will be linked to the given ioas and upon return the underlying iommu_domain @@ -70,28 +95,52 @@ int iommufd_hw_pagetable_enforce_cc(struct iommufd_hw_pagetable *hwpt) * iommufd_object_abort_and_destroy() or iommufd_object_finalize() is called on * the returned hwpt. */ -struct iommufd_hw_pagetable * -iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, - struct iommufd_device *idev, bool immediate_attach) +struct iommufd_hwpt_paging * +iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, + struct iommufd_device *idev, u32 flags, + bool immediate_attach, + const struct iommu_user_data *user_data) { + const u32 valid_flags = IOMMU_HWPT_ALLOC_NEST_PARENT | + IOMMU_HWPT_ALLOC_DIRTY_TRACKING; + const struct iommu_ops *ops = dev_iommu_ops(idev->dev); + struct iommufd_hwpt_paging *hwpt_paging; struct iommufd_hw_pagetable *hwpt; int rc; lockdep_assert_held(&ioas->mutex); - hwpt = iommufd_object_alloc(ictx, hwpt, IOMMUFD_OBJ_HW_PAGETABLE); - if (IS_ERR(hwpt)) - return hwpt; + if ((flags || user_data) && !ops->domain_alloc_user) + return ERR_PTR(-EOPNOTSUPP); + if (flags & ~valid_flags) + return ERR_PTR(-EOPNOTSUPP); + + hwpt_paging = __iommufd_object_alloc( + ictx, hwpt_paging, IOMMUFD_OBJ_HWPT_PAGING, common.obj); + if (IS_ERR(hwpt_paging)) + return ERR_CAST(hwpt_paging); + hwpt = &hwpt_paging->common; - INIT_LIST_HEAD(&hwpt->hwpt_item); + INIT_LIST_HEAD(&hwpt_paging->hwpt_item); /* Pairs with iommufd_hw_pagetable_destroy() */ refcount_inc(&ioas->obj.users); - hwpt->ioas = ioas; + hwpt_paging->ioas = ioas; + hwpt_paging->nest_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT; - hwpt->domain = iommu_domain_alloc(idev->dev->bus); - if (!hwpt->domain) { - rc = -ENOMEM; - goto out_abort; + if (ops->domain_alloc_user) { + hwpt->domain = ops->domain_alloc_user(idev->dev, flags, NULL, + user_data); + if (IS_ERR(hwpt->domain)) { + rc = PTR_ERR(hwpt->domain); + hwpt->domain = NULL; + goto out_abort; + } + } else { + hwpt->domain = iommu_domain_alloc(idev->dev->bus); + if (!hwpt->domain) { + rc = -ENOMEM; + goto out_abort; + } } /* @@ -100,9 +149,16 @@ iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, * doing any maps. It is an iommu driver bug to report * IOMMU_CAP_ENFORCE_CACHE_COHERENCY but fail enforce_cache_coherency on * a new domain. + * + * The cache coherency mode must be configured here and unchanged later. + * Note that a HWPT (non-CC) created for a device (non-CC) can be later + * reused by another device (either non-CC or CC). However, A HWPT (CC) + * created for a device (CC) cannot be reused by another device (non-CC) + * but only devices (CC). Instead user space in this case would need to + * allocate a separate HWPT (non-CC). */ if (idev->enforce_cache_coherency) { - rc = iommufd_hw_pagetable_enforce_cc(hwpt); + rc = iommufd_hwpt_paging_enforce_cc(hwpt_paging); if (WARN_ON(rc)) goto out_abort; } @@ -119,11 +175,11 @@ iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, goto out_abort; } - rc = iopt_table_add_domain(&hwpt->ioas->iopt, hwpt->domain); + rc = iopt_table_add_domain(&ioas->iopt, hwpt->domain); if (rc) goto out_detach; - list_add_tail(&hwpt->hwpt_item, &hwpt->ioas->hwpt_list); - return hwpt; + list_add_tail(&hwpt_paging->hwpt_item, &ioas->hwpt_list); + return hwpt_paging; out_detach: if (immediate_attach) @@ -133,32 +189,120 @@ out_abort: return ERR_PTR(rc); } +/** + * iommufd_hwpt_nested_alloc() - Get a NESTED iommu_domain for a device + * @ictx: iommufd context + * @parent: Parent PAGING-type hwpt to associate the domain with + * @idev: Device to get an iommu_domain for + * @flags: Flags from userspace + * @user_data: user_data pointer. Must be valid + * + * Allocate a new iommu_domain (must be IOMMU_DOMAIN_NESTED) and return it as + * a NESTED hw_pagetable. The given parent PAGING-type hwpt must be capable of + * being a parent. + */ +static struct iommufd_hwpt_nested * +iommufd_hwpt_nested_alloc(struct iommufd_ctx *ictx, + struct iommufd_hwpt_paging *parent, + struct iommufd_device *idev, u32 flags, + const struct iommu_user_data *user_data) +{ + const struct iommu_ops *ops = dev_iommu_ops(idev->dev); + struct iommufd_hwpt_nested *hwpt_nested; + struct iommufd_hw_pagetable *hwpt; + int rc; + + if (flags || !user_data->len || !ops->domain_alloc_user) + return ERR_PTR(-EOPNOTSUPP); + if (parent->auto_domain || !parent->nest_parent) + return ERR_PTR(-EINVAL); + + hwpt_nested = __iommufd_object_alloc( + ictx, hwpt_nested, IOMMUFD_OBJ_HWPT_NESTED, common.obj); + if (IS_ERR(hwpt_nested)) + return ERR_CAST(hwpt_nested); + hwpt = &hwpt_nested->common; + + refcount_inc(&parent->common.obj.users); + hwpt_nested->parent = parent; + + hwpt->domain = ops->domain_alloc_user(idev->dev, flags, + parent->common.domain, user_data); + if (IS_ERR(hwpt->domain)) { + rc = PTR_ERR(hwpt->domain); + hwpt->domain = NULL; + goto out_abort; + } + + if (WARN_ON_ONCE(hwpt->domain->type != IOMMU_DOMAIN_NESTED)) { + rc = -EINVAL; + goto out_abort; + } + return hwpt_nested; + +out_abort: + iommufd_object_abort_and_destroy(ictx, &hwpt->obj); + return ERR_PTR(rc); +} + int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd) { struct iommu_hwpt_alloc *cmd = ucmd->cmd; + const struct iommu_user_data user_data = { + .type = cmd->data_type, + .uptr = u64_to_user_ptr(cmd->data_uptr), + .len = cmd->data_len, + }; struct iommufd_hw_pagetable *hwpt; + struct iommufd_ioas *ioas = NULL; + struct iommufd_object *pt_obj; struct iommufd_device *idev; - struct iommufd_ioas *ioas; int rc; - if (cmd->flags || cmd->__reserved) + if (cmd->__reserved) return -EOPNOTSUPP; + if (cmd->data_type == IOMMU_HWPT_DATA_NONE && cmd->data_len) + return -EINVAL; idev = iommufd_get_device(ucmd, cmd->dev_id); if (IS_ERR(idev)) return PTR_ERR(idev); - ioas = iommufd_get_ioas(ucmd->ictx, cmd->pt_id); - if (IS_ERR(ioas)) { - rc = PTR_ERR(ioas); + pt_obj = iommufd_get_object(ucmd->ictx, cmd->pt_id, IOMMUFD_OBJ_ANY); + if (IS_ERR(pt_obj)) { + rc = -EINVAL; goto out_put_idev; } - mutex_lock(&ioas->mutex); - hwpt = iommufd_hw_pagetable_alloc(ucmd->ictx, ioas, idev, false); - if (IS_ERR(hwpt)) { - rc = PTR_ERR(hwpt); - goto out_unlock; + if (pt_obj->type == IOMMUFD_OBJ_IOAS) { + struct iommufd_hwpt_paging *hwpt_paging; + + ioas = container_of(pt_obj, struct iommufd_ioas, obj); + mutex_lock(&ioas->mutex); + hwpt_paging = iommufd_hwpt_paging_alloc( + ucmd->ictx, ioas, idev, cmd->flags, false, + user_data.len ? &user_data : NULL); + if (IS_ERR(hwpt_paging)) { + rc = PTR_ERR(hwpt_paging); + goto out_unlock; + } + hwpt = &hwpt_paging->common; + } else if (pt_obj->type == IOMMUFD_OBJ_HWPT_PAGING) { + struct iommufd_hwpt_nested *hwpt_nested; + + hwpt_nested = iommufd_hwpt_nested_alloc( + ucmd->ictx, + container_of(pt_obj, struct iommufd_hwpt_paging, + common.obj), + idev, cmd->flags, &user_data); + if (IS_ERR(hwpt_nested)) { + rc = PTR_ERR(hwpt_nested); + goto out_unlock; + } + hwpt = &hwpt_nested->common; + } else { + rc = -EINVAL; + goto out_put_pt; } cmd->out_hwpt_id = hwpt->obj.id; @@ -171,9 +315,59 @@ int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd) out_hwpt: iommufd_object_abort_and_destroy(ucmd->ictx, &hwpt->obj); out_unlock: - mutex_unlock(&ioas->mutex); - iommufd_put_object(&ioas->obj); + if (ioas) + mutex_unlock(&ioas->mutex); +out_put_pt: + iommufd_put_object(pt_obj); out_put_idev: iommufd_put_object(&idev->obj); return rc; } + +int iommufd_hwpt_set_dirty_tracking(struct iommufd_ucmd *ucmd) +{ + struct iommu_hwpt_set_dirty_tracking *cmd = ucmd->cmd; + struct iommufd_hwpt_paging *hwpt_paging; + struct iommufd_ioas *ioas; + int rc = -EOPNOTSUPP; + bool enable; + + if (cmd->flags & ~IOMMU_HWPT_DIRTY_TRACKING_ENABLE) + return rc; + + hwpt_paging = iommufd_get_hwpt_paging(ucmd, cmd->hwpt_id); + if (IS_ERR(hwpt_paging)) + return PTR_ERR(hwpt_paging); + + ioas = hwpt_paging->ioas; + enable = cmd->flags & IOMMU_HWPT_DIRTY_TRACKING_ENABLE; + + rc = iopt_set_dirty_tracking(&ioas->iopt, hwpt_paging->common.domain, + enable); + + iommufd_put_object(&hwpt_paging->common.obj); + return rc; +} + +int iommufd_hwpt_get_dirty_bitmap(struct iommufd_ucmd *ucmd) +{ + struct iommu_hwpt_get_dirty_bitmap *cmd = ucmd->cmd; + struct iommufd_hwpt_paging *hwpt_paging; + struct iommufd_ioas *ioas; + int rc = -EOPNOTSUPP; + + if ((cmd->flags & ~(IOMMU_HWPT_GET_DIRTY_BITMAP_NO_CLEAR)) || + cmd->__reserved) + return -EOPNOTSUPP; + + hwpt_paging = iommufd_get_hwpt_paging(ucmd, cmd->hwpt_id); + if (IS_ERR(hwpt_paging)) + return PTR_ERR(hwpt_paging); + + ioas = hwpt_paging->ioas; + rc = iopt_read_and_clear_dirty_data( + &ioas->iopt, hwpt_paging->common.domain, cmd->flags, cmd); + + iommufd_put_object(&hwpt_paging->common.obj); + return rc; +} diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c index 3a598182b761..504ac1b01b2d 100644 --- a/drivers/iommu/iommufd/io_pagetable.c +++ b/drivers/iommu/iommufd/io_pagetable.c @@ -15,6 +15,7 @@ #include <linux/err.h> #include <linux/slab.h> #include <linux/errno.h> +#include <uapi/linux/iommufd.h> #include "io_pagetable.h" #include "double_span.h" @@ -221,6 +222,18 @@ static int iopt_insert_area(struct io_pagetable *iopt, struct iopt_area *area, return 0; } +static struct iopt_area *iopt_area_alloc(void) +{ + struct iopt_area *area; + + area = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT); + if (!area) + return NULL; + RB_CLEAR_NODE(&area->node.rb); + RB_CLEAR_NODE(&area->pages_node.rb); + return area; +} + static int iopt_alloc_area_pages(struct io_pagetable *iopt, struct list_head *pages_list, unsigned long length, unsigned long *dst_iova, @@ -231,7 +244,7 @@ static int iopt_alloc_area_pages(struct io_pagetable *iopt, int rc = 0; list_for_each_entry(elm, pages_list, next) { - elm->area = kzalloc(sizeof(*elm->area), GFP_KERNEL_ACCOUNT); + elm->area = iopt_area_alloc(); if (!elm->area) return -ENOMEM; } @@ -412,6 +425,177 @@ int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt, return 0; } +struct iova_bitmap_fn_arg { + unsigned long flags; + struct io_pagetable *iopt; + struct iommu_domain *domain; + struct iommu_dirty_bitmap *dirty; +}; + +static int __iommu_read_and_clear_dirty(struct iova_bitmap *bitmap, + unsigned long iova, size_t length, + void *opaque) +{ + struct iopt_area *area; + struct iopt_area_contig_iter iter; + struct iova_bitmap_fn_arg *arg = opaque; + struct iommu_domain *domain = arg->domain; + struct iommu_dirty_bitmap *dirty = arg->dirty; + const struct iommu_dirty_ops *ops = domain->dirty_ops; + unsigned long last_iova = iova + length - 1; + unsigned long flags = arg->flags; + int ret; + + iopt_for_each_contig_area(&iter, area, arg->iopt, iova, last_iova) { + unsigned long last = min(last_iova, iopt_area_last_iova(area)); + + ret = ops->read_and_clear_dirty(domain, iter.cur_iova, + last - iter.cur_iova + 1, flags, + dirty); + if (ret) + return ret; + } + + if (!iopt_area_contig_done(&iter)) + return -EINVAL; + return 0; +} + +static int +iommu_read_and_clear_dirty(struct iommu_domain *domain, + struct io_pagetable *iopt, unsigned long flags, + struct iommu_hwpt_get_dirty_bitmap *bitmap) +{ + const struct iommu_dirty_ops *ops = domain->dirty_ops; + struct iommu_iotlb_gather gather; + struct iommu_dirty_bitmap dirty; + struct iova_bitmap_fn_arg arg; + struct iova_bitmap *iter; + int ret = 0; + + if (!ops || !ops->read_and_clear_dirty) + return -EOPNOTSUPP; + + iter = iova_bitmap_alloc(bitmap->iova, bitmap->length, + bitmap->page_size, + u64_to_user_ptr(bitmap->data)); + if (IS_ERR(iter)) + return -ENOMEM; + + iommu_dirty_bitmap_init(&dirty, iter, &gather); + + arg.flags = flags; + arg.iopt = iopt; + arg.domain = domain; + arg.dirty = &dirty; + iova_bitmap_for_each(iter, &arg, __iommu_read_and_clear_dirty); + + if (!(flags & IOMMU_DIRTY_NO_CLEAR)) + iommu_iotlb_sync(domain, &gather); + + iova_bitmap_free(iter); + + return ret; +} + +int iommufd_check_iova_range(struct io_pagetable *iopt, + struct iommu_hwpt_get_dirty_bitmap *bitmap) +{ + size_t iommu_pgsize = iopt->iova_alignment; + u64 last_iova; + + if (check_add_overflow(bitmap->iova, bitmap->length - 1, &last_iova)) + return -EOVERFLOW; + + if (bitmap->iova > ULONG_MAX || last_iova > ULONG_MAX) + return -EOVERFLOW; + + if ((bitmap->iova & (iommu_pgsize - 1)) || + ((last_iova + 1) & (iommu_pgsize - 1))) + return -EINVAL; + + if (!bitmap->page_size) + return -EINVAL; + + if ((bitmap->iova & (bitmap->page_size - 1)) || + ((last_iova + 1) & (bitmap->page_size - 1))) + return -EINVAL; + + return 0; +} + +int iopt_read_and_clear_dirty_data(struct io_pagetable *iopt, + struct iommu_domain *domain, + unsigned long flags, + struct iommu_hwpt_get_dirty_bitmap *bitmap) +{ + int ret; + + ret = iommufd_check_iova_range(iopt, bitmap); + if (ret) + return ret; + + down_read(&iopt->iova_rwsem); + ret = iommu_read_and_clear_dirty(domain, iopt, flags, bitmap); + up_read(&iopt->iova_rwsem); + + return ret; +} + +static int iopt_clear_dirty_data(struct io_pagetable *iopt, + struct iommu_domain *domain) +{ + const struct iommu_dirty_ops *ops = domain->dirty_ops; + struct iommu_iotlb_gather gather; + struct iommu_dirty_bitmap dirty; + struct iopt_area *area; + int ret = 0; + + lockdep_assert_held_read(&iopt->iova_rwsem); + + iommu_dirty_bitmap_init(&dirty, NULL, &gather); + + for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area; + area = iopt_area_iter_next(area, 0, ULONG_MAX)) { + if (!area->pages) + continue; + + ret = ops->read_and_clear_dirty(domain, iopt_area_iova(area), + iopt_area_length(area), 0, + &dirty); + if (ret) + break; + } + + iommu_iotlb_sync(domain, &gather); + return ret; +} + +int iopt_set_dirty_tracking(struct io_pagetable *iopt, + struct iommu_domain *domain, bool enable) +{ + const struct iommu_dirty_ops *ops = domain->dirty_ops; + int ret = 0; + + if (!ops) + return -EOPNOTSUPP; + + down_read(&iopt->iova_rwsem); + + /* Clear dirty bits from PTEs to ensure a clean snapshot */ + if (enable) { + ret = iopt_clear_dirty_data(iopt, domain); + if (ret) + goto out_unlock; + } + + ret = ops->set_dirty_tracking(domain, enable); + +out_unlock: + up_read(&iopt->iova_rwsem); + return ret; +} + int iopt_get_pages(struct io_pagetable *iopt, unsigned long iova, unsigned long length, struct list_head *pages_list) { @@ -1005,11 +1189,11 @@ static int iopt_area_split(struct iopt_area *area, unsigned long iova) iopt_area_start_byte(area, new_start) & (alignment - 1)) return -EINVAL; - lhs = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT); + lhs = iopt_area_alloc(); if (!lhs) return -ENOMEM; - rhs = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT); + rhs = iopt_area_alloc(); if (!rhs) { rc = -ENOMEM; goto err_free_lhs; @@ -1048,6 +1232,16 @@ static int iopt_area_split(struct iopt_area *area, unsigned long iova) if (WARN_ON(rc)) goto err_remove_lhs; + /* + * If the original area has filled a domain, domains_itree has to be + * updated. + */ + if (area->storage_domain) { + interval_tree_remove(&area->pages_node, &pages->domains_itree); + interval_tree_insert(&lhs->pages_node, &pages->domains_itree); + interval_tree_insert(&rhs->pages_node, &pages->domains_itree); + } + lhs->storage_domain = area->storage_domain; lhs->pages = area->pages; rhs->storage_domain = area->storage_domain; diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 2c58670011fe..a74cfefffbc6 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -8,6 +8,9 @@ #include <linux/xarray.h> #include <linux/refcount.h> #include <linux/uaccess.h> +#include <linux/iommu.h> +#include <linux/iova_bitmap.h> +#include <uapi/linux/iommufd.h> struct iommu_domain; struct iommu_group; @@ -70,6 +73,13 @@ int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova, unsigned long length, unsigned long *unmapped); int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped); +int iopt_read_and_clear_dirty_data(struct io_pagetable *iopt, + struct iommu_domain *domain, + unsigned long flags, + struct iommu_hwpt_get_dirty_bitmap *bitmap); +int iopt_set_dirty_tracking(struct io_pagetable *iopt, + struct iommu_domain *domain, bool enable); + void iommufd_access_notify_unmap(struct io_pagetable *iopt, unsigned long iova, unsigned long length); int iopt_table_add_domain(struct io_pagetable *iopt, @@ -113,7 +123,8 @@ enum iommufd_object_type { IOMMUFD_OBJ_NONE, IOMMUFD_OBJ_ANY = IOMMUFD_OBJ_NONE, IOMMUFD_OBJ_DEVICE, - IOMMUFD_OBJ_HW_PAGETABLE, + IOMMUFD_OBJ_HWPT_PAGING, + IOMMUFD_OBJ_HWPT_NESTED, IOMMUFD_OBJ_IOAS, IOMMUFD_OBJ_ACCESS, #ifdef CONFIG_IOMMUFD_TEST @@ -171,7 +182,7 @@ struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx, size_t size, enum iommufd_object_type type); -#define iommufd_object_alloc(ictx, ptr, type) \ +#define __iommufd_object_alloc(ictx, ptr, type, obj) \ container_of(_iommufd_object_alloc( \ ictx, \ sizeof(*(ptr)) + BUILD_BUG_ON_ZERO( \ @@ -180,6 +191,9 @@ struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx, type), \ typeof(*(ptr)), obj) +#define iommufd_object_alloc(ictx, ptr, type) \ + __iommufd_object_alloc(ictx, ptr, type, obj) + /* * The IO Address Space (IOAS) pagetable is a virtual page table backed by the * io_pagetable object. It is a user controlled mapping of IOVA -> PFNs. The @@ -222,6 +236,8 @@ int iommufd_option_rlimit_mode(struct iommu_option *cmd, struct iommufd_ctx *ictx); int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd); +int iommufd_check_iova_range(struct io_pagetable *iopt, + struct iommu_hwpt_get_dirty_bitmap *bitmap); /* * A HW pagetable is called an iommu_domain inside the kernel. This user object @@ -231,35 +247,75 @@ int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd); */ struct iommufd_hw_pagetable { struct iommufd_object obj; - struct iommufd_ioas *ioas; struct iommu_domain *domain; +}; + +struct iommufd_hwpt_paging { + struct iommufd_hw_pagetable common; + struct iommufd_ioas *ioas; bool auto_domain : 1; bool enforce_cache_coherency : 1; bool msi_cookie : 1; + bool nest_parent : 1; /* Head at iommufd_ioas::hwpt_list */ struct list_head hwpt_item; }; -struct iommufd_hw_pagetable * -iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, - struct iommufd_device *idev, bool immediate_attach); -int iommufd_hw_pagetable_enforce_cc(struct iommufd_hw_pagetable *hwpt); +struct iommufd_hwpt_nested { + struct iommufd_hw_pagetable common; + struct iommufd_hwpt_paging *parent; +}; + +static inline bool hwpt_is_paging(struct iommufd_hw_pagetable *hwpt) +{ + return hwpt->obj.type == IOMMUFD_OBJ_HWPT_PAGING; +} + +static inline struct iommufd_hwpt_paging * +to_hwpt_paging(struct iommufd_hw_pagetable *hwpt) +{ + return container_of(hwpt, struct iommufd_hwpt_paging, common); +} + +static inline struct iommufd_hwpt_paging * +iommufd_get_hwpt_paging(struct iommufd_ucmd *ucmd, u32 id) +{ + return container_of(iommufd_get_object(ucmd->ictx, id, + IOMMUFD_OBJ_HWPT_PAGING), + struct iommufd_hwpt_paging, common.obj); +} +int iommufd_hwpt_set_dirty_tracking(struct iommufd_ucmd *ucmd); +int iommufd_hwpt_get_dirty_bitmap(struct iommufd_ucmd *ucmd); + +struct iommufd_hwpt_paging * +iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, + struct iommufd_device *idev, u32 flags, + bool immediate_attach, + const struct iommu_user_data *user_data); int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, struct iommufd_device *idev); struct iommufd_hw_pagetable * iommufd_hw_pagetable_detach(struct iommufd_device *idev); -void iommufd_hw_pagetable_destroy(struct iommufd_object *obj); -void iommufd_hw_pagetable_abort(struct iommufd_object *obj); +void iommufd_hwpt_paging_destroy(struct iommufd_object *obj); +void iommufd_hwpt_paging_abort(struct iommufd_object *obj); +void iommufd_hwpt_nested_destroy(struct iommufd_object *obj); +void iommufd_hwpt_nested_abort(struct iommufd_object *obj); int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd); static inline void iommufd_hw_pagetable_put(struct iommufd_ctx *ictx, struct iommufd_hw_pagetable *hwpt) { - lockdep_assert_not_held(&hwpt->ioas->mutex); - if (hwpt->auto_domain) - iommufd_object_deref_user(ictx, &hwpt->obj); - else - refcount_dec(&hwpt->obj.users); + if (hwpt->obj.type == IOMMUFD_OBJ_HWPT_PAGING) { + struct iommufd_hwpt_paging *hwpt_paging = to_hwpt_paging(hwpt); + + lockdep_assert_not_held(&hwpt_paging->ioas->mutex); + + if (hwpt_paging->auto_domain) { + iommufd_object_deref_user(ictx, &hwpt->obj); + return; + } + } + refcount_dec(&hwpt->obj.users); } struct iommufd_group { diff --git a/drivers/iommu/iommufd/iommufd_test.h b/drivers/iommu/iommufd/iommufd_test.h index 3f3644375bf1..7910fbe1962d 100644 --- a/drivers/iommu/iommufd/iommufd_test.h +++ b/drivers/iommu/iommufd/iommufd_test.h @@ -19,6 +19,8 @@ enum { IOMMU_TEST_OP_SET_TEMP_MEMORY_LIMIT, IOMMU_TEST_OP_MOCK_DOMAIN_REPLACE, IOMMU_TEST_OP_ACCESS_REPLACE_IOAS, + IOMMU_TEST_OP_MOCK_DOMAIN_FLAGS, + IOMMU_TEST_OP_DIRTY, }; enum { @@ -40,6 +42,15 @@ enum { MOCK_FLAGS_ACCESS_CREATE_NEEDS_PIN_PAGES = 1 << 0, }; +enum { + MOCK_FLAGS_DEVICE_NO_DIRTY = 1 << 0, +}; + +enum { + MOCK_NESTED_DOMAIN_IOTLB_ID_MAX = 3, + MOCK_NESTED_DOMAIN_IOTLB_NUM = 4, +}; + struct iommu_test_cmd { __u32 size; __u32 op; @@ -57,6 +68,13 @@ struct iommu_test_cmd { __u32 out_idev_id; } mock_domain; struct { + __u32 out_stdev_id; + __u32 out_hwpt_id; + __u32 out_idev_id; + /* Expand mock_domain to set mock device flags */ + __u32 dev_flags; + } mock_domain_flags; + struct { __u32 pt_id; } mock_domain_replace; struct { @@ -95,6 +113,14 @@ struct iommu_test_cmd { struct { __u32 ioas_id; } access_replace_ioas; + struct { + __u32 flags; + __aligned_u64 iova; + __aligned_u64 length; + __aligned_u64 page_size; + __aligned_u64 uptr; + __aligned_u64 out_nr_dirty; + } dirty; }; __u32 last; }; @@ -109,4 +135,17 @@ struct iommu_test_hw_info { __u32 test_reg; }; +/* Should not be equal to any defined value in enum iommu_hwpt_data_type */ +#define IOMMU_HWPT_DATA_SELFTEST 0xdead +#define IOMMU_TEST_IOTLB_DEFAULT 0xbadbeef + +/** + * struct iommu_hwpt_selftest + * + * @iotlb: default mock iotlb value, IOMMU_TEST_IOTLB_DEFAULT + */ +struct iommu_hwpt_selftest { + __u32 iotlb; +}; + #endif diff --git a/drivers/vfio/iova_bitmap.c b/drivers/iommu/iommufd/iova_bitmap.c index 0848f920efb7..0a92c9eeaf7f 100644 --- a/drivers/vfio/iova_bitmap.c +++ b/drivers/iommu/iommufd/iova_bitmap.c @@ -268,6 +268,7 @@ err: iova_bitmap_free(bitmap); return ERR_PTR(rc); } +EXPORT_SYMBOL_NS_GPL(iova_bitmap_alloc, IOMMUFD); /** * iova_bitmap_free() - Frees an IOVA bitmap object @@ -289,6 +290,7 @@ void iova_bitmap_free(struct iova_bitmap *bitmap) kfree(bitmap); } +EXPORT_SYMBOL_NS_GPL(iova_bitmap_free, IOMMUFD); /* * Returns the remaining bitmap indexes from mapped_total_index to process for @@ -387,6 +389,7 @@ int iova_bitmap_for_each(struct iova_bitmap *bitmap, void *opaque, return ret; } +EXPORT_SYMBOL_NS_GPL(iova_bitmap_for_each, IOMMUFD); /** * iova_bitmap_set() - Records an IOVA range in bitmap @@ -420,4 +423,4 @@ void iova_bitmap_set(struct iova_bitmap *bitmap, cur_bit += nbits; } while (cur_bit <= last_bit); } -EXPORT_SYMBOL_GPL(iova_bitmap_set); +EXPORT_SYMBOL_NS_GPL(iova_bitmap_set, IOMMUFD); diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index e71523cbd0de..45b9d40773b1 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -307,6 +307,8 @@ union ucmd_buffer { struct iommu_destroy destroy; struct iommu_hw_info info; struct iommu_hwpt_alloc hwpt; + struct iommu_hwpt_get_dirty_bitmap get_dirty_bitmap; + struct iommu_hwpt_set_dirty_tracking set_dirty_tracking; struct iommu_ioas_alloc alloc; struct iommu_ioas_allow_iovas allow_iovas; struct iommu_ioas_copy ioas_copy; @@ -342,6 +344,10 @@ static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = { __reserved), IOCTL_OP(IOMMU_HWPT_ALLOC, iommufd_hwpt_alloc, struct iommu_hwpt_alloc, __reserved), + IOCTL_OP(IOMMU_HWPT_GET_DIRTY_BITMAP, iommufd_hwpt_get_dirty_bitmap, + struct iommu_hwpt_get_dirty_bitmap, data), + IOCTL_OP(IOMMU_HWPT_SET_DIRTY_TRACKING, iommufd_hwpt_set_dirty_tracking, + struct iommu_hwpt_set_dirty_tracking, __reserved), IOCTL_OP(IOMMU_IOAS_ALLOC, iommufd_ioas_alloc_ioctl, struct iommu_ioas_alloc, out_ioas_id), IOCTL_OP(IOMMU_IOAS_ALLOW_IOVAS, iommufd_ioas_allow_iovas, @@ -482,9 +488,13 @@ static const struct iommufd_object_ops iommufd_object_ops[] = { [IOMMUFD_OBJ_IOAS] = { .destroy = iommufd_ioas_destroy, }, - [IOMMUFD_OBJ_HW_PAGETABLE] = { - .destroy = iommufd_hw_pagetable_destroy, - .abort = iommufd_hw_pagetable_abort, + [IOMMUFD_OBJ_HWPT_PAGING] = { + .destroy = iommufd_hwpt_paging_destroy, + .abort = iommufd_hwpt_paging_abort, + }, + [IOMMUFD_OBJ_HWPT_NESTED] = { + .destroy = iommufd_hwpt_nested_destroy, + .abort = iommufd_hwpt_nested_abort, }, #ifdef CONFIG_IOMMUFD_TEST [IOMMUFD_OBJ_SELFTEST] = { @@ -552,5 +562,6 @@ MODULE_ALIAS_MISCDEV(VFIO_MINOR); MODULE_ALIAS("devname:vfio/vfio"); #endif MODULE_IMPORT_NS(IOMMUFD_INTERNAL); +MODULE_IMPORT_NS(IOMMUFD); MODULE_DESCRIPTION("I/O Address Space Management for passthrough devices"); MODULE_LICENSE("GPL"); diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c index 8d9aa297c117..528f356238b3 100644 --- a/drivers/iommu/iommufd/pages.c +++ b/drivers/iommu/iommufd/pages.c @@ -1507,6 +1507,8 @@ void iopt_area_unfill_domains(struct iopt_area *area, struct iopt_pages *pages) area, domain, iopt_area_index(area), iopt_area_last_index(area)); + if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) + WARN_ON(RB_EMPTY_NODE(&area->pages_node.rb)); interval_tree_remove(&area->pages_node, &pages->domains_itree); iopt_area_unfill_domain(area, pages, area->storage_domain); area->storage_domain = NULL; diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index 56506d5753f1..d43a87737c1e 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -20,10 +20,13 @@ static DECLARE_FAULT_ATTR(fail_iommufd); static struct dentry *dbgfs_root; static struct platform_device *selftest_iommu_dev; +static const struct iommu_ops mock_ops; +static struct iommu_domain_ops domain_nested_ops; size_t iommufd_test_memory_limit = 65536; enum { + MOCK_DIRTY_TRACK = 1, MOCK_IO_PAGE_SIZE = PAGE_SIZE / 2, /* @@ -36,6 +39,7 @@ enum { _MOCK_PFN_START = MOCK_PFN_MASK + 1, MOCK_PFN_START_IOVA = _MOCK_PFN_START, MOCK_PFN_LAST_IOVA = _MOCK_PFN_START, + MOCK_PFN_DIRTY_IOVA = _MOCK_PFN_START << 1, }; /* @@ -86,16 +90,24 @@ void iommufd_test_syz_conv_iova_id(struct iommufd_ucmd *ucmd, } struct mock_iommu_domain { + unsigned long flags; struct iommu_domain domain; struct xarray pfns; }; +struct mock_iommu_domain_nested { + struct iommu_domain domain; + struct mock_iommu_domain *parent; + u32 iotlb[MOCK_NESTED_DOMAIN_IOTLB_NUM]; +}; + enum selftest_obj_type { TYPE_IDEV, }; struct mock_dev { struct device dev; + unsigned long flags; }; struct selftest_obj { @@ -118,6 +130,11 @@ static void mock_domain_blocking_free(struct iommu_domain *domain) static int mock_domain_nop_attach(struct iommu_domain *domain, struct device *dev) { + struct mock_dev *mdev = container_of(dev, struct mock_dev, dev); + + if (domain->dirty_ops && (mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY)) + return -EINVAL; + return 0; } @@ -146,15 +163,70 @@ static void *mock_domain_hw_info(struct device *dev, u32 *length, u32 *type) return info; } -static struct iommu_domain *mock_domain_alloc(unsigned int iommu_domain_type) +static int mock_domain_set_dirty_tracking(struct iommu_domain *domain, + bool enable) { - struct mock_iommu_domain *mock; + struct mock_iommu_domain *mock = + container_of(domain, struct mock_iommu_domain, domain); + unsigned long flags = mock->flags; - if (iommu_domain_type == IOMMU_DOMAIN_BLOCKED) - return &mock_blocking_domain; + if (enable && !domain->dirty_ops) + return -EINVAL; - if (iommu_domain_type != IOMMU_DOMAIN_UNMANAGED) - return NULL; + /* No change? */ + if (!(enable ^ !!(flags & MOCK_DIRTY_TRACK))) + return 0; + + flags = (enable ? flags | MOCK_DIRTY_TRACK : flags & ~MOCK_DIRTY_TRACK); + + mock->flags = flags; + return 0; +} + +static int mock_domain_read_and_clear_dirty(struct iommu_domain *domain, + unsigned long iova, size_t size, + unsigned long flags, + struct iommu_dirty_bitmap *dirty) +{ + struct mock_iommu_domain *mock = + container_of(domain, struct mock_iommu_domain, domain); + unsigned long i, max = size / MOCK_IO_PAGE_SIZE; + void *ent, *old; + + if (!(mock->flags & MOCK_DIRTY_TRACK) && dirty->bitmap) + return -EINVAL; + + for (i = 0; i < max; i++) { + unsigned long cur = iova + i * MOCK_IO_PAGE_SIZE; + + ent = xa_load(&mock->pfns, cur / MOCK_IO_PAGE_SIZE); + if (ent && (xa_to_value(ent) & MOCK_PFN_DIRTY_IOVA)) { + /* Clear dirty */ + if (!(flags & IOMMU_DIRTY_NO_CLEAR)) { + unsigned long val; + + val = xa_to_value(ent) & ~MOCK_PFN_DIRTY_IOVA; + old = xa_store(&mock->pfns, + cur / MOCK_IO_PAGE_SIZE, + xa_mk_value(val), GFP_KERNEL); + WARN_ON_ONCE(ent != old); + } + iommu_dirty_bitmap_record(dirty, cur, + MOCK_IO_PAGE_SIZE); + } + } + + return 0; +} + +const struct iommu_dirty_ops dirty_ops = { + .set_dirty_tracking = mock_domain_set_dirty_tracking, + .read_and_clear_dirty = mock_domain_read_and_clear_dirty, +}; + +static struct iommu_domain *mock_domain_alloc_paging(struct device *dev) +{ + struct mock_iommu_domain *mock; mock = kzalloc(sizeof(*mock), GFP_KERNEL); if (!mock) @@ -162,10 +234,87 @@ static struct iommu_domain *mock_domain_alloc(unsigned int iommu_domain_type) mock->domain.geometry.aperture_start = MOCK_APERTURE_START; mock->domain.geometry.aperture_end = MOCK_APERTURE_LAST; mock->domain.pgsize_bitmap = MOCK_IO_PAGE_SIZE; + mock->domain.ops = mock_ops.default_domain_ops; + mock->domain.type = IOMMU_DOMAIN_UNMANAGED; xa_init(&mock->pfns); return &mock->domain; } +static struct iommu_domain * +__mock_domain_alloc_nested(struct mock_iommu_domain *mock_parent, + const struct iommu_hwpt_selftest *user_cfg) +{ + struct mock_iommu_domain_nested *mock_nested; + int i; + + mock_nested = kzalloc(sizeof(*mock_nested), GFP_KERNEL); + if (!mock_nested) + return ERR_PTR(-ENOMEM); + mock_nested->parent = mock_parent; + mock_nested->domain.ops = &domain_nested_ops; + mock_nested->domain.type = IOMMU_DOMAIN_NESTED; + for (i = 0; i < MOCK_NESTED_DOMAIN_IOTLB_NUM; i++) + mock_nested->iotlb[i] = user_cfg->iotlb; + return &mock_nested->domain; +} + +static struct iommu_domain *mock_domain_alloc(unsigned int iommu_domain_type) +{ + if (iommu_domain_type == IOMMU_DOMAIN_BLOCKED) + return &mock_blocking_domain; + if (iommu_domain_type == IOMMU_DOMAIN_UNMANAGED) + return mock_domain_alloc_paging(NULL); + return NULL; +} + +static struct iommu_domain * +mock_domain_alloc_user(struct device *dev, u32 flags, + struct iommu_domain *parent, + const struct iommu_user_data *user_data) +{ + struct mock_iommu_domain *mock_parent; + struct iommu_hwpt_selftest user_cfg; + int rc; + + /* must be mock_domain */ + if (!parent) { + struct mock_dev *mdev = container_of(dev, struct mock_dev, dev); + bool has_dirty_flag = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; + bool no_dirty_ops = mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY; + struct iommu_domain *domain; + + if (flags & (~(IOMMU_HWPT_ALLOC_NEST_PARENT | + IOMMU_HWPT_ALLOC_DIRTY_TRACKING))) + return ERR_PTR(-EOPNOTSUPP); + if (user_data || (has_dirty_flag && no_dirty_ops)) + return ERR_PTR(-EOPNOTSUPP); + domain = mock_domain_alloc_paging(NULL); + if (!domain) + return ERR_PTR(-ENOMEM); + if (has_dirty_flag) + container_of(domain, struct mock_iommu_domain, domain) + ->domain.dirty_ops = &dirty_ops; + return domain; + } + + /* must be mock_domain_nested */ + if (user_data->type != IOMMU_HWPT_DATA_SELFTEST || flags) + return ERR_PTR(-EOPNOTSUPP); + if (!parent || parent->ops != mock_ops.default_domain_ops) + return ERR_PTR(-EINVAL); + + mock_parent = container_of(parent, struct mock_iommu_domain, domain); + if (!mock_parent) + return ERR_PTR(-EINVAL); + + rc = iommu_copy_struct_from_user(&user_cfg, user_data, + IOMMU_HWPT_DATA_SELFTEST, iotlb); + if (rc) + return ERR_PTR(rc); + + return __mock_domain_alloc_nested(mock_parent, &user_cfg); +} + static void mock_domain_free(struct iommu_domain *domain) { struct mock_iommu_domain *mock = @@ -243,7 +392,7 @@ static size_t mock_domain_unmap_pages(struct iommu_domain *domain, for (cur = 0; cur != pgsize; cur += MOCK_IO_PAGE_SIZE) { ent = xa_erase(&mock->pfns, iova / MOCK_IO_PAGE_SIZE); - WARN_ON(!ent); + /* * iommufd generates unmaps that must be a strict * superset of the map's performend So every starting @@ -253,13 +402,13 @@ static size_t mock_domain_unmap_pages(struct iommu_domain *domain, * passed to map_pages */ if (first) { - WARN_ON(!(xa_to_value(ent) & - MOCK_PFN_START_IOVA)); + WARN_ON(ent && !(xa_to_value(ent) & + MOCK_PFN_START_IOVA)); first = false; } if (pgcount == 1 && cur + MOCK_IO_PAGE_SIZE == pgsize) - WARN_ON(!(xa_to_value(ent) & - MOCK_PFN_LAST_IOVA)); + WARN_ON(ent && !(xa_to_value(ent) & + MOCK_PFN_LAST_IOVA)); iova += MOCK_IO_PAGE_SIZE; ret += MOCK_IO_PAGE_SIZE; @@ -283,7 +432,18 @@ static phys_addr_t mock_domain_iova_to_phys(struct iommu_domain *domain, static bool mock_domain_capable(struct device *dev, enum iommu_cap cap) { - return cap == IOMMU_CAP_CACHE_COHERENCY; + struct mock_dev *mdev = container_of(dev, struct mock_dev, dev); + + switch (cap) { + case IOMMU_CAP_CACHE_COHERENCY: + return true; + case IOMMU_CAP_DIRTY_TRACKING: + return !(mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY); + default: + break; + } + + return false; } static void mock_domain_set_plaform_dma_ops(struct device *dev) @@ -307,6 +467,7 @@ static const struct iommu_ops mock_ops = { .pgsize_bitmap = MOCK_IO_PAGE_SIZE, .hw_info = mock_domain_hw_info, .domain_alloc = mock_domain_alloc, + .domain_alloc_user = mock_domain_alloc_user, .capable = mock_domain_capable, .set_platform_dma_ops = mock_domain_set_plaform_dma_ops, .device_group = generic_device_group, @@ -321,19 +482,41 @@ static const struct iommu_ops mock_ops = { }, }; +static void mock_domain_free_nested(struct iommu_domain *domain) +{ + struct mock_iommu_domain_nested *mock_nested = + container_of(domain, struct mock_iommu_domain_nested, domain); + + kfree(mock_nested); +} + +static struct iommu_domain_ops domain_nested_ops = { + .free = mock_domain_free_nested, + .attach_dev = mock_domain_nop_attach, +}; + static inline struct iommufd_hw_pagetable * -get_md_pagetable(struct iommufd_ucmd *ucmd, u32 mockpt_id, - struct mock_iommu_domain **mock) +__get_md_pagetable(struct iommufd_ucmd *ucmd, u32 mockpt_id, u32 hwpt_type) { - struct iommufd_hw_pagetable *hwpt; struct iommufd_object *obj; - obj = iommufd_get_object(ucmd->ictx, mockpt_id, - IOMMUFD_OBJ_HW_PAGETABLE); + obj = iommufd_get_object(ucmd->ictx, mockpt_id, hwpt_type); if (IS_ERR(obj)) return ERR_CAST(obj); - hwpt = container_of(obj, struct iommufd_hw_pagetable, obj); - if (hwpt->domain->ops != mock_ops.default_domain_ops) { + return container_of(obj, struct iommufd_hw_pagetable, obj); +} + +static inline struct iommufd_hw_pagetable * +get_md_pagetable(struct iommufd_ucmd *ucmd, u32 mockpt_id, + struct mock_iommu_domain **mock) +{ + struct iommufd_hw_pagetable *hwpt; + + hwpt = __get_md_pagetable(ucmd, mockpt_id, IOMMUFD_OBJ_HWPT_PAGING); + if (IS_ERR(hwpt)) + return hwpt; + if (hwpt->domain->type != IOMMU_DOMAIN_UNMANAGED || + hwpt->domain->ops != mock_ops.default_domain_ops) { iommufd_put_object(&hwpt->obj); return ERR_PTR(-EINVAL); } @@ -341,6 +524,25 @@ get_md_pagetable(struct iommufd_ucmd *ucmd, u32 mockpt_id, return hwpt; } +static inline struct iommufd_hw_pagetable * +get_md_pagetable_nested(struct iommufd_ucmd *ucmd, u32 mockpt_id, + struct mock_iommu_domain_nested **mock_nested) +{ + struct iommufd_hw_pagetable *hwpt; + + hwpt = __get_md_pagetable(ucmd, mockpt_id, IOMMUFD_OBJ_HWPT_NESTED); + if (IS_ERR(hwpt)) + return hwpt; + if (hwpt->domain->type != IOMMU_DOMAIN_NESTED || + hwpt->domain->ops != &domain_nested_ops) { + iommufd_put_object(&hwpt->obj); + return ERR_PTR(-EINVAL); + } + *mock_nested = container_of(hwpt->domain, + struct mock_iommu_domain_nested, domain); + return hwpt; +} + struct mock_bus_type { struct bus_type bus; struct notifier_block nb; @@ -362,16 +564,20 @@ static void mock_dev_release(struct device *dev) kfree(mdev); } -static struct mock_dev *mock_dev_create(void) +static struct mock_dev *mock_dev_create(unsigned long dev_flags) { struct mock_dev *mdev; int rc; + if (dev_flags & ~(MOCK_FLAGS_DEVICE_NO_DIRTY)) + return ERR_PTR(-EINVAL); + mdev = kzalloc(sizeof(*mdev), GFP_KERNEL); if (!mdev) return ERR_PTR(-ENOMEM); device_initialize(&mdev->dev); + mdev->flags = dev_flags; mdev->dev.release = mock_dev_release; mdev->dev.bus = &iommufd_mock_bus_type.bus; @@ -407,6 +613,7 @@ static int iommufd_test_mock_domain(struct iommufd_ucmd *ucmd, struct iommufd_device *idev; struct selftest_obj *sobj; u32 pt_id = cmd->id; + u32 dev_flags = 0; u32 idev_id; int rc; @@ -417,7 +624,10 @@ static int iommufd_test_mock_domain(struct iommufd_ucmd *ucmd, sobj->idev.ictx = ucmd->ictx; sobj->type = TYPE_IDEV; - sobj->idev.mock_dev = mock_dev_create(); + if (cmd->op == IOMMU_TEST_OP_MOCK_DOMAIN_FLAGS) + dev_flags = cmd->mock_domain_flags.dev_flags; + + sobj->idev.mock_dev = mock_dev_create(dev_flags); if (IS_ERR(sobj->idev.mock_dev)) { rc = PTR_ERR(sobj->idev.mock_dev); goto out_sobj; @@ -977,6 +1187,73 @@ static_assert((unsigned int)MOCK_ACCESS_RW_WRITE == IOMMUFD_ACCESS_RW_WRITE); static_assert((unsigned int)MOCK_ACCESS_RW_SLOW_PATH == __IOMMUFD_ACCESS_RW_SLOW_PATH); +static int iommufd_test_dirty(struct iommufd_ucmd *ucmd, unsigned int mockpt_id, + unsigned long iova, size_t length, + unsigned long page_size, void __user *uptr, + u32 flags) +{ + unsigned long bitmap_size, i, max; + struct iommu_test_cmd *cmd = ucmd->cmd; + struct iommufd_hw_pagetable *hwpt; + struct mock_iommu_domain *mock; + int rc, count = 0; + void *tmp; + + if (!page_size || !length || iova % page_size || length % page_size || + !uptr) + return -EINVAL; + + hwpt = get_md_pagetable(ucmd, mockpt_id, &mock); + if (IS_ERR(hwpt)) + return PTR_ERR(hwpt); + + if (!(mock->flags & MOCK_DIRTY_TRACK)) { + rc = -EINVAL; + goto out_put; + } + + max = length / page_size; + bitmap_size = max / BITS_PER_BYTE; + + tmp = kvzalloc(bitmap_size, GFP_KERNEL_ACCOUNT); + if (!tmp) { + rc = -ENOMEM; + goto out_put; + } + + if (copy_from_user(tmp, uptr, bitmap_size)) { + rc = -EFAULT; + goto out_free; + } + + for (i = 0; i < max; i++) { + unsigned long cur = iova + i * page_size; + void *ent, *old; + + if (!test_bit(i, (unsigned long *)tmp)) + continue; + + ent = xa_load(&mock->pfns, cur / page_size); + if (ent) { + unsigned long val; + + val = xa_to_value(ent) | MOCK_PFN_DIRTY_IOVA; + old = xa_store(&mock->pfns, cur / page_size, + xa_mk_value(val), GFP_KERNEL); + WARN_ON_ONCE(ent != old); + count++; + } + } + + cmd->dirty.out_nr_dirty = count; + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); +out_free: + kvfree(tmp); +out_put: + iommufd_put_object(&hwpt->obj); + return rc; +} + void iommufd_selftest_destroy(struct iommufd_object *obj) { struct selftest_obj *sobj = container_of(obj, struct selftest_obj, obj); @@ -1000,6 +1277,7 @@ int iommufd_test(struct iommufd_ucmd *ucmd) cmd->add_reserved.start, cmd->add_reserved.length); case IOMMU_TEST_OP_MOCK_DOMAIN: + case IOMMU_TEST_OP_MOCK_DOMAIN_FLAGS: return iommufd_test_mock_domain(ucmd, cmd); case IOMMU_TEST_OP_MOCK_DOMAIN_REPLACE: return iommufd_test_mock_domain_replace( @@ -1041,6 +1319,12 @@ int iommufd_test(struct iommufd_ucmd *ucmd) return -EINVAL; iommufd_test_memory_limit = cmd->memory_limit.limit; return 0; + case IOMMU_TEST_OP_DIRTY: + return iommufd_test_dirty(ucmd, cmd->id, cmd->dirty.iova, + cmd->dirty.length, + cmd->dirty.page_size, + u64_to_user_ptr(cmd->dirty.uptr), + cmd->dirty.flags); default: return -EOPNOTSUPP; } diff --git a/drivers/iommu/iommufd/vfio_compat.c b/drivers/iommu/iommufd/vfio_compat.c index 6c810bf80f99..538fbf76354d 100644 --- a/drivers/iommu/iommufd/vfio_compat.c +++ b/drivers/iommu/iommufd/vfio_compat.c @@ -255,7 +255,7 @@ err_put: static int iommufd_vfio_cc_iommu(struct iommufd_ctx *ictx) { - struct iommufd_hw_pagetable *hwpt; + struct iommufd_hwpt_paging *hwpt_paging; struct iommufd_ioas *ioas; int rc = 1; @@ -264,8 +264,8 @@ static int iommufd_vfio_cc_iommu(struct iommufd_ctx *ictx) return PTR_ERR(ioas); mutex_lock(&ioas->mutex); - list_for_each_entry(hwpt, &ioas->hwpt_list, hwpt_item) { - if (!hwpt->enforce_cache_coherency) { + list_for_each_entry(hwpt_paging, &ioas->hwpt_list, hwpt_item) { + if (!hwpt_paging->enforce_cache_coherency) { rc = 0; break; } diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile index c82ea032d352..68c05705200f 100644 --- a/drivers/vfio/Makefile +++ b/drivers/vfio/Makefile @@ -1,8 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_VFIO) += vfio.o -vfio-y += vfio_main.o \ - iova_bitmap.o +vfio-y += vfio_main.o vfio-$(CONFIG_VFIO_DEVICE_CDEV) += device_cdev.o vfio-$(CONFIG_VFIO_GROUP) += group.o vfio-$(CONFIG_IOMMUFD) += iommufd.o diff --git a/drivers/vfio/pci/mlx5/Kconfig b/drivers/vfio/pci/mlx5/Kconfig index 7088edc4fb28..c3ced56b7787 100644 --- a/drivers/vfio/pci/mlx5/Kconfig +++ b/drivers/vfio/pci/mlx5/Kconfig @@ -3,6 +3,7 @@ config MLX5_VFIO_PCI tristate "VFIO support for MLX5 PCI devices" depends on MLX5_CORE select VFIO_PCI_CORE + select IOMMUFD_DRIVER help This provides migration support for MLX5 devices using the VFIO framework. diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index b6ac66c5008d..fe09a8c8af95 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -1517,6 +1517,7 @@ static struct pci_driver mlx5vf_pci_driver = { module_pci_driver(mlx5vf_pci_driver); +MODULE_IMPORT_NS(IOMMUFD); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Max Gurtovoy <mgurtovoy@nvidia.com>"); MODULE_AUTHOR("Yishai Hadas <yishaih@nvidia.com>"); diff --git a/drivers/vfio/pci/pds/Kconfig b/drivers/vfio/pci/pds/Kconfig index 6eceef7b028a..fec9b167c7b9 100644 --- a/drivers/vfio/pci/pds/Kconfig +++ b/drivers/vfio/pci/pds/Kconfig @@ -5,6 +5,7 @@ config PDS_VFIO_PCI tristate "VFIO support for PDS PCI devices" depends on PDS_CORE && PCI_IOV select VFIO_PCI_CORE + select IOMMUFD_DRIVER help This provides generic PCI support for PDS devices using the VFIO framework. diff --git a/drivers/vfio/pci/pds/pci_drv.c b/drivers/vfio/pci/pds/pci_drv.c index ab4b5958e413..dd8c00c895a2 100644 --- a/drivers/vfio/pci/pds/pci_drv.c +++ b/drivers/vfio/pci/pds/pci_drv.c @@ -204,6 +204,7 @@ static struct pci_driver pds_vfio_pci_driver = { module_pci_driver(pds_vfio_pci_driver); +MODULE_IMPORT_NS(IOMMUFD); MODULE_DESCRIPTION(PDS_VFIO_DRV_DESCRIPTION); MODULE_AUTHOR("Brett Creeley <brett.creeley@amd.com>"); MODULE_LICENSE("GPL"); diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index e31e1952d7b8..8d4995ada74a 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -1703,6 +1703,7 @@ static void __exit vfio_cleanup(void) module_init(vfio_init); module_exit(vfio_cleanup); +MODULE_IMPORT_NS(IOMMUFD); MODULE_VERSION(DRIVER_VERSION); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR(DRIVER_AUTHOR); diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h index 1b7a44b35616..25142a0e2fc2 100644 --- a/include/linux/io-pgtable.h +++ b/include/linux/io-pgtable.h @@ -166,6 +166,10 @@ struct io_pgtable_ops { struct iommu_iotlb_gather *gather); phys_addr_t (*iova_to_phys)(struct io_pgtable_ops *ops, unsigned long iova); + int (*read_and_clear_dirty)(struct io_pgtable_ops *ops, + unsigned long iova, size_t size, + unsigned long flags, + struct iommu_dirty_bitmap *dirty); }; /** diff --git a/include/linux/iommu.h b/include/linux/iommu.h index c50a769d569a..8fb1b41b4d15 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -13,6 +13,7 @@ #include <linux/errno.h> #include <linux/err.h> #include <linux/of.h> +#include <linux/iova_bitmap.h> #include <uapi/linux/iommu.h> #define IOMMU_READ (1 << 0) @@ -37,6 +38,7 @@ struct bus_type; struct device; struct iommu_domain; struct iommu_domain_ops; +struct iommu_dirty_ops; struct notifier_block; struct iommu_sva; struct iommu_fault_event; @@ -65,6 +67,9 @@ struct iommu_domain_geometry { #define __IOMMU_DOMAIN_SVA (1U << 4) /* Shared process address space */ +#define __IOMMU_DOMAIN_NESTED (1U << 6) /* User-managed address space nested + on a stage-2 translation */ + #define IOMMU_DOMAIN_ALLOC_FLAGS ~__IOMMU_DOMAIN_DMA_FQ /* * This are the possible domain-types @@ -91,10 +96,13 @@ struct iommu_domain_geometry { __IOMMU_DOMAIN_DMA_API | \ __IOMMU_DOMAIN_DMA_FQ) #define IOMMU_DOMAIN_SVA (__IOMMU_DOMAIN_SVA) +#define IOMMU_DOMAIN_NESTED (__IOMMU_DOMAIN_NESTED) struct iommu_domain { unsigned type; const struct iommu_domain_ops *ops; + const struct iommu_dirty_ops *dirty_ops; + unsigned long pgsize_bitmap; /* Bitmap of page sizes in use */ struct iommu_domain_geometry geometry; struct iommu_dma_cookie *iova_cookie; @@ -133,6 +141,7 @@ enum iommu_cap { * usefully support the non-strict DMA flush queue. */ IOMMU_CAP_DEFERRED_FLUSH, + IOMMU_CAP_DIRTY_TRACKING, /* IOMMU supports dirty tracking */ }; /* These are the possible reserved region types */ @@ -228,13 +237,109 @@ struct iommu_iotlb_gather { }; /** + * struct iommu_dirty_bitmap - Dirty IOVA bitmap state + * @bitmap: IOVA bitmap + * @gather: Range information for a pending IOTLB flush + */ +struct iommu_dirty_bitmap { + struct iova_bitmap *bitmap; + struct iommu_iotlb_gather *gather; +}; + +/* Read but do not clear any dirty bits */ +#define IOMMU_DIRTY_NO_CLEAR (1 << 0) + +/** + * struct iommu_dirty_ops - domain specific dirty tracking operations + * @set_dirty_tracking: Enable or Disable dirty tracking on the iommu domain + * @read_and_clear_dirty: Walk IOMMU page tables for dirtied PTEs marshalled + * into a bitmap, with a bit represented as a page. + * Reads the dirty PTE bits and clears it from IO + * pagetables. + */ +struct iommu_dirty_ops { + int (*set_dirty_tracking)(struct iommu_domain *domain, bool enabled); + int (*read_and_clear_dirty)(struct iommu_domain *domain, + unsigned long iova, size_t size, + unsigned long flags, + struct iommu_dirty_bitmap *dirty); +}; + +/** + * struct iommu_user_data - iommu driver specific user space data info + * @type: The data type of the user buffer + * @uptr: Pointer to the user buffer for copy_from_user() + * @len: The length of the user buffer in bytes + * + * A user space data is an uAPI that is defined in include/uapi/linux/iommufd.h + * @type, @uptr and @len should be just copied from an iommufd core uAPI struct. + */ +struct iommu_user_data { + unsigned int type; + void __user *uptr; + size_t len; +}; + +/** + * __iommu_copy_struct_from_user - Copy iommu driver specific user space data + * @dst_data: Pointer to an iommu driver specific user data that is defined in + * include/uapi/linux/iommufd.h + * @src_data: Pointer to a struct iommu_user_data for user space data info + * @data_type: The data type of the @dst_data. Must match with @src_data.type + * @data_len: Length of current user data structure, i.e. sizeof(struct _dst) + * @min_len: Initial length of user data structure for backward compatibility. + * This should be offsetofend using the last member in the user data + * struct that was initially added to include/uapi/linux/iommufd.h + */ +static inline int __iommu_copy_struct_from_user( + void *dst_data, const struct iommu_user_data *src_data, + unsigned int data_type, size_t data_len, size_t min_len) +{ + if (src_data->type != data_type) + return -EINVAL; + if (WARN_ON(!dst_data || !src_data)) + return -EINVAL; + if (src_data->len < min_len || data_len < src_data->len) + return -EINVAL; + return copy_struct_from_user(dst_data, data_len, src_data->uptr, + src_data->len); +} + +/** + * iommu_copy_struct_from_user - Copy iommu driver specific user space data + * @kdst: Pointer to an iommu driver specific user data that is defined in + * include/uapi/linux/iommufd.h + * @user_data: Pointer to a struct iommu_user_data for user space data info + * @data_type: The data type of the @kdst. Must match with @user_data->type + * @min_last: The last memember of the data structure @kdst points in the + * initial version. + * Return 0 for success, otherwise -error. + */ +#define iommu_copy_struct_from_user(kdst, user_data, data_type, min_last) \ + __iommu_copy_struct_from_user(kdst, user_data, data_type, \ + sizeof(*kdst), \ + offsetofend(typeof(*kdst), min_last)) + +/** * struct iommu_ops - iommu ops and capabilities * @capable: check capability * @hw_info: report iommu hardware information. The data buffer returned by this * op is allocated in the iommu driver and freed by the caller after * use. The information type is one of enum iommu_hw_info_type defined * in include/uapi/linux/iommufd.h. - * @domain_alloc: allocate iommu domain + * @domain_alloc: allocate and return an iommu domain if success. Otherwise + * NULL is returned. The domain is not fully initialized until + * the caller iommu_domain_alloc() returns. + * @domain_alloc_user: Allocate an iommu domain corresponding to the input + * parameters as defined in include/uapi/linux/iommufd.h. + * Unlike @domain_alloc, it is called only by IOMMUFD and + * must fully initialize the new domain before return. + * Upon success, if the @user_data is valid and the @parent + * points to a kernel-managed domain, the new domain must be + * IOMMU_DOMAIN_NESTED type; otherwise, the @parent must be + * NULL while the @user_data can be optionally provided, the + * new domain must support __IOMMU_DOMAIN_PAGING. + * Upon failure, ERR_PTR must be returned. * @probe_device: Add device to iommu driver handling * @release_device: Remove device from iommu driver handling * @probe_finalize: Do final setup work after the device is added to an IOMMU @@ -267,6 +372,9 @@ struct iommu_ops { /* Domain allocation and freeing by the iommu driver */ struct iommu_domain *(*domain_alloc)(unsigned iommu_domain_type); + struct iommu_domain *(*domain_alloc_user)( + struct device *dev, u32 flags, struct iommu_domain *parent, + const struct iommu_user_data *user_data); struct iommu_device *(*probe_device)(struct device *dev); void (*release_device)(struct device *dev); @@ -632,6 +740,28 @@ static inline bool iommu_iotlb_gather_queued(struct iommu_iotlb_gather *gather) return gather && gather->queued; } +static inline void iommu_dirty_bitmap_init(struct iommu_dirty_bitmap *dirty, + struct iova_bitmap *bitmap, + struct iommu_iotlb_gather *gather) +{ + if (gather) + iommu_iotlb_gather_init(gather); + + dirty->bitmap = bitmap; + dirty->gather = gather; +} + +static inline void iommu_dirty_bitmap_record(struct iommu_dirty_bitmap *dirty, + unsigned long iova, + unsigned long length) +{ + if (dirty->bitmap) + iova_bitmap_set(dirty->bitmap, iova, length); + + if (dirty->gather) + iommu_iotlb_gather_add_range(dirty->gather, iova, length); +} + /* PCI device grouping function */ extern struct iommu_group *pci_device_group(struct device *dev); /* Generic device grouping function */ @@ -737,6 +867,8 @@ struct iommu_fwspec {}; struct iommu_device {}; struct iommu_fault_param {}; struct iommu_iotlb_gather {}; +struct iommu_dirty_bitmap {}; +struct iommu_dirty_ops {}; static inline bool iommu_present(const struct bus_type *bus) { @@ -969,6 +1101,18 @@ static inline bool iommu_iotlb_gather_queued(struct iommu_iotlb_gather *gather) return false; } +static inline void iommu_dirty_bitmap_init(struct iommu_dirty_bitmap *dirty, + struct iova_bitmap *bitmap, + struct iommu_iotlb_gather *gather) +{ +} + +static inline void iommu_dirty_bitmap_record(struct iommu_dirty_bitmap *dirty, + unsigned long iova, + unsigned long length) +{ +} + static inline void iommu_device_unregister(struct iommu_device *iommu) { } diff --git a/include/linux/iova_bitmap.h b/include/linux/iova_bitmap.h index c006cf0a25f3..1c338f5e5b7a 100644 --- a/include/linux/iova_bitmap.h +++ b/include/linux/iova_bitmap.h @@ -7,6 +7,7 @@ #define _IOVA_BITMAP_H_ #include <linux/types.h> +#include <linux/errno.h> struct iova_bitmap; @@ -14,6 +15,7 @@ typedef int (*iova_bitmap_fn_t)(struct iova_bitmap *bitmap, unsigned long iova, size_t length, void *opaque); +#if IS_ENABLED(CONFIG_IOMMUFD_DRIVER) struct iova_bitmap *iova_bitmap_alloc(unsigned long iova, size_t length, unsigned long page_size, u64 __user *data); @@ -22,5 +24,29 @@ int iova_bitmap_for_each(struct iova_bitmap *bitmap, void *opaque, iova_bitmap_fn_t fn); void iova_bitmap_set(struct iova_bitmap *bitmap, unsigned long iova, size_t length); +#else +static inline struct iova_bitmap *iova_bitmap_alloc(unsigned long iova, + size_t length, + unsigned long page_size, + u64 __user *data) +{ + return NULL; +} + +static inline void iova_bitmap_free(struct iova_bitmap *bitmap) +{ +} + +static inline int iova_bitmap_for_each(struct iova_bitmap *bitmap, void *opaque, + iova_bitmap_fn_t fn) +{ + return -EOPNOTSUPP; +} + +static inline void iova_bitmap_set(struct iova_bitmap *bitmap, + unsigned long iova, size_t length) +{ +} +#endif #endif diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index b4ba0c0cbab6..0b2bc6252e2c 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -47,6 +47,8 @@ enum { IOMMUFD_CMD_VFIO_IOAS, IOMMUFD_CMD_HWPT_ALLOC, IOMMUFD_CMD_GET_HW_INFO, + IOMMUFD_CMD_HWPT_SET_DIRTY_TRACKING, + IOMMUFD_CMD_HWPT_GET_DIRTY_BITMAP, }; /** @@ -348,19 +350,85 @@ struct iommu_vfio_ioas { #define IOMMU_VFIO_IOAS _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VFIO_IOAS) /** + * enum iommufd_hwpt_alloc_flags - Flags for HWPT allocation + * @IOMMU_HWPT_ALLOC_NEST_PARENT: If set, allocate a HWPT that can serve as + * the parent HWPT in a nesting configuration. + * @IOMMU_HWPT_ALLOC_DIRTY_TRACKING: Dirty tracking support for device IOMMU is + * enforced on device attachment + */ +enum iommufd_hwpt_alloc_flags { + IOMMU_HWPT_ALLOC_NEST_PARENT = 1 << 0, + IOMMU_HWPT_ALLOC_DIRTY_TRACKING = 1 << 1, +}; + +/** + * enum iommu_hwpt_vtd_s1_flags - Intel VT-d stage-1 page table + * entry attributes + * @IOMMU_VTD_S1_SRE: Supervisor request + * @IOMMU_VTD_S1_EAFE: Extended access enable + * @IOMMU_VTD_S1_WPE: Write protect enable + */ +enum iommu_hwpt_vtd_s1_flags { + IOMMU_VTD_S1_SRE = 1 << 0, + IOMMU_VTD_S1_EAFE = 1 << 1, + IOMMU_VTD_S1_WPE = 1 << 2, +}; + +/** + * struct iommu_hwpt_vtd_s1 - Intel VT-d stage-1 page table + * info (IOMMU_HWPT_DATA_VTD_S1) + * @flags: Combination of enum iommu_hwpt_vtd_s1_flags + * @pgtbl_addr: The base address of the stage-1 page table. + * @addr_width: The address width of the stage-1 page table + * @__reserved: Must be 0 + */ +struct iommu_hwpt_vtd_s1 { + __aligned_u64 flags; + __aligned_u64 pgtbl_addr; + __u32 addr_width; + __u32 __reserved; +}; + +/** + * enum iommu_hwpt_data_type - IOMMU HWPT Data Type + * @IOMMU_HWPT_DATA_NONE: no data + * @IOMMU_HWPT_DATA_VTD_S1: Intel VT-d stage-1 page table + */ +enum iommu_hwpt_data_type { + IOMMU_HWPT_DATA_NONE, + IOMMU_HWPT_DATA_VTD_S1, +}; + +/** * struct iommu_hwpt_alloc - ioctl(IOMMU_HWPT_ALLOC) * @size: sizeof(struct iommu_hwpt_alloc) - * @flags: Must be 0 + * @flags: Combination of enum iommufd_hwpt_alloc_flags * @dev_id: The device to allocate this HWPT for - * @pt_id: The IOAS to connect this HWPT to + * @pt_id: The IOAS or HWPT to connect this HWPT to * @out_hwpt_id: The ID of the new HWPT * @__reserved: Must be 0 + * @data_type: One of enum iommu_hwpt_data_type + * @data_len: Length of the type specific data + * @data_uptr: User pointer to the type specific data * * Explicitly allocate a hardware page table object. This is the same object * type that is returned by iommufd_device_attach() and represents the * underlying iommu driver's iommu_domain kernel object. * - * A HWPT will be created with the IOVA mappings from the given IOAS. + * A kernel-managed HWPT will be created with the mappings from the given + * IOAS via the @pt_id. The @data_type for this allocation must be set to + * IOMMU_HWPT_DATA_NONE. The HWPT can be allocated as a parent HWPT for a + * nesting configuration by passing IOMMU_HWPT_ALLOC_NEST_PARENT via @flags. + * + * A user-managed nested HWPT will be created from a given parent HWPT via + * @pt_id, in which the parent HWPT must be allocated previously via the + * same ioctl from a given IOAS (@pt_id). In this case, the @data_type + * must be set to a pre-defined type corresponding to an I/O page table + * type supported by the underlying IOMMU hardware. + * + * If the @data_type is set to IOMMU_HWPT_DATA_NONE, @data_len and + * @data_uptr should be zero. Otherwise, both @data_len and @data_uptr + * must be given. */ struct iommu_hwpt_alloc { __u32 size; @@ -369,13 +437,26 @@ struct iommu_hwpt_alloc { __u32 pt_id; __u32 out_hwpt_id; __u32 __reserved; + __u32 data_type; + __u32 data_len; + __aligned_u64 data_uptr; }; #define IOMMU_HWPT_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_HWPT_ALLOC) /** + * enum iommu_hw_info_vtd_flags - Flags for VT-d hw_info + * @IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17: If set, disallow read-only mappings + * on a nested_parent domain. + * https://www.intel.com/content/www/us/en/content-details/772415/content-details.html + */ +enum iommu_hw_info_vtd_flags { + IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17 = 1 << 0, +}; + +/** * struct iommu_hw_info_vtd - Intel VT-d hardware information * - * @flags: Must be 0 + * @flags: Combination of enum iommu_hw_info_vtd_flags * @__reserved: Must be 0 * * @cap_reg: Value of Intel VT-d capability register defined in VT-d spec @@ -405,6 +486,20 @@ enum iommu_hw_info_type { }; /** + * enum iommufd_hw_capabilities + * @IOMMU_HW_CAP_DIRTY_TRACKING: IOMMU hardware support for dirty tracking + * If available, it means the following APIs + * are supported: + * + * IOMMU_HWPT_GET_DIRTY_BITMAP + * IOMMU_HWPT_SET_DIRTY_TRACKING + * + */ +enum iommufd_hw_capabilities { + IOMMU_HW_CAP_DIRTY_TRACKING = 1 << 0, +}; + +/** * struct iommu_hw_info - ioctl(IOMMU_GET_HW_INFO) * @size: sizeof(struct iommu_hw_info) * @flags: Must be 0 @@ -415,6 +510,8 @@ enum iommu_hw_info_type { * the iommu type specific hardware information data * @out_data_type: Output the iommu hardware info type as defined in the enum * iommu_hw_info_type. + * @out_capabilities: Output the generic iommu capability info type as defined + * in the enum iommu_hw_capabilities. * @__reserved: Must be 0 * * Query an iommu type specific hardware information data from an iommu behind @@ -439,6 +536,81 @@ struct iommu_hw_info { __aligned_u64 data_uptr; __u32 out_data_type; __u32 __reserved; + __aligned_u64 out_capabilities; }; #define IOMMU_GET_HW_INFO _IO(IOMMUFD_TYPE, IOMMUFD_CMD_GET_HW_INFO) + +/* + * enum iommufd_hwpt_set_dirty_tracking_flags - Flags for steering dirty + * tracking + * @IOMMU_HWPT_DIRTY_TRACKING_ENABLE: Enable dirty tracking + */ +enum iommufd_hwpt_set_dirty_tracking_flags { + IOMMU_HWPT_DIRTY_TRACKING_ENABLE = 1, +}; + +/** + * struct iommu_hwpt_set_dirty_tracking - ioctl(IOMMU_HWPT_SET_DIRTY_TRACKING) + * @size: sizeof(struct iommu_hwpt_set_dirty_tracking) + * @flags: Combination of enum iommufd_hwpt_set_dirty_tracking_flags + * @hwpt_id: HW pagetable ID that represents the IOMMU domain + * @__reserved: Must be 0 + * + * Toggle dirty tracking on an HW pagetable. + */ +struct iommu_hwpt_set_dirty_tracking { + __u32 size; + __u32 flags; + __u32 hwpt_id; + __u32 __reserved; +}; +#define IOMMU_HWPT_SET_DIRTY_TRACKING _IO(IOMMUFD_TYPE, \ + IOMMUFD_CMD_HWPT_SET_DIRTY_TRACKING) + +/** + * enum iommufd_hwpt_get_dirty_bitmap_flags - Flags for getting dirty bits + * @IOMMU_HWPT_GET_DIRTY_BITMAP_NO_CLEAR: Just read the PTEs without clearing + * any dirty bits metadata. This flag + * can be passed in the expectation + * where the next operation is an unmap + * of the same IOVA range. + * + */ +enum iommufd_hwpt_get_dirty_bitmap_flags { + IOMMU_HWPT_GET_DIRTY_BITMAP_NO_CLEAR = 1, +}; + +/** + * struct iommu_hwpt_get_dirty_bitmap - ioctl(IOMMU_HWPT_GET_DIRTY_BITMAP) + * @size: sizeof(struct iommu_hwpt_get_dirty_bitmap) + * @hwpt_id: HW pagetable ID that represents the IOMMU domain + * @flags: Combination of enum iommufd_hwpt_get_dirty_bitmap_flags + * @__reserved: Must be 0 + * @iova: base IOVA of the bitmap first bit + * @length: IOVA range size + * @page_size: page size granularity of each bit in the bitmap + * @data: bitmap where to set the dirty bits. The bitmap bits each + * represent a page_size which you deviate from an arbitrary iova. + * + * Checking a given IOVA is dirty: + * + * data[(iova / page_size) / 64] & (1ULL << ((iova / page_size) % 64)) + * + * Walk the IOMMU pagetables for a given IOVA range to return a bitmap + * with the dirty IOVAs. In doing so it will also by default clear any + * dirty bit metadata set in the IOPTE. + */ +struct iommu_hwpt_get_dirty_bitmap { + __u32 size; + __u32 hwpt_id; + __u32 flags; + __u32 __reserved; + __aligned_u64 iova; + __aligned_u64 length; + __aligned_u64 page_size; + __aligned_u64 data; +}; +#define IOMMU_HWPT_GET_DIRTY_BITMAP _IO(IOMMUFD_TYPE, \ + IOMMUFD_CMD_HWPT_GET_DIRTY_BITMAP) + #endif diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index 33d08600be13..6ed328c863c4 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -86,12 +86,13 @@ TEST_F(iommufd, cmd_fail) TEST_F(iommufd, cmd_length) { -#define TEST_LENGTH(_struct, _ioctl) \ +#define TEST_LENGTH(_struct, _ioctl, _last) \ { \ + size_t min_size = offsetofend(struct _struct, _last); \ struct { \ struct _struct cmd; \ uint8_t extra; \ - } cmd = { .cmd = { .size = sizeof(struct _struct) - 1 }, \ + } cmd = { .cmd = { .size = min_size - 1 }, \ .extra = UINT8_MAX }; \ int old_errno; \ int rc; \ @@ -112,16 +113,19 @@ TEST_F(iommufd, cmd_length) } \ } - TEST_LENGTH(iommu_destroy, IOMMU_DESTROY); - TEST_LENGTH(iommu_hw_info, IOMMU_GET_HW_INFO); - TEST_LENGTH(iommu_ioas_alloc, IOMMU_IOAS_ALLOC); - TEST_LENGTH(iommu_ioas_iova_ranges, IOMMU_IOAS_IOVA_RANGES); - TEST_LENGTH(iommu_ioas_allow_iovas, IOMMU_IOAS_ALLOW_IOVAS); - TEST_LENGTH(iommu_ioas_map, IOMMU_IOAS_MAP); - TEST_LENGTH(iommu_ioas_copy, IOMMU_IOAS_COPY); - TEST_LENGTH(iommu_ioas_unmap, IOMMU_IOAS_UNMAP); - TEST_LENGTH(iommu_option, IOMMU_OPTION); - TEST_LENGTH(iommu_vfio_ioas, IOMMU_VFIO_IOAS); + TEST_LENGTH(iommu_destroy, IOMMU_DESTROY, id); + TEST_LENGTH(iommu_hw_info, IOMMU_GET_HW_INFO, __reserved); + TEST_LENGTH(iommu_hwpt_alloc, IOMMU_HWPT_ALLOC, __reserved); + TEST_LENGTH(iommu_ioas_alloc, IOMMU_IOAS_ALLOC, out_ioas_id); + TEST_LENGTH(iommu_ioas_iova_ranges, IOMMU_IOAS_IOVA_RANGES, + out_iova_alignment); + TEST_LENGTH(iommu_ioas_allow_iovas, IOMMU_IOAS_ALLOW_IOVAS, + allowed_iovas); + TEST_LENGTH(iommu_ioas_map, IOMMU_IOAS_MAP, iova); + TEST_LENGTH(iommu_ioas_copy, IOMMU_IOAS_COPY, src_iova); + TEST_LENGTH(iommu_ioas_unmap, IOMMU_IOAS_UNMAP, length); + TEST_LENGTH(iommu_option, IOMMU_OPTION, val64); + TEST_LENGTH(iommu_vfio_ioas, IOMMU_VFIO_IOAS, __reserved); #undef TEST_LENGTH } @@ -260,6 +264,121 @@ TEST_F(iommufd_ioas, ioas_destroy) } } +TEST_F(iommufd_ioas, alloc_hwpt_nested) +{ + const uint32_t min_data_len = + offsetofend(struct iommu_hwpt_selftest, iotlb); + struct iommu_hwpt_selftest data = { + .iotlb = IOMMU_TEST_IOTLB_DEFAULT, + }; + uint32_t nested_hwpt_id[2] = {}; + uint32_t parent_hwpt_id = 0; + uint32_t parent_hwpt_id_not_work = 0; + uint32_t test_hwpt_id = 0; + + if (self->device_id) { + /* Negative tests */ + test_err_hwpt_alloc(ENOENT, self->ioas_id, self->device_id, 0, + &test_hwpt_id); + test_err_hwpt_alloc(EINVAL, self->device_id, self->device_id, 0, + &test_hwpt_id); + + test_cmd_hwpt_alloc(self->device_id, self->ioas_id, + IOMMU_HWPT_ALLOC_NEST_PARENT, + &parent_hwpt_id); + + test_cmd_hwpt_alloc(self->device_id, self->ioas_id, 0, + &parent_hwpt_id_not_work); + + /* Negative nested tests */ + test_err_hwpt_alloc_nested(EINVAL, self->device_id, + parent_hwpt_id, 0, + &nested_hwpt_id[0], + IOMMU_HWPT_DATA_NONE, &data, + sizeof(data)); + test_err_hwpt_alloc_nested(EOPNOTSUPP, self->device_id, + parent_hwpt_id, 0, + &nested_hwpt_id[0], + IOMMU_HWPT_DATA_SELFTEST + 1, &data, + sizeof(data)); + test_err_hwpt_alloc_nested(EINVAL, self->device_id, + parent_hwpt_id, 0, + &nested_hwpt_id[0], + IOMMU_HWPT_DATA_SELFTEST, &data, + min_data_len - 1); + test_err_hwpt_alloc_nested(EFAULT, self->device_id, + parent_hwpt_id, 0, + &nested_hwpt_id[0], + IOMMU_HWPT_DATA_SELFTEST, NULL, + sizeof(data)); + test_err_hwpt_alloc_nested( + EOPNOTSUPP, self->device_id, parent_hwpt_id, + IOMMU_HWPT_ALLOC_NEST_PARENT, &nested_hwpt_id[0], + IOMMU_HWPT_DATA_SELFTEST, &data, sizeof(data)); + test_err_hwpt_alloc_nested(EINVAL, self->device_id, + parent_hwpt_id_not_work, 0, + &nested_hwpt_id[0], + IOMMU_HWPT_DATA_SELFTEST, &data, + sizeof(data)); + + /* Allocate two nested hwpts sharing one common parent hwpt */ + test_cmd_hwpt_alloc_nested(self->device_id, parent_hwpt_id, 0, + &nested_hwpt_id[0], + IOMMU_HWPT_DATA_SELFTEST, &data, + sizeof(data)); + test_cmd_hwpt_alloc_nested(self->device_id, parent_hwpt_id, 0, + &nested_hwpt_id[1], + IOMMU_HWPT_DATA_SELFTEST, &data, + sizeof(data)); + + /* Negative test: a nested hwpt on top of a nested hwpt */ + test_err_hwpt_alloc_nested(EINVAL, self->device_id, + nested_hwpt_id[0], 0, &test_hwpt_id, + IOMMU_HWPT_DATA_SELFTEST, &data, + sizeof(data)); + /* Negative test: parent hwpt now cannot be freed */ + EXPECT_ERRNO(EBUSY, + _test_ioctl_destroy(self->fd, parent_hwpt_id)); + + /* Attach device to nested_hwpt_id[0] that then will be busy */ + test_cmd_mock_domain_replace(self->stdev_id, nested_hwpt_id[0]); + EXPECT_ERRNO(EBUSY, + _test_ioctl_destroy(self->fd, nested_hwpt_id[0])); + + /* Switch from nested_hwpt_id[0] to nested_hwpt_id[1] */ + test_cmd_mock_domain_replace(self->stdev_id, nested_hwpt_id[1]); + EXPECT_ERRNO(EBUSY, + _test_ioctl_destroy(self->fd, nested_hwpt_id[1])); + test_ioctl_destroy(nested_hwpt_id[0]); + + /* Detach from nested_hwpt_id[1] and destroy it */ + test_cmd_mock_domain_replace(self->stdev_id, parent_hwpt_id); + test_ioctl_destroy(nested_hwpt_id[1]); + + /* Detach from the parent hw_pagetable and destroy it */ + test_cmd_mock_domain_replace(self->stdev_id, self->ioas_id); + test_ioctl_destroy(parent_hwpt_id); + test_ioctl_destroy(parent_hwpt_id_not_work); + } else { + test_err_hwpt_alloc(ENOENT, self->device_id, self->ioas_id, 0, + &parent_hwpt_id); + test_err_hwpt_alloc_nested(ENOENT, self->device_id, + parent_hwpt_id, 0, + &nested_hwpt_id[0], + IOMMU_HWPT_DATA_SELFTEST, &data, + sizeof(data)); + test_err_hwpt_alloc_nested(ENOENT, self->device_id, + parent_hwpt_id, 0, + &nested_hwpt_id[1], + IOMMU_HWPT_DATA_SELFTEST, &data, + sizeof(data)); + test_err_mock_domain_replace(ENOENT, self->stdev_id, + nested_hwpt_id[0]); + test_err_mock_domain_replace(ENOENT, self->stdev_id, + nested_hwpt_id[1]); + } +} + TEST_F(iommufd_ioas, hwpt_attach) { /* Create a device attached directly to a hwpt */ @@ -1404,16 +1523,242 @@ TEST_F(iommufd_mock_domain, alloc_hwpt) int i; for (i = 0; i != variant->mock_domains; i++) { + uint32_t hwpt_id[2]; uint32_t stddev_id; - uint32_t hwpt_id; - test_cmd_hwpt_alloc(self->idev_ids[0], self->ioas_id, &hwpt_id); - test_cmd_mock_domain(hwpt_id, &stddev_id, NULL, NULL); + test_err_hwpt_alloc(EOPNOTSUPP, + self->idev_ids[i], self->ioas_id, + ~IOMMU_HWPT_ALLOC_NEST_PARENT, &hwpt_id[0]); + test_cmd_hwpt_alloc(self->idev_ids[i], self->ioas_id, + 0, &hwpt_id[0]); + test_cmd_hwpt_alloc(self->idev_ids[i], self->ioas_id, + IOMMU_HWPT_ALLOC_NEST_PARENT, &hwpt_id[1]); + + /* Do a hw_pagetable rotation test */ + test_cmd_mock_domain_replace(self->stdev_ids[i], hwpt_id[0]); + EXPECT_ERRNO(EBUSY, _test_ioctl_destroy(self->fd, hwpt_id[0])); + test_cmd_mock_domain_replace(self->stdev_ids[i], hwpt_id[1]); + EXPECT_ERRNO(EBUSY, _test_ioctl_destroy(self->fd, hwpt_id[1])); + test_cmd_mock_domain_replace(self->stdev_ids[i], self->ioas_id); + test_ioctl_destroy(hwpt_id[1]); + + test_cmd_mock_domain(hwpt_id[0], &stddev_id, NULL, NULL); test_ioctl_destroy(stddev_id); - test_ioctl_destroy(hwpt_id); + test_ioctl_destroy(hwpt_id[0]); } } +FIXTURE(iommufd_dirty_tracking) +{ + int fd; + uint32_t ioas_id; + uint32_t hwpt_id; + uint32_t stdev_id; + uint32_t idev_id; + unsigned long page_size; + unsigned long bitmap_size; + void *bitmap; + void *buffer; +}; + +FIXTURE_VARIANT(iommufd_dirty_tracking) +{ + unsigned long buffer_size; +}; + +FIXTURE_SETUP(iommufd_dirty_tracking) +{ + void *vrc; + int rc; + + self->fd = open("/dev/iommu", O_RDWR); + ASSERT_NE(-1, self->fd); + + rc = posix_memalign(&self->buffer, HUGEPAGE_SIZE, variant->buffer_size); + if (rc || !self->buffer) { + SKIP(return, "Skipping buffer_size=%lu due to errno=%d", + variant->buffer_size, rc); + } + + assert((uintptr_t)self->buffer % HUGEPAGE_SIZE == 0); + vrc = mmap(self->buffer, variant->buffer_size, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS | MAP_FIXED, -1, 0); + assert(vrc == self->buffer); + + self->page_size = MOCK_PAGE_SIZE; + self->bitmap_size = + variant->buffer_size / self->page_size / BITS_PER_BYTE; + + /* Provision with an extra (MOCK_PAGE_SIZE) for the unaligned case */ + rc = posix_memalign(&self->bitmap, PAGE_SIZE, + self->bitmap_size + MOCK_PAGE_SIZE); + assert(!rc); + assert(self->bitmap); + assert((uintptr_t)self->bitmap % PAGE_SIZE == 0); + + test_ioctl_ioas_alloc(&self->ioas_id); + test_cmd_mock_domain(self->ioas_id, &self->stdev_id, &self->hwpt_id, + &self->idev_id); +} + +FIXTURE_TEARDOWN(iommufd_dirty_tracking) +{ + munmap(self->buffer, variant->buffer_size); + munmap(self->bitmap, self->bitmap_size); + teardown_iommufd(self->fd, _metadata); +} + +FIXTURE_VARIANT_ADD(iommufd_dirty_tracking, domain_dirty128k) +{ + /* one u32 index bitmap */ + .buffer_size = 128UL * 1024UL, +}; + +FIXTURE_VARIANT_ADD(iommufd_dirty_tracking, domain_dirty256k) +{ + /* one u64 index bitmap */ + .buffer_size = 256UL * 1024UL, +}; + +FIXTURE_VARIANT_ADD(iommufd_dirty_tracking, domain_dirty640k) +{ + /* two u64 index and trailing end bitmap */ + .buffer_size = 640UL * 1024UL, +}; + +FIXTURE_VARIANT_ADD(iommufd_dirty_tracking, domain_dirty128M) +{ + /* 4K bitmap (128M IOVA range) */ + .buffer_size = 128UL * 1024UL * 1024UL, +}; + +FIXTURE_VARIANT_ADD(iommufd_dirty_tracking, domain_dirty256M) +{ + /* 8K bitmap (256M IOVA range) */ + .buffer_size = 256UL * 1024UL * 1024UL, +}; + +TEST_F(iommufd_dirty_tracking, enforce_dirty) +{ + uint32_t ioas_id, stddev_id, idev_id; + uint32_t hwpt_id, _hwpt_id; + uint32_t dev_flags; + + /* Regular case */ + dev_flags = MOCK_FLAGS_DEVICE_NO_DIRTY; + test_cmd_hwpt_alloc(self->idev_id, self->ioas_id, + IOMMU_HWPT_ALLOC_DIRTY_TRACKING, &hwpt_id); + test_cmd_mock_domain(hwpt_id, &stddev_id, NULL, NULL); + test_err_mock_domain_flags(EINVAL, hwpt_id, dev_flags, &stddev_id, + NULL); + test_ioctl_destroy(stddev_id); + test_ioctl_destroy(hwpt_id); + + /* IOMMU device does not support dirty tracking */ + test_ioctl_ioas_alloc(&ioas_id); + test_cmd_mock_domain_flags(ioas_id, dev_flags, &stddev_id, &_hwpt_id, + &idev_id); + test_err_hwpt_alloc(EOPNOTSUPP, idev_id, ioas_id, + IOMMU_HWPT_ALLOC_DIRTY_TRACKING, &hwpt_id); + test_ioctl_destroy(stddev_id); +} + +TEST_F(iommufd_dirty_tracking, set_dirty_tracking) +{ + uint32_t stddev_id; + uint32_t hwpt_id; + + test_cmd_hwpt_alloc(self->idev_id, self->ioas_id, + IOMMU_HWPT_ALLOC_DIRTY_TRACKING, &hwpt_id); + test_cmd_mock_domain(hwpt_id, &stddev_id, NULL, NULL); + test_cmd_set_dirty_tracking(hwpt_id, true); + test_cmd_set_dirty_tracking(hwpt_id, false); + + test_ioctl_destroy(stddev_id); + test_ioctl_destroy(hwpt_id); +} + +TEST_F(iommufd_dirty_tracking, device_dirty_capability) +{ + uint32_t caps = 0; + uint32_t stddev_id; + uint32_t hwpt_id; + + test_cmd_hwpt_alloc(self->idev_id, self->ioas_id, 0, &hwpt_id); + test_cmd_mock_domain(hwpt_id, &stddev_id, NULL, NULL); + test_cmd_get_hw_capabilities(self->idev_id, caps, + IOMMU_HW_CAP_DIRTY_TRACKING); + ASSERT_EQ(IOMMU_HW_CAP_DIRTY_TRACKING, + caps & IOMMU_HW_CAP_DIRTY_TRACKING); + + test_ioctl_destroy(stddev_id); + test_ioctl_destroy(hwpt_id); +} + +TEST_F(iommufd_dirty_tracking, get_dirty_bitmap) +{ + uint32_t stddev_id; + uint32_t hwpt_id; + uint32_t ioas_id; + + test_ioctl_ioas_alloc(&ioas_id); + test_ioctl_ioas_map_fixed_id(ioas_id, self->buffer, + variant->buffer_size, MOCK_APERTURE_START); + + test_cmd_hwpt_alloc(self->idev_id, ioas_id, + IOMMU_HWPT_ALLOC_DIRTY_TRACKING, &hwpt_id); + test_cmd_mock_domain(hwpt_id, &stddev_id, NULL, NULL); + + test_cmd_set_dirty_tracking(hwpt_id, true); + + test_mock_dirty_bitmaps(hwpt_id, variant->buffer_size, + MOCK_APERTURE_START, self->page_size, + self->bitmap, self->bitmap_size, 0, _metadata); + + /* PAGE_SIZE unaligned bitmap */ + test_mock_dirty_bitmaps(hwpt_id, variant->buffer_size, + MOCK_APERTURE_START, self->page_size, + self->bitmap + MOCK_PAGE_SIZE, + self->bitmap_size, 0, _metadata); + + test_ioctl_destroy(stddev_id); + test_ioctl_destroy(hwpt_id); +} + +TEST_F(iommufd_dirty_tracking, get_dirty_bitmap_no_clear) +{ + uint32_t stddev_id; + uint32_t hwpt_id; + uint32_t ioas_id; + + test_ioctl_ioas_alloc(&ioas_id); + test_ioctl_ioas_map_fixed_id(ioas_id, self->buffer, + variant->buffer_size, MOCK_APERTURE_START); + + test_cmd_hwpt_alloc(self->idev_id, ioas_id, + IOMMU_HWPT_ALLOC_DIRTY_TRACKING, &hwpt_id); + test_cmd_mock_domain(hwpt_id, &stddev_id, NULL, NULL); + + test_cmd_set_dirty_tracking(hwpt_id, true); + + test_mock_dirty_bitmaps(hwpt_id, variant->buffer_size, + MOCK_APERTURE_START, self->page_size, + self->bitmap, self->bitmap_size, + IOMMU_HWPT_GET_DIRTY_BITMAP_NO_CLEAR, + _metadata); + + /* Unaligned bitmap */ + test_mock_dirty_bitmaps(hwpt_id, variant->buffer_size, + MOCK_APERTURE_START, self->page_size, + self->bitmap + MOCK_PAGE_SIZE, + self->bitmap_size, + IOMMU_HWPT_GET_DIRTY_BITMAP_NO_CLEAR, + _metadata); + + test_ioctl_destroy(stddev_id); + test_ioctl_destroy(hwpt_id); +} + /* VFIO compatibility IOCTLs */ TEST_F(iommufd, simple_ioctls) @@ -1729,7 +2074,7 @@ TEST_F(vfio_compat_mock_domain, map) ASSERT_EQ(0, ioctl(self->fd, VFIO_IOMMU_UNMAP_DMA, &unmap_cmd)); ASSERT_EQ(BUFFER_SIZE, unmap_cmd.size); - /* UNMAP_FLAG_ALL requres 0 iova/size */ + /* UNMAP_FLAG_ALL requires 0 iova/size */ ASSERT_EQ(0, ioctl(self->fd, VFIO_IOMMU_MAP_DMA, &map_cmd)); unmap_cmd.flags = VFIO_DMA_UNMAP_FLAG_ALL; EXPECT_ERRNO(EINVAL, ioctl(self->fd, VFIO_IOMMU_UNMAP_DMA, &unmap_cmd)); diff --git a/tools/testing/selftests/iommu/iommufd_fail_nth.c b/tools/testing/selftests/iommu/iommufd_fail_nth.c index a220ca2a689d..f590417cd67a 100644 --- a/tools/testing/selftests/iommu/iommufd_fail_nth.c +++ b/tools/testing/selftests/iommu/iommufd_fail_nth.c @@ -105,7 +105,7 @@ static bool fail_nth_next(struct __test_metadata *_metadata, /* * This is just an arbitrary limit based on the current kernel - * situation. Changes in the kernel can dramtically change the number of + * situation. Changes in the kernel can dramatically change the number of * required fault injection sites, so if this hits it doesn't * necessarily mean a test failure, just that the limit has to be made * bigger. @@ -612,10 +612,11 @@ TEST_FAIL_NTH(basic_fail_nth, device) &idev_id)) return -1; - if (_test_cmd_get_hw_info(self->fd, idev_id, &info, sizeof(info))) + if (_test_cmd_get_hw_info(self->fd, idev_id, &info, sizeof(info), NULL)) return -1; - if (_test_cmd_hwpt_alloc(self->fd, idev_id, ioas_id, &hwpt_id)) + if (_test_cmd_hwpt_alloc(self->fd, idev_id, ioas_id, 0, &hwpt_id, + IOMMU_HWPT_DATA_NONE, 0, 0)) return -1; if (_test_cmd_mock_domain_replace(self->fd, stdev_id, ioas_id2, NULL)) diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h index e0753d03ecaa..050e9751321c 100644 --- a/tools/testing/selftests/iommu/iommufd_utils.h +++ b/tools/testing/selftests/iommu/iommufd_utils.h @@ -16,6 +16,25 @@ /* Hack to make assertions more readable */ #define _IOMMU_TEST_CMD(x) IOMMU_TEST_CMD +/* Imported from include/asm-generic/bitops/generic-non-atomic.h */ +#define BITS_PER_BYTE 8 +#define BITS_PER_LONG __BITS_PER_LONG +#define BIT_MASK(nr) (1UL << ((nr) % __BITS_PER_LONG)) +#define BIT_WORD(nr) ((nr) / __BITS_PER_LONG) + +static inline void set_bit(unsigned int nr, unsigned long *addr) +{ + unsigned long mask = BIT_MASK(nr); + unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr); + + *p |= mask; +} + +static inline bool test_bit(unsigned int nr, unsigned long *addr) +{ + return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG - 1))); +} + static void *buffer; static unsigned long BUFFER_SIZE; @@ -74,6 +93,38 @@ static int _test_cmd_mock_domain(int fd, unsigned int ioas_id, __u32 *stdev_id, EXPECT_ERRNO(_errno, _test_cmd_mock_domain(self->fd, ioas_id, \ stdev_id, hwpt_id, NULL)) +static int _test_cmd_mock_domain_flags(int fd, unsigned int ioas_id, + __u32 stdev_flags, __u32 *stdev_id, + __u32 *hwpt_id, __u32 *idev_id) +{ + struct iommu_test_cmd cmd = { + .size = sizeof(cmd), + .op = IOMMU_TEST_OP_MOCK_DOMAIN_FLAGS, + .id = ioas_id, + .mock_domain_flags = { .dev_flags = stdev_flags }, + }; + int ret; + + ret = ioctl(fd, IOMMU_TEST_CMD, &cmd); + if (ret) + return ret; + if (stdev_id) + *stdev_id = cmd.mock_domain_flags.out_stdev_id; + assert(cmd.id != 0); + if (hwpt_id) + *hwpt_id = cmd.mock_domain_flags.out_hwpt_id; + if (idev_id) + *idev_id = cmd.mock_domain_flags.out_idev_id; + return 0; +} +#define test_cmd_mock_domain_flags(ioas_id, flags, stdev_id, hwpt_id, idev_id) \ + ASSERT_EQ(0, _test_cmd_mock_domain_flags(self->fd, ioas_id, flags, \ + stdev_id, hwpt_id, idev_id)) +#define test_err_mock_domain_flags(_errno, ioas_id, flags, stdev_id, hwpt_id) \ + EXPECT_ERRNO(_errno, \ + _test_cmd_mock_domain_flags(self->fd, ioas_id, flags, \ + stdev_id, hwpt_id, NULL)) + static int _test_cmd_mock_domain_replace(int fd, __u32 stdev_id, __u32 pt_id, __u32 *hwpt_id) { @@ -103,12 +154,17 @@ static int _test_cmd_mock_domain_replace(int fd, __u32 stdev_id, __u32 pt_id, pt_id, NULL)) static int _test_cmd_hwpt_alloc(int fd, __u32 device_id, __u32 pt_id, - __u32 *hwpt_id) + __u32 flags, __u32 *hwpt_id, __u32 data_type, + void *data, size_t data_len) { struct iommu_hwpt_alloc cmd = { .size = sizeof(cmd), + .flags = flags, .dev_id = device_id, .pt_id = pt_id, + .data_type = data_type, + .data_len = data_len, + .data_uptr = (uint64_t)data, }; int ret; @@ -120,8 +176,24 @@ static int _test_cmd_hwpt_alloc(int fd, __u32 device_id, __u32 pt_id, return 0; } -#define test_cmd_hwpt_alloc(device_id, pt_id, hwpt_id) \ - ASSERT_EQ(0, _test_cmd_hwpt_alloc(self->fd, device_id, pt_id, hwpt_id)) +#define test_cmd_hwpt_alloc(device_id, pt_id, flags, hwpt_id) \ + ASSERT_EQ(0, _test_cmd_hwpt_alloc(self->fd, device_id, pt_id, flags, \ + hwpt_id, IOMMU_HWPT_DATA_NONE, NULL, \ + 0)) +#define test_err_hwpt_alloc(_errno, device_id, pt_id, flags, hwpt_id) \ + EXPECT_ERRNO(_errno, _test_cmd_hwpt_alloc( \ + self->fd, device_id, pt_id, flags, \ + hwpt_id, IOMMU_HWPT_DATA_NONE, NULL, 0)) + +#define test_cmd_hwpt_alloc_nested(device_id, pt_id, flags, hwpt_id, \ + data_type, data, data_len) \ + ASSERT_EQ(0, _test_cmd_hwpt_alloc(self->fd, device_id, pt_id, flags, \ + hwpt_id, data_type, data, data_len)) +#define test_err_hwpt_alloc_nested(_errno, device_id, pt_id, flags, hwpt_id, \ + data_type, data, data_len) \ + EXPECT_ERRNO(_errno, \ + _test_cmd_hwpt_alloc(self->fd, device_id, pt_id, flags, \ + hwpt_id, data_type, data, data_len)) static int _test_cmd_access_replace_ioas(int fd, __u32 access_id, unsigned int ioas_id) @@ -142,6 +214,126 @@ static int _test_cmd_access_replace_ioas(int fd, __u32 access_id, #define test_cmd_access_replace_ioas(access_id, ioas_id) \ ASSERT_EQ(0, _test_cmd_access_replace_ioas(self->fd, access_id, ioas_id)) +static int _test_cmd_set_dirty_tracking(int fd, __u32 hwpt_id, bool enabled) +{ + struct iommu_hwpt_set_dirty_tracking cmd = { + .size = sizeof(cmd), + .flags = enabled ? IOMMU_HWPT_DIRTY_TRACKING_ENABLE : 0, + .hwpt_id = hwpt_id, + }; + int ret; + + ret = ioctl(fd, IOMMU_HWPT_SET_DIRTY_TRACKING, &cmd); + if (ret) + return -errno; + return 0; +} +#define test_cmd_set_dirty_tracking(hwpt_id, enabled) \ + ASSERT_EQ(0, _test_cmd_set_dirty_tracking(self->fd, hwpt_id, enabled)) + +static int _test_cmd_get_dirty_bitmap(int fd, __u32 hwpt_id, size_t length, + __u64 iova, size_t page_size, + __u64 *bitmap, __u32 flags) +{ + struct iommu_hwpt_get_dirty_bitmap cmd = { + .size = sizeof(cmd), + .hwpt_id = hwpt_id, + .flags = flags, + .iova = iova, + .length = length, + .page_size = page_size, + .data = (uintptr_t)bitmap, + }; + int ret; + + ret = ioctl(fd, IOMMU_HWPT_GET_DIRTY_BITMAP, &cmd); + if (ret) + return ret; + return 0; +} + +#define test_cmd_get_dirty_bitmap(fd, hwpt_id, length, iova, page_size, \ + bitmap, flags) \ + ASSERT_EQ(0, _test_cmd_get_dirty_bitmap(fd, hwpt_id, length, iova, \ + page_size, bitmap, flags)) + +static int _test_cmd_mock_domain_set_dirty(int fd, __u32 hwpt_id, size_t length, + __u64 iova, size_t page_size, + __u64 *bitmap, __u64 *dirty) +{ + struct iommu_test_cmd cmd = { + .size = sizeof(cmd), + .op = IOMMU_TEST_OP_DIRTY, + .id = hwpt_id, + .dirty = { + .iova = iova, + .length = length, + .page_size = page_size, + .uptr = (uintptr_t)bitmap, + } + }; + int ret; + + ret = ioctl(fd, _IOMMU_TEST_CMD(IOMMU_TEST_OP_DIRTY), &cmd); + if (ret) + return -ret; + if (dirty) + *dirty = cmd.dirty.out_nr_dirty; + return 0; +} + +#define test_cmd_mock_domain_set_dirty(fd, hwpt_id, length, iova, page_size, \ + bitmap, nr) \ + ASSERT_EQ(0, \ + _test_cmd_mock_domain_set_dirty(fd, hwpt_id, length, iova, \ + page_size, bitmap, nr)) + +static int _test_mock_dirty_bitmaps(int fd, __u32 hwpt_id, size_t length, + __u64 iova, size_t page_size, __u64 *bitmap, + __u64 bitmap_size, __u32 flags, + struct __test_metadata *_metadata) +{ + unsigned long i, count, nbits = bitmap_size * BITS_PER_BYTE; + unsigned long nr = nbits / 2; + __u64 out_dirty = 0; + + /* Mark all even bits as dirty in the mock domain */ + for (count = 0, i = 0; i < nbits; count += !(i % 2), i++) + if (!(i % 2)) + set_bit(i, (unsigned long *)bitmap); + ASSERT_EQ(nr, count); + + test_cmd_mock_domain_set_dirty(fd, hwpt_id, length, iova, page_size, + bitmap, &out_dirty); + ASSERT_EQ(nr, out_dirty); + + /* Expect all even bits as dirty in the user bitmap */ + memset(bitmap, 0, bitmap_size); + test_cmd_get_dirty_bitmap(fd, hwpt_id, length, iova, page_size, bitmap, + flags); + for (count = 0, i = 0; i < nbits; count += !(i % 2), i++) + ASSERT_EQ(!(i % 2), test_bit(i, (unsigned long *)bitmap)); + ASSERT_EQ(count, out_dirty); + + memset(bitmap, 0, bitmap_size); + test_cmd_get_dirty_bitmap(fd, hwpt_id, length, iova, page_size, bitmap, + flags); + + /* It as read already -- expect all zeroes */ + for (i = 0; i < nbits; i++) { + ASSERT_EQ(!(i % 2) && (flags & + IOMMU_HWPT_GET_DIRTY_BITMAP_NO_CLEAR), + test_bit(i, (unsigned long *)bitmap)); + } + + return 0; +} +#define test_mock_dirty_bitmaps(hwpt_id, length, iova, page_size, bitmap, \ + bitmap_size, flags, _metadata) \ + ASSERT_EQ(0, _test_mock_dirty_bitmaps(self->fd, hwpt_id, length, iova, \ + page_size, bitmap, bitmap_size, \ + flags, _metadata)) + static int _test_cmd_create_access(int fd, unsigned int ioas_id, __u32 *access_id, unsigned int flags) { @@ -266,6 +458,17 @@ static int _test_ioctl_ioas_map(int fd, unsigned int ioas_id, void *buffer, IOMMU_IOAS_MAP_READABLE)); \ }) +#define test_ioctl_ioas_map_fixed_id(ioas_id, buffer, length, iova) \ + ({ \ + __u64 __iova = iova; \ + ASSERT_EQ(0, \ + _test_ioctl_ioas_map( \ + self->fd, ioas_id, buffer, length, &__iova, \ + IOMMU_IOAS_MAP_FIXED_IOVA | \ + IOMMU_IOAS_MAP_WRITEABLE | \ + IOMMU_IOAS_MAP_READABLE)); \ + }) + #define test_err_ioctl_ioas_map_fixed(_errno, buffer, length, iova) \ ({ \ __u64 __iova = iova; \ @@ -354,8 +557,8 @@ static void teardown_iommufd(int fd, struct __test_metadata *_metadata) #endif /* @data can be NULL */ -static int _test_cmd_get_hw_info(int fd, __u32 device_id, - void *data, size_t data_len) +static int _test_cmd_get_hw_info(int fd, __u32 device_id, void *data, + size_t data_len, uint32_t *capabilities) { struct iommu_test_hw_info *info = (struct iommu_test_hw_info *)data; struct iommu_hw_info cmd = { @@ -363,6 +566,7 @@ static int _test_cmd_get_hw_info(int fd, __u32 device_id, .dev_id = device_id, .data_len = data_len, .data_uptr = (uint64_t)data, + .out_capabilities = 0, }; int ret; @@ -399,14 +603,19 @@ static int _test_cmd_get_hw_info(int fd, __u32 device_id, assert(!info->flags); } + if (capabilities) + *capabilities = cmd.out_capabilities; + return 0; } -#define test_cmd_get_hw_info(device_id, data, data_len) \ - ASSERT_EQ(0, _test_cmd_get_hw_info(self->fd, device_id, \ - data, data_len)) +#define test_cmd_get_hw_info(device_id, data, data_len) \ + ASSERT_EQ(0, _test_cmd_get_hw_info(self->fd, device_id, data, \ + data_len, NULL)) + +#define test_err_get_hw_info(_errno, device_id, data, data_len) \ + EXPECT_ERRNO(_errno, _test_cmd_get_hw_info(self->fd, device_id, data, \ + data_len, NULL)) -#define test_err_get_hw_info(_errno, device_id, data, data_len) \ - EXPECT_ERRNO(_errno, \ - _test_cmd_get_hw_info(self->fd, device_id, \ - data, data_len)) +#define test_cmd_get_hw_capabilities(device_id, caps, mask) \ + ASSERT_EQ(0, _test_cmd_get_hw_info(self->fd, device_id, NULL, 0, &caps)) |