summary refs log tree commit diff
path: root/drivers/iommu
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/iommu')
-rw-r--r-- drivers/iommu/Kconfig 4
-rw-r--r-- drivers/iommu/amd/Kconfig 1
-rw-r--r-- drivers/iommu/amd/amd_iommu_types.h 12
-rw-r--r-- drivers/iommu/amd/io_pgtable.c 68
-rw-r--r-- drivers/iommu/amd/iommu.c 147
-rw-r--r-- drivers/iommu/intel/Kconfig 1
-rw-r--r-- drivers/iommu/intel/Makefile 2
-rw-r--r-- drivers/iommu/intel/iommu.c 156
-rw-r--r-- drivers/iommu/intel/iommu.h 64
-rw-r--r-- drivers/iommu/intel/nested.c 117
-rw-r--r-- drivers/iommu/intel/pasid.c 221
-rw-r--r-- drivers/iommu/intel/pasid.h 6
-rw-r--r-- drivers/iommu/iommufd/Makefile 1
-rw-r--r-- drivers/iommu/iommufd/device.c 174
-rw-r--r-- drivers/iommu/iommufd/hw_pagetable.c 304
-rw-r--r-- drivers/iommu/iommufd/io_pagetable.c 200
-rw-r--r-- drivers/iommu/iommufd/iommufd_private.h 84
-rw-r--r-- drivers/iommu/iommufd/iommufd_test.h 39
-rw-r--r-- drivers/iommu/iommufd/iova_bitmap.c 426
-rw-r--r-- drivers/iommu/iommufd/main.c 17
-rw-r--r-- drivers/iommu/iommufd/pages.c 2
-rw-r--r-- drivers/iommu/iommufd/selftest.c 326
-rw-r--r-- drivers/iommu/iommufd/vfio_compat.c 6
23 files changed, 2200 insertions, 178 deletions
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 7f04491ca5f0..ee9e2a2edbf5 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -7,6 +7,10 @@ config IOMMU_IOVA
config IOMMU_API
bool
+config IOMMUFD_DRIVER
+ bool
+ default n
+
menuconfig IOMMU_SUPPORT
bool "IOMMU Hardware Support"
depends on MMU
diff --git a/drivers/iommu/amd/Kconfig b/drivers/iommu/amd/Kconfig
index 9b5fc3356bf2..8bd4c3b183ec 100644
--- a/drivers/iommu/amd/Kconfig
+++ b/drivers/iommu/amd/Kconfig
@@ -10,6 +10,7 @@ config AMD_IOMMU
select IOMMU_API
select IOMMU_IOVA
select IOMMU_IO_PGTABLE
+ select IOMMUFD_DRIVER if IOMMUFD
depends on X86_64 && PCI && ACPI && HAVE_CMPXCHG_DOUBLE
help
With this option you can enable support for AMD IOMMU hardware in
diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h
index 7dc30c2b56b3..dec4e5c2b66b 100644
--- a/drivers/iommu/amd/amd_iommu_types.h
+++ b/drivers/iommu/amd/amd_iommu_types.h
@@ -97,7 +97,9 @@
#define FEATURE_GATS_MASK (3ULL)
#define FEATURE_GAM_VAPIC BIT_ULL(21)
#define FEATURE_GIOSUP BIT_ULL(48)
+#define FEATURE_HASUP BIT_ULL(49)
#define FEATURE_EPHSUP BIT_ULL(50)
+#define FEATURE_HDSUP BIT_ULL(52)
#define FEATURE_SNP BIT_ULL(63)
#define FEATURE_PASID_SHIFT 32
@@ -212,6 +214,7 @@
/* macros and definitions for device table entries */
#define DEV_ENTRY_VALID 0x00
#define DEV_ENTRY_TRANSLATION 0x01
+#define DEV_ENTRY_HAD 0x07
#define DEV_ENTRY_PPR 0x34
#define DEV_ENTRY_IR 0x3d
#define DEV_ENTRY_IW 0x3e
@@ -371,9 +374,15 @@
(1ULL << (12 + (9 * (level))))
/*
+ * The IOPTE dirty bit
+ */
+#define IOMMU_PTE_HD_BIT (6)
+
+/*
* Bit value definition for I/O PTE fields
*/
#define IOMMU_PTE_PR BIT_ULL(0)
+#define IOMMU_PTE_HD BIT_ULL(IOMMU_PTE_HD_BIT)
#define IOMMU_PTE_U BIT_ULL(59)
#define IOMMU_PTE_FC BIT_ULL(60)
#define IOMMU_PTE_IR BIT_ULL(61)
@@ -384,6 +393,7 @@
*/
#define DTE_FLAG_V BIT_ULL(0)
#define DTE_FLAG_TV BIT_ULL(1)
+#define DTE_FLAG_HAD (3ULL << 7)
#define DTE_FLAG_GIOV BIT_ULL(54)
#define DTE_FLAG_GV BIT_ULL(55)
#define DTE_GLX_SHIFT (56)
@@ -413,6 +423,7 @@
#define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL)
#define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_PR)
+#define IOMMU_PTE_DIRTY(pte) ((pte) & IOMMU_PTE_HD)
#define IOMMU_PTE_PAGE(pte) (iommu_phys_to_virt((pte) & IOMMU_PAGE_MASK))
#define IOMMU_PTE_MODE(pte) (((pte) >> 9) & 0x07)
@@ -563,6 +574,7 @@ struct protection_domain {
int nid; /* Node ID */
u64 *gcr3_tbl; /* Guest CR3 table */
unsigned long flags; /* flags to find out type of domain */
+ bool dirty_tracking; /* dirty tracking is enabled in the domain */
unsigned dev_cnt; /* devices assigned to this domain */
unsigned dev_iommu[MAX_IOMMUS]; /* per-IOMMU reference count */
};
diff --git a/drivers/iommu/amd/io_pgtable.c b/drivers/iommu/amd/io_pgtable.c
index 2892aa1b4dc1..6c0621f6f572 100644
--- a/drivers/iommu/amd/io_pgtable.c
+++ b/drivers/iommu/amd/io_pgtable.c
@@ -486,6 +486,73 @@ static phys_addr_t iommu_v1_iova_to_phys(struct io_pgtable_ops *ops, unsigned lo
return (__pte & ~offset_mask) | (iova & offset_mask);
}
+static bool pte_test_and_clear_dirty(u64 *ptep, unsigned long size,
+ unsigned long flags)
+{
+ bool test_only = flags & IOMMU_DIRTY_NO_CLEAR;
+ bool dirty = false;
+ int i, count;
+
+ /*
+ * 2.2.3.2 Host Dirty Support
+ * When a non-default page size is used, software must OR the
+ * Dirty bits in all of the replicated host PTEs used to map
+ * the page. The IOMMU does not guarantee the Dirty bits are
+ * set in all of the replicated PTEs. Any portion of the page
+ * may have been written even if the Dirty bit is set in only
+ * one of the replicated PTEs.
+ */
+ count = PAGE_SIZE_PTE_COUNT(size);
+ for (i = 0; i < count && test_only; i++) {
+ if (test_bit(IOMMU_PTE_HD_BIT, (unsigned long *)&ptep[i])) {
+ dirty = true;
+ break;
+ }
+ }
+
+ for (i = 0; i < count && !test_only; i++) {
+ if (test_and_clear_bit(IOMMU_PTE_HD_BIT,
+ (unsigned long *)&ptep[i])) {
+ dirty = true;
+ }
+ }
+
+ return dirty;
+}
+
+static int iommu_v1_read_and_clear_dirty(struct io_pgtable_ops *ops,
+ unsigned long iova, size_t size,
+ unsigned long flags,
+ struct iommu_dirty_bitmap *dirty)
+{
+ struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
+ unsigned long end = iova + size - 1;
+
+ do {
+ unsigned long pgsize = 0;
+ u64 *ptep, pte;
+
+ ptep = fetch_pte(pgtable, iova, &pgsize);
+ if (ptep)
+ pte = READ_ONCE(*ptep);
+ if (!ptep || !IOMMU_PTE_PRESENT(pte)) {
+ pgsize = pgsize ?: PTE_LEVEL_PAGE_SIZE(0);
+ iova += pgsize;
+ continue;
+ }
+
+ /*
+ * Mark the whole IOVA range as dirty even if only one of
+ * the replicated PTEs was marked dirty.
+ */
+ if (pte_test_and_clear_dirty(ptep, pgsize, flags))
+ iommu_dirty_bitmap_record(dirty, iova, pgsize);
+ iova += pgsize;
+ } while (iova < end);
+
+ return 0;
+}
+
/*
* ----------------------------------------------------
*/
@@ -527,6 +594,7 @@ static struct io_pgtable *v1_alloc_pgtable(struct io_pgtable_cfg *cfg, void *coo
pgtable->iop.ops.map_pages = iommu_v1_map_pages;
pgtable->iop.ops.unmap_pages = iommu_v1_unmap_pages;
pgtable->iop.ops.iova_to_phys = iommu_v1_iova_to_phys;
+ pgtable->iop.ops.read_and_clear_dirty = iommu_v1_read_and_clear_dirty;
return &pgtable->iop;
}
diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
index 95bd7c25ba6f..b399c5741378 100644
--- a/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@ -37,6 +37,7 @@
#include <asm/iommu.h>
#include <asm/gart.h>
#include <asm/dma.h>
+#include <uapi/linux/iommufd.h>
#include "amd_iommu.h"
#include "../dma-iommu.h"
@@ -65,6 +66,7 @@ LIST_HEAD(hpet_map);
LIST_HEAD(acpihid_map);
const struct iommu_ops amd_iommu_ops;
+const struct iommu_dirty_ops amd_dirty_ops;
static ATOMIC_NOTIFIER_HEAD(ppr_notifier);
int amd_iommu_max_glx_val = -1;
@@ -1610,6 +1612,9 @@ static void set_dte_entry(struct amd_iommu *iommu, u16 devid,
pte_root |= 1ULL << DEV_ENTRY_PPR;
}
+ if (domain->dirty_tracking)
+ pte_root |= DTE_FLAG_HAD;
+
if (domain->flags & PD_IOMMUV2_MASK) {
u64 gcr3 = iommu_virt_to_phys(domain->gcr3_tbl);
u64 glx = domain->glx;
@@ -2155,28 +2160,79 @@ static inline u64 dma_max_address(void)
return ((1ULL << PM_LEVEL_SHIFT(amd_iommu_gpt_level)) - 1);
}
-static struct iommu_domain *amd_iommu_domain_alloc(unsigned type)
+static bool amd_iommu_hd_support(struct amd_iommu *iommu)
{
+ return iommu && (iommu->features & FEATURE_HDSUP);
+}
+
+static struct iommu_domain *do_iommu_domain_alloc(unsigned int type,
+ struct device *dev, u32 flags)
+{
+ bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
struct protection_domain *domain;
+ struct amd_iommu *iommu = NULL;
+
+ if (dev) {
+ iommu = rlookup_amd_iommu(dev);
+ if (!iommu)
+ return ERR_PTR(-ENODEV);
+ }
/*
* Since DTE[Mode]=0 is prohibited on SNP-enabled system,
* default to use IOMMU_DOMAIN_DMA[_FQ].
*/
if (amd_iommu_snp_en && (type == IOMMU_DOMAIN_IDENTITY))
- return NULL;
+ return ERR_PTR(-EINVAL);
+
+ if (dirty_tracking && !amd_iommu_hd_support(iommu))
+ return ERR_PTR(-EOPNOTSUPP);
domain = protection_domain_alloc(type);
if (!domain)
- return NULL;
+ return ERR_PTR(-ENOMEM);
domain->domain.geometry.aperture_start = 0;
domain->domain.geometry.aperture_end = dma_max_address();
domain->domain.geometry.force_aperture = true;
+ if (iommu) {
+ domain->domain.type = type;
+ domain->domain.pgsize_bitmap = iommu->iommu.ops->pgsize_bitmap;
+ domain->domain.ops = iommu->iommu.ops->default_domain_ops;
+
+ if (dirty_tracking)
+ domain->domain.dirty_ops = &amd_dirty_ops;
+ }
+
return &domain->domain;
}
+static struct iommu_domain *amd_iommu_domain_alloc(unsigned int type)
+{
+ struct iommu_domain *domain;
+
+ domain = do_iommu_domain_alloc(type, NULL, 0);
+ if (IS_ERR(domain))
+ return NULL;
+
+ return domain;
+}
+
+static struct iommu_domain *
+amd_iommu_domain_alloc_user(struct device *dev, u32 flags,
+ struct iommu_domain *parent,
+ const struct iommu_user_data *user_data)
+
+{
+ unsigned int type = IOMMU_DOMAIN_UNMANAGED;
+
+ if ((flags & ~IOMMU_HWPT_ALLOC_DIRTY_TRACKING) || parent || user_data)
+ return ERR_PTR(-EOPNOTSUPP);
+
+ return do_iommu_domain_alloc(type, dev, flags);
+}
+
static void amd_iommu_domain_free(struct iommu_domain *dom)
{
struct protection_domain *domain;
@@ -2214,6 +2270,13 @@ static int amd_iommu_attach_device(struct iommu_domain *dom,
dev_data->defer_attach = false;
+ /*
+ * Restrict to devices with compatible IOMMU hardware support
+ * when enforcement of dirty tracking is enabled.
+ */
+ if (dom->dirty_ops && !amd_iommu_hd_support(iommu))
+ return -EINVAL;
+
if (dev_data->domain)
detach_device(dev);
@@ -2332,6 +2395,11 @@ static bool amd_iommu_capable(struct device *dev, enum iommu_cap cap)
return true;
case IOMMU_CAP_DEFERRED_FLUSH:
return true;
+ case IOMMU_CAP_DIRTY_TRACKING: {
+ struct amd_iommu *iommu = rlookup_amd_iommu(dev);
+
+ return amd_iommu_hd_support(iommu);
+ }
default:
break;
}
@@ -2339,6 +2407,73 @@ static bool amd_iommu_capable(struct device *dev, enum iommu_cap cap)
return false;
}
+static int amd_iommu_set_dirty_tracking(struct iommu_domain *domain,
+ bool enable)
+{
+ struct protection_domain *pdomain = to_pdomain(domain);
+ struct dev_table_entry *dev_table;
+ struct iommu_dev_data *dev_data;
+ bool domain_flush = false;
+ struct amd_iommu *iommu;
+ unsigned long flags;
+ u64 pte_root;
+
+ spin_lock_irqsave(&pdomain->lock, flags);
+ if (!(pdomain->dirty_tracking ^ enable)) {
+ spin_unlock_irqrestore(&pdomain->lock, flags);
+ return 0;
+ }
+
+ list_for_each_entry(dev_data, &pdomain->dev_list, list) {
+ iommu = rlookup_amd_iommu(dev_data->dev);
+ if (!iommu)
+ continue;
+
+ dev_table = get_dev_table(iommu);
+ pte_root = dev_table[dev_data->devid].data[0];
+
+ pte_root = (enable ? pte_root | DTE_FLAG_HAD :
+ pte_root & ~DTE_FLAG_HAD);
+
+ /* Flush device DTE */
+ dev_table[dev_data->devid].data[0] = pte_root;
+ device_flush_dte(dev_data);
+ domain_flush = true;
+ }
+
+ /* Flush IOTLB to mark IOPTE dirty on the next translation(s) */
+ if (domain_flush) {
+ amd_iommu_domain_flush_tlb_pde(pdomain);
+ amd_iommu_domain_flush_complete(pdomain);
+ }
+ pdomain->dirty_tracking = enable;
+ spin_unlock_irqrestore(&pdomain->lock, flags);
+
+ return 0;
+}
+
+static int amd_iommu_read_and_clear_dirty(struct iommu_domain *domain,
+ unsigned long iova, size_t size,
+ unsigned long flags,
+ struct iommu_dirty_bitmap *dirty)
+{
+ struct protection_domain *pdomain = to_pdomain(domain);
+ struct io_pgtable_ops *ops = &pdomain->iop.iop.ops;
+ unsigned long lflags;
+
+ if (!ops || !ops->read_and_clear_dirty)
+ return -EOPNOTSUPP;
+
+ spin_lock_irqsave(&pdomain->lock, lflags);
+ if (!pdomain->dirty_tracking && dirty->bitmap) {
+ spin_unlock_irqrestore(&pdomain->lock, lflags);
+ return -EINVAL;
+ }
+ spin_unlock_irqrestore(&pdomain->lock, lflags);
+
+ return ops->read_and_clear_dirty(ops, iova, size, flags, dirty);
+}
+
static void amd_iommu_get_resv_regions(struct device *dev,
struct list_head *head)
{
@@ -2461,9 +2596,15 @@ static bool amd_iommu_enforce_cache_coherency(struct iommu_domain *domain)
return true;
}
+const struct iommu_dirty_ops amd_dirty_ops = {
+ .set_dirty_tracking = amd_iommu_set_dirty_tracking,
+ .read_and_clear_dirty = amd_iommu_read_and_clear_dirty,
+};
+
const struct iommu_ops amd_iommu_ops = {
.capable = amd_iommu_capable,
.domain_alloc = amd_iommu_domain_alloc,
+ .domain_alloc_user = amd_iommu_domain_alloc_user,
.probe_device = amd_iommu_probe_device,
.release_device = amd_iommu_release_device,
.probe_finalize = amd_iommu_probe_finalize,
diff --git a/drivers/iommu/intel/Kconfig b/drivers/iommu/intel/Kconfig
index 119d2c57a48e..012cd2541a68 100644
--- a/drivers/iommu/intel/Kconfig
+++ b/drivers/iommu/intel/Kconfig
@@ -15,6 +15,7 @@ config INTEL_IOMMU
select DMA_OPS
select IOMMU_API
select IOMMU_IOVA
+ select IOMMUFD_DRIVER if IOMMUFD
select NEED_DMA_MAP_STATE
select DMAR_TABLE
select SWIOTLB
diff --git a/drivers/iommu/intel/Makefile b/drivers/iommu/intel/Makefile
index 7af3b8a4f2a0..5dabf081a779 100644
--- a/drivers/iommu/intel/Makefile
+++ b/drivers/iommu/intel/Makefile
@@ -1,6 +1,6 @@
# SPDX-License-Identifier: GPL-2.0
obj-$(CONFIG_DMAR_TABLE) += dmar.o
-obj-$(CONFIG_INTEL_IOMMU) += iommu.o pasid.o
+obj-$(CONFIG_INTEL_IOMMU) += iommu.o pasid.o nested.o
obj-$(CONFIG_DMAR_TABLE) += trace.o cap_audit.o
obj-$(CONFIG_DMAR_PERF) += perf.o
obj-$(CONFIG_INTEL_IOMMU_DEBUGFS) += debugfs.o
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 3685ba90ec88..d1037280abf7 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -282,7 +282,6 @@ static LIST_HEAD(dmar_satc_units);
#define for_each_rmrr_units(rmrr) \
list_for_each_entry(rmrr, &dmar_rmrr_units, list)
-static void device_block_translation(struct device *dev);
static void intel_iommu_domain_free(struct iommu_domain *domain);
int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
@@ -300,6 +299,7 @@ static int iommu_skip_te_disable;
#define IDENTMAP_AZALIA 4
const struct iommu_ops intel_iommu_ops;
+const struct iommu_dirty_ops intel_dirty_ops;
static bool translation_pre_enabled(struct intel_iommu *iommu)
{
@@ -560,7 +560,7 @@ static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
}
/* Some capabilities may be different across iommus */
-static void domain_update_iommu_cap(struct dmar_domain *domain)
+void domain_update_iommu_cap(struct dmar_domain *domain)
{
domain_update_iommu_coherency(domain);
domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
@@ -1778,8 +1778,7 @@ static struct dmar_domain *alloc_domain(unsigned int type)
return domain;
}
-static int domain_attach_iommu(struct dmar_domain *domain,
- struct intel_iommu *iommu)
+int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
{
struct iommu_domain_info *info, *curr;
unsigned long ndomains;
@@ -1828,8 +1827,7 @@ err_unlock:
return ret;
}
-static void domain_detach_iommu(struct dmar_domain *domain,
- struct intel_iommu *iommu)
+void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
{
struct iommu_domain_info *info;
@@ -2196,6 +2194,11 @@ __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
return -EINVAL;
+ if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) {
+ pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n");
+ return -EINVAL;
+ }
+
attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
attr |= DMA_FL_PTE_PRESENT;
if (domain->use_first_level) {
@@ -3958,7 +3961,7 @@ static void dmar_remove_one_dev_info(struct device *dev)
* all DMA requests without PASID from the device are blocked. If the page
* table has been set, clean up the data structures.
*/
-static void device_block_translation(struct device *dev)
+void device_block_translation(struct device *dev)
{
struct device_domain_info *info = dev_iommu_priv_get(dev);
struct intel_iommu *iommu = info->iommu;
@@ -4058,14 +4061,62 @@ static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
return NULL;
}
+static struct iommu_domain *
+intel_iommu_domain_alloc_user(struct device *dev, u32 flags,
+ struct iommu_domain *parent,
+ const struct iommu_user_data *user_data)
+{
+ struct device_domain_info *info = dev_iommu_priv_get(dev);
+ bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
+ bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT;
+ struct intel_iommu *iommu = info->iommu;
+ struct iommu_domain *domain;
+
+ /* Must be NESTING domain */
+ if (parent) {
+ if (!nested_supported(iommu) || flags)
+ return ERR_PTR(-EOPNOTSUPP);
+ return intel_nested_domain_alloc(parent, user_data);
+ }
+
+ if (flags &
+ (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING)))
+ return ERR_PTR(-EOPNOTSUPP);
+ if (nested_parent && !nested_supported(iommu))
+ return ERR_PTR(-EOPNOTSUPP);
+ if (user_data || (dirty_tracking && !ssads_supported(iommu)))
+ return ERR_PTR(-EOPNOTSUPP);
+
+ /*
+ * The domain_alloc_user op must fully initialize a domain before
+ * returning, so use iommu_domain_alloc() here for simplicity.
+ */
+ domain = iommu_domain_alloc(dev->bus);
+ if (!domain)
+ return ERR_PTR(-ENOMEM);
+
+ if (nested_parent)
+ to_dmar_domain(domain)->nested_parent = true;
+
+ if (dirty_tracking) {
+ if (to_dmar_domain(domain)->use_first_level) {
+ iommu_domain_free(domain);
+ return ERR_PTR(-EOPNOTSUPP);
+ }
+ domain->dirty_ops = &intel_dirty_ops;
+ }
+
+ return domain;
+}
+
static void intel_iommu_domain_free(struct iommu_domain *domain)
{
if (domain != &si_domain->domain && domain != &blocking_domain)
domain_exit(to_dmar_domain(domain));
}
-static int prepare_domain_attach_device(struct iommu_domain *domain,
- struct device *dev)
+int prepare_domain_attach_device(struct iommu_domain *domain,
+ struct device *dev)
{
struct dmar_domain *dmar_domain = to_dmar_domain(domain);
struct intel_iommu *iommu;
@@ -4078,6 +4129,9 @@ static int prepare_domain_attach_device(struct iommu_domain *domain,
if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
return -EINVAL;
+ if (domain->dirty_ops && !ssads_supported(iommu))
+ return -EINVAL;
+
/* check if this iommu agaw is sufficient for max mapped address */
addr_width = agaw_to_width(iommu->agaw);
if (addr_width > cap_mgaw(iommu->cap))
@@ -4332,6 +4386,8 @@ static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
return dmar_platform_optin();
case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
return ecap_sc_support(info->iommu->ecap);
+ case IOMMU_CAP_DIRTY_TRACKING:
+ return ssads_supported(info->iommu);
default:
return false;
}
@@ -4729,6 +4785,9 @@ static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
return -EOPNOTSUPP;
+ if (domain->dirty_ops)
+ return -EINVAL;
+
if (context_copied(iommu, info->bus, info->devfn))
return -EBUSY;
@@ -4780,6 +4839,7 @@ static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
if (!vtd)
return ERR_PTR(-ENOMEM);
+ vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17;
vtd->cap_reg = iommu->cap;
vtd->ecap_reg = iommu->ecap;
*length = sizeof(*vtd);
@@ -4787,10 +4847,88 @@ static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
return vtd;
}
+static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
+ bool enable)
+{
+ struct dmar_domain *dmar_domain = to_dmar_domain(domain);
+ struct device_domain_info *info;
+ int ret;
+
+ spin_lock(&dmar_domain->lock);
+ if (dmar_domain->dirty_tracking == enable)
+ goto out_unlock;
+
+ list_for_each_entry(info, &dmar_domain->devices, link) {
+ ret = intel_pasid_setup_dirty_tracking(info->iommu,
+ info->domain, info->dev,
+ IOMMU_NO_PASID, enable);
+ if (ret)
+ goto err_unwind;
+ }
+
+ dmar_domain->dirty_tracking = enable;
+out_unlock:
+ spin_unlock(&dmar_domain->lock);
+
+ return 0;
+
+err_unwind:
+ list_for_each_entry(info, &dmar_domain->devices, link)
+ intel_pasid_setup_dirty_tracking(info->iommu, dmar_domain,
+ info->dev, IOMMU_NO_PASID,
+ dmar_domain->dirty_tracking);
+ spin_unlock(&dmar_domain->lock);
+ return ret;
+}
+
+static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain,
+ unsigned long iova, size_t size,
+ unsigned long flags,
+ struct iommu_dirty_bitmap *dirty)
+{
+ struct dmar_domain *dmar_domain = to_dmar_domain(domain);
+ unsigned long end = iova + size - 1;
+ unsigned long pgsize;
+
+ /*
+ * IOMMUFD core calls into a dirty tracking disabled domain without an
+ * IOVA bitmap set in order to clear any dirty bits that may have been
+ * left set in the PTEs when dirty tracking was stopped. This ensures
+ * that we never inherit dirtied bits from a previous cycle.
+ */
+ if (!dmar_domain->dirty_tracking && dirty->bitmap)
+ return -EINVAL;
+
+ do {
+ struct dma_pte *pte;
+ int lvl = 0;
+
+ pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl,
+ GFP_ATOMIC);
+ pgsize = level_size(lvl) << VTD_PAGE_SHIFT;
+ if (!pte || !dma_pte_present(pte)) {
+ iova += pgsize;
+ continue;
+ }
+
+ if (dma_sl_pte_test_and_clear_dirty(pte, flags))
+ iommu_dirty_bitmap_record(dirty, iova, pgsize);
+ iova += pgsize;
+ } while (iova < end);
+
+ return 0;
+}
+
+const struct iommu_dirty_ops intel_dirty_ops = {
+ .set_dirty_tracking = intel_iommu_set_dirty_tracking,
+ .read_and_clear_dirty = intel_iommu_read_and_clear_dirty,
+};
+
const struct iommu_ops intel_iommu_ops = {
.capable = intel_iommu_capable,
.hw_info = intel_iommu_hw_info,
.domain_alloc = intel_iommu_domain_alloc,
+ .domain_alloc_user = intel_iommu_domain_alloc_user,
.probe_device = intel_iommu_probe_device,
.probe_finalize = intel_iommu_probe_finalize,
.release_device = intel_iommu_release_device,
diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h
index 7dac94f62b4e..d796d0d9b114 100644
--- a/drivers/iommu/intel/iommu.h
+++ b/drivers/iommu/intel/iommu.h
@@ -25,6 +25,7 @@
#include <asm/cacheflush.h>
#include <asm/iommu.h>
+#include <uapi/linux/iommufd.h>
/*
* VT-d hardware uses 4KiB page size regardless of host page size.
@@ -48,6 +49,9 @@
#define DMA_FL_PTE_DIRTY BIT_ULL(6)
#define DMA_FL_PTE_XD BIT_ULL(63)
+#define DMA_SL_PTE_DIRTY_BIT 9
+#define DMA_SL_PTE_DIRTY BIT_ULL(DMA_SL_PTE_DIRTY_BIT)
+
#define ADDR_WIDTH_5LEVEL (57)
#define ADDR_WIDTH_4LEVEL (48)
@@ -539,6 +543,10 @@ enum {
#define sm_supported(iommu) (intel_iommu_sm && ecap_smts((iommu)->ecap))
#define pasid_supported(iommu) (sm_supported(iommu) && \
ecap_pasid((iommu)->ecap))
+#define ssads_supported(iommu) (sm_supported(iommu) && \
+ ecap_slads((iommu)->ecap))
+#define nested_supported(iommu) (sm_supported(iommu) && \
+ ecap_nest((iommu)->ecap))
struct pasid_entry;
struct pasid_state_entry;
@@ -592,20 +600,45 @@ struct dmar_domain {
* otherwise, goes through the second
* level.
*/
+ u8 dirty_tracking:1; /* Dirty tracking is enabled */
+ u8 nested_parent:1; /* Has other domains nested on it */
spinlock_t lock; /* Protect device tracking lists */
struct list_head devices; /* all devices' list */
struct list_head dev_pasids; /* all attached pasids */
- struct dma_pte *pgd; /* virtual address */
- int gaw; /* max guest address width */
-
- /* adjusted guest address width, 0 is level 2 30-bit */
- int agaw;
int iommu_superpage;/* Level of superpages supported:
0 == 4KiB (no superpages), 1 == 2MiB,
2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
- u64 max_addr; /* maximum mapped address */
+ union {
+ /* DMA remapping domain */
+ struct {
+ /* virtual address */
+ struct dma_pte *pgd;
+ /* max guest address width */
+ int gaw;
+ /*
+ * adjusted guest address width:
+ * 0: level 2 30-bit
+ * 1: level 3 39-bit
+ * 2: level 4 48-bit
+ * 3: level 5 57-bit
+ */
+ int agaw;
+ /* maximum mapped address */
+ u64 max_addr;
+ };
+
+ /* Nested user domain */
+ struct {
+ /* parent page table which the user domain is nested on */
+ struct dmar_domain *s2_domain;
+ /* user page table pointer (in GPA) */
+ unsigned long s1_pgtbl;
+ /* page table attributes */
+ struct iommu_hwpt_vtd_s1 s1_cfg;
+ };
+ };
struct iommu_domain domain; /* generic domain data structure for
iommu core */
@@ -781,6 +814,16 @@ static inline bool dma_pte_present(struct dma_pte *pte)
return (pte->val & 3) != 0;
}
+static inline bool dma_sl_pte_test_and_clear_dirty(struct dma_pte *pte,
+ unsigned long flags)
+{
+ if (flags & IOMMU_DIRTY_NO_CLEAR)
+ return (pte->val & DMA_SL_PTE_DIRTY) != 0;
+
+ return test_and_clear_bit(DMA_SL_PTE_DIRTY_BIT,
+ (unsigned long *)&pte->val);
+}
+
static inline bool dma_pte_superpage(struct dma_pte *pte)
{
return (pte->val & DMA_PTE_LARGE_PAGE);
@@ -836,12 +879,21 @@ int qi_submit_sync(struct intel_iommu *iommu, struct qi_desc *desc,
*/
#define QI_OPT_WAIT_DRAIN BIT(0)
+int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu);
+void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu);
+void device_block_translation(struct device *dev);
+int prepare_domain_attach_device(struct iommu_domain *domain,
+ struct device *dev);
+void domain_update_iommu_cap(struct dmar_domain *domain);
+
int dmar_ir_support(void);
void *alloc_pgtable_page(int node, gfp_t gfp);
void free_pgtable_page(void *vaddr);
void iommu_flush_write_buffer(struct intel_iommu *iommu);
struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn);
+struct iommu_domain *intel_nested_domain_alloc(struct iommu_domain *parent,
+ const struct iommu_user_data *user_data);
#ifdef CONFIG_INTEL_IOMMU_SVM
void intel_svm_check(struct intel_iommu *iommu);
diff --git a/drivers/iommu/intel/nested.c b/drivers/iommu/intel/nested.c
new file mode 100644
index 000000000000..b5a5563ab32c
--- /dev/null
+++ b/drivers/iommu/intel/nested.c
@@ -0,0 +1,117 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * nested.c - nested mode translation support
+ *
+ * Copyright (C) 2023 Intel Corporation
+ *
+ * Author: Lu Baolu <baolu.lu@linux.intel.com>
+ * Jacob Pan <jacob.jun.pan@linux.intel.com>
+ * Yi Liu <yi.l.liu@intel.com>
+ */
+
+#define pr_fmt(fmt) "DMAR: " fmt
+
+#include <linux/iommu.h>
+#include <linux/pci.h>
+#include <linux/pci-ats.h>
+
+#include "iommu.h"
+#include "pasid.h"
+
+static int intel_nested_attach_dev(struct iommu_domain *domain,
+ struct device *dev)
+{
+ struct device_domain_info *info = dev_iommu_priv_get(dev);
+ struct dmar_domain *dmar_domain = to_dmar_domain(domain);
+ struct intel_iommu *iommu = info->iommu;
+ unsigned long flags;
+ int ret = 0;
+
+ if (info->domain)
+ device_block_translation(dev);
+
+ if (iommu->agaw < dmar_domain->s2_domain->agaw) {
+ dev_err_ratelimited(dev, "Adjusted guest address width not compatible\n");
+ return -ENODEV;
+ }
+
+ /*
+ * A stage-1 domain cannot work alone; it is nested on an s2_domain.
+ * The s2_domain will be used in nested translation, so we need to
+ * ensure that the s2_domain is compatible with this IOMMU.
+ */
+ ret = prepare_domain_attach_device(&dmar_domain->s2_domain->domain, dev);
+ if (ret) {
+ dev_err_ratelimited(dev, "s2 domain is not compatible\n");
+ return ret;
+ }
+
+ ret = domain_attach_iommu(dmar_domain, iommu);
+ if (ret) {
+ dev_err_ratelimited(dev, "Failed to attach domain to iommu\n");
+ return ret;
+ }
+
+ ret = intel_pasid_setup_nested(iommu, dev,
+ IOMMU_NO_PASID, dmar_domain);
+ if (ret) {
+ domain_detach_iommu(dmar_domain, iommu);
+ dev_err_ratelimited(dev, "Failed to setup pasid entry\n");
+ return ret;
+ }
+
+ info->domain = dmar_domain;
+ spin_lock_irqsave(&dmar_domain->lock, flags);
+ list_add(&info->link, &dmar_domain->devices);
+ spin_unlock_irqrestore(&dmar_domain->lock, flags);
+
+ return 0;
+}
+
+static void intel_nested_domain_free(struct iommu_domain *domain)
+{
+ kfree(to_dmar_domain(domain));
+}
+
+static const struct iommu_domain_ops intel_nested_domain_ops = {
+ .attach_dev = intel_nested_attach_dev,
+ .free = intel_nested_domain_free,
+};
+
+struct iommu_domain *intel_nested_domain_alloc(struct iommu_domain *parent,
+ const struct iommu_user_data *user_data)
+{
+ struct dmar_domain *s2_domain = to_dmar_domain(parent);
+ struct iommu_hwpt_vtd_s1 vtd;
+ struct dmar_domain *domain;
+ int ret;
+
+ /* Must be nested domain */
+ if (user_data->type != IOMMU_HWPT_DATA_VTD_S1)
+ return ERR_PTR(-EOPNOTSUPP);
+ if (parent->ops != intel_iommu_ops.default_domain_ops ||
+ !s2_domain->nested_parent)
+ return ERR_PTR(-EINVAL);
+
+ ret = iommu_copy_struct_from_user(&vtd, user_data,
+ IOMMU_HWPT_DATA_VTD_S1, __reserved);
+ if (ret)
+ return ERR_PTR(ret);
+
+ domain = kzalloc(sizeof(*domain), GFP_KERNEL_ACCOUNT);
+ if (!domain)
+ return ERR_PTR(-ENOMEM);
+
+ domain->use_first_level = true;
+ domain->s2_domain = s2_domain;
+ domain->s1_pgtbl = vtd.pgtbl_addr;
+ domain->s1_cfg = vtd;
+ domain->domain.ops = &intel_nested_domain_ops;
+ domain->domain.type = IOMMU_DOMAIN_NESTED;
+ INIT_LIST_HEAD(&domain->devices);
+ INIT_LIST_HEAD(&domain->dev_pasids);
+ spin_lock_init(&domain->lock);
+ xa_init(&domain->iommu_array);
+
+ return &domain->domain;
+}
diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c
index 8f92b92f3d2a..74e8e4c17e81 100644
--- a/drivers/iommu/intel/pasid.c
+++ b/drivers/iommu/intel/pasid.c
@@ -277,6 +277,11 @@ static inline void pasid_set_bits(u64 *ptr, u64 mask, u64 bits)
WRITE_ONCE(*ptr, (old & ~mask) | bits);
}
+static inline u64 pasid_get_bits(u64 *ptr)
+{
+ return READ_ONCE(*ptr);
+}
+
/*
* Setup the DID(Domain Identifier) field (Bit 64~79) of scalable mode
* PASID entry.
@@ -336,6 +341,45 @@ static inline void pasid_set_fault_enable(struct pasid_entry *pe)
}
/*
+ * Enable second level A/D bits by setting the SLADE (Second Level
+ * Access Dirty Enable) field (Bit 9) of a scalable mode PASID
+ * entry.
+ */
+static inline void pasid_set_ssade(struct pasid_entry *pe)
+{
+ pasid_set_bits(&pe->val[0], 1 << 9, 1 << 9);
+}
+
+/*
+ * Disable second level A/D bits by clearing the SLADE (Second Level
+ * Access Dirty Enable) field (Bit 9) of a scalable mode PASID
+ * entry.
+ */
+static inline void pasid_clear_ssade(struct pasid_entry *pe)
+{
+ pasid_set_bits(&pe->val[0], 1 << 9, 0);
+}
+
+/*
+ * Check whether second level A/D bits are enabled, i.e. whether the
+ * SLADE (Second Level Access Dirty Enable) field (Bit 9) of a scalable
+ * mode PASID entry is set.
+ */
+static inline bool pasid_get_ssade(struct pasid_entry *pe)
+{
+ return pasid_get_bits(&pe->val[0]) & (1 << 9);
+}
+
+/*
+ * Setup the SRE(Supervisor Request Enable) field (Bit 128) of a
+ * scalable mode PASID entry.
+ */
+static inline void pasid_set_sre(struct pasid_entry *pe)
+{
+ pasid_set_bits(&pe->val[2], 1 << 0, 1);
+}
+
+/*
* Setup the WPE(Write Protect Enable) field (Bit 132) of a
* scalable mode PASID entry.
*/
@@ -402,6 +446,15 @@ pasid_set_flpm(struct pasid_entry *pe, u64 value)
pasid_set_bits(&pe->val[2], GENMASK_ULL(3, 2), value << 2);
}
+/*
+ * Setup the Extended Access Flag Enable (EAFE) field (Bit 135)
+ * of a scalable mode PASID entry.
+ */
+static inline void pasid_set_eafe(struct pasid_entry *pe)
+{
+ pasid_set_bits(&pe->val[2], 1 << 7, 1 << 7);
+}
+
static void
pasid_cache_invalidation_with_pasid(struct intel_iommu *iommu,
u16 did, u32 pasid)
@@ -627,6 +680,8 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu,
pasid_set_translation_type(pte, PASID_ENTRY_PGTT_SL_ONLY);
pasid_set_fault_enable(pte);
pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap));
+ if (domain->dirty_tracking)
+ pasid_set_ssade(pte);
pasid_set_present(pte);
spin_unlock(&iommu->lock);
@@ -637,6 +692,78 @@ int intel_pasid_setup_second_level(struct intel_iommu *iommu,
}
/*
+ * Set up dirty tracking on a second-level-only or nested translation type.
+ *
+ * Looks up the PASID entry of @dev/@pasid under iommu->lock and sets or
+ * clears its SSADE bit according to @enabled. After dropping the lock the
+ * entry is flushed with clflush_cache_range() when the IOMMU is not
+ * coherent, then the PASID cache, the IOTLB (domain-selective) and, unless
+ * the IOMMU reports caching mode, the device TLB are invalidated.
+ *
+ * Return: 0 on success or if the bit already matches @enabled, -ENODEV if
+ * no PASID entry is found, -EOPNOTSUPP if the entry's translation type is
+ * neither PASID_ENTRY_PGTT_SL_ONLY nor PASID_ENTRY_PGTT_NESTED.
+ */
+int intel_pasid_setup_dirty_tracking(struct intel_iommu *iommu,
+ struct dmar_domain *domain,
+ struct device *dev, u32 pasid,
+ bool enabled)
+{
+ struct pasid_entry *pte;
+ u16 did, pgtt;
+
+ spin_lock(&iommu->lock);
+
+ pte = intel_pasid_get_entry(dev, pasid);
+ if (!pte) {
+ spin_unlock(&iommu->lock);
+ dev_err_ratelimited(
+ dev, "Failed to get pasid entry of PASID %d\n", pasid);
+ return -ENODEV;
+ }
+
+ did = domain_id_iommu(domain, iommu);
+ pgtt = pasid_pte_get_pgtt(pte);
+ if (pgtt != PASID_ENTRY_PGTT_SL_ONLY &&
+ pgtt != PASID_ENTRY_PGTT_NESTED) {
+ spin_unlock(&iommu->lock);
+ dev_err_ratelimited(
+ dev,
+ "Dirty tracking not supported on translation type %d\n",
+ pgtt);
+ return -EOPNOTSUPP;
+ }
+
+ if (pasid_get_ssade(pte) == enabled) {
+ spin_unlock(&iommu->lock);
+ return 0;
+ }
+
+ if (enabled)
+ pasid_set_ssade(pte);
+ else
+ pasid_clear_ssade(pte);
+ spin_unlock(&iommu->lock);
+
+ if (!ecap_coherent(iommu->ecap))
+ clflush_cache_range(pte, sizeof(*pte));
+
+ /*
+ * From VT-d spec table 25 "Guidance to Software for Invalidations":
+ *
+ * - PASID-selective-within-Domain PASID-cache invalidation
+ * If (PGTT=SS or Nested)
+ * - Domain-selective IOTLB invalidation
+ * Else
+ * - PASID-selective PASID-based IOTLB invalidation
+ * - If (pasid is RID_PASID)
+ * - Global Device-TLB invalidation to affected functions
+ * Else
+ * - PASID-based Device-TLB invalidation (with S=1 and
+ * Addr[63:12]=0x7FFFFFFF_FFFFF) to affected functions
+ */
+ pasid_cache_invalidation_with_pasid(iommu, did, pasid);
+
+ iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
+
+ /* Device IOTLB doesn't need to be flushed in caching mode. */
+ if (!cap_caching_mode(iommu->cap))
+ devtlb_invalidation_with_pasid(iommu, dev, pasid);
+
+ return 0;
+}
+
+/*
* Set up the scalable mode pasid entry for passthrough translation type.
*/
int intel_pasid_setup_pass_through(struct intel_iommu *iommu,
@@ -713,3 +840,97 @@ void intel_pasid_setup_page_snoop_control(struct intel_iommu *iommu,
if (!cap_caching_mode(iommu->cap))
devtlb_invalidation_with_pasid(iommu, dev, pasid);
}
+
+/**
+ * intel_pasid_setup_nested() - Set up PASID entry for nested translation.
+ * @iommu: IOMMU which the device belongs to
+ * @dev: Device to be set up for translation
+ * @pasid: PASID to be programmed in the device PASID table
+ * @domain: User stage-1 domain nested on a stage-2 domain
+ *
+ * This is used for nested translation. The input domain should be
+ * nested type and nested on a parent with 'is_nested_parent' flag
+ * set.
+ *
+ * The stage-1 configuration is validated against the hardware capabilities
+ * before iommu->lock is taken; the PASID entry is then cleared, populated
+ * and marked present under the lock, and the caches are flushed afterwards.
+ *
+ * Return: 0 on success, -EINVAL if the stage-1 address width is invalid or
+ * the hardware lacks the 5-level paging, supervisor request or extended
+ * access flag support that the stage-1 config asks for, -ENODEV if no PASID
+ * entry is found, -EBUSY if the PASID entry is already present.
+ */
+int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev,
+ u32 pasid, struct dmar_domain *domain)
+{
+ struct iommu_hwpt_vtd_s1 *s1_cfg = &domain->s1_cfg;
+ pgd_t *s1_gpgd = (pgd_t *)(uintptr_t)domain->s1_pgtbl;
+ struct dmar_domain *s2_domain = domain->s2_domain;
+ u16 did = domain_id_iommu(domain, iommu);
+ struct dma_pte *pgd = s2_domain->pgd;
+ struct pasid_entry *pte;
+
+ /* Address width should match the address width supported by hardware */
+ switch (s1_cfg->addr_width) {
+ case ADDR_WIDTH_4LEVEL:
+ break;
+ case ADDR_WIDTH_5LEVEL:
+ if (!cap_fl5lp_support(iommu->cap)) {
+ dev_err_ratelimited(dev,
+ "5-level paging not supported\n");
+ return -EINVAL;
+ }
+ break;
+ default:
+ dev_err_ratelimited(dev, "Invalid stage-1 address width %d\n",
+ s1_cfg->addr_width);
+ return -EINVAL;
+ }
+
+ if ((s1_cfg->flags & IOMMU_VTD_S1_SRE) && !ecap_srs(iommu->ecap)) {
+ pr_err_ratelimited("No supervisor request support on %s\n",
+ iommu->name);
+ return -EINVAL;
+ }
+
+ if ((s1_cfg->flags & IOMMU_VTD_S1_EAFE) && !ecap_eafs(iommu->ecap)) {
+ pr_err_ratelimited("No extended access flag support on %s\n",
+ iommu->name);
+ return -EINVAL;
+ }
+
+ spin_lock(&iommu->lock);
+ pte = intel_pasid_get_entry(dev, pasid);
+ if (!pte) {
+ spin_unlock(&iommu->lock);
+ return -ENODEV;
+ }
+ if (pasid_pte_is_present(pte)) {
+ spin_unlock(&iommu->lock);
+ return -EBUSY;
+ }
+
+ pasid_clear_entry(pte);
+
+ if (s1_cfg->addr_width == ADDR_WIDTH_5LEVEL)
+ pasid_set_flpm(pte, 1);
+
+ pasid_set_flptr(pte, (uintptr_t)s1_gpgd);
+
+ if (s1_cfg->flags & IOMMU_VTD_S1_SRE) {
+ pasid_set_sre(pte);
+ if (s1_cfg->flags & IOMMU_VTD_S1_WPE)
+ pasid_set_wpe(pte);
+ }
+
+ if (s1_cfg->flags & IOMMU_VTD_S1_EAFE)
+ pasid_set_eafe(pte);
+
+ if (s2_domain->force_snooping)
+ pasid_set_pgsnp(pte);
+
+ pasid_set_slptr(pte, virt_to_phys(pgd));
+ pasid_set_fault_enable(pte);
+ pasid_set_domain_id(pte, did);
+ pasid_set_address_width(pte, s2_domain->agaw);
+ pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap));
+ pasid_set_translation_type(pte, PASID_ENTRY_PGTT_NESTED);
+ pasid_set_present(pte);
+ spin_unlock(&iommu->lock);
+
+ pasid_flush_caches(iommu, pte, pasid, did);
+
+ return 0;
+}
diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h
index 4e9e68c3c388..dd37611175cc 100644
--- a/drivers/iommu/intel/pasid.h
+++ b/drivers/iommu/intel/pasid.h
@@ -106,9 +106,15 @@ int intel_pasid_setup_first_level(struct intel_iommu *iommu,
int intel_pasid_setup_second_level(struct intel_iommu *iommu,
struct dmar_domain *domain,
struct device *dev, u32 pasid);
+int intel_pasid_setup_dirty_tracking(struct intel_iommu *iommu,
+ struct dmar_domain *domain,
+ struct device *dev, u32 pasid,
+ bool enabled);
int intel_pasid_setup_pass_through(struct intel_iommu *iommu,
struct dmar_domain *domain,
struct device *dev, u32 pasid);
+int intel_pasid_setup_nested(struct intel_iommu *iommu, struct device *dev,
+ u32 pasid, struct dmar_domain *domain);
void intel_pasid_tear_down_entry(struct intel_iommu *iommu,
struct device *dev, u32 pasid,
bool fault_ignore);
diff --git a/drivers/iommu/iommufd/Makefile b/drivers/iommu/iommufd/Makefile
index 8aeba81800c5..34b446146961 100644
--- a/drivers/iommu/iommufd/Makefile
+++ b/drivers/iommu/iommufd/Makefile
@@ -11,3 +11,4 @@ iommufd-y := \
iommufd-$(CONFIG_IOMMUFD_TEST) += selftest.o
obj-$(CONFIG_IOMMUFD) += iommufd.o
+obj-$(CONFIG_IOMMUFD_DRIVER) += iova_bitmap.o
diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c
index ce78c3671539..59d3a07300d9 100644
--- a/drivers/iommu/iommufd/device.c
+++ b/drivers/iommu/iommufd/device.c
@@ -293,7 +293,7 @@ u32 iommufd_device_to_id(struct iommufd_device *idev)
EXPORT_SYMBOL_NS_GPL(iommufd_device_to_id, IOMMUFD);
static int iommufd_group_setup_msi(struct iommufd_group *igroup,
- struct iommufd_hw_pagetable *hwpt)
+ struct iommufd_hwpt_paging *hwpt_paging)
{
phys_addr_t sw_msi_start = igroup->sw_msi_start;
int rc;
@@ -311,8 +311,9 @@ static int iommufd_group_setup_msi(struct iommufd_group *igroup,
* matches what the IRQ layer actually expects in a newly created
* domain.
*/
- if (sw_msi_start != PHYS_ADDR_MAX && !hwpt->msi_cookie) {
- rc = iommu_get_msi_cookie(hwpt->domain, sw_msi_start);
+ if (sw_msi_start != PHYS_ADDR_MAX && !hwpt_paging->msi_cookie) {
+ rc = iommu_get_msi_cookie(hwpt_paging->common.domain,
+ sw_msi_start);
if (rc)
return rc;
@@ -320,7 +321,31 @@ static int iommufd_group_setup_msi(struct iommufd_group *igroup,
* iommu_get_msi_cookie() can only be called once per domain,
* it returns -EBUSY on later calls.
*/
- hwpt->msi_cookie = true;
+ hwpt_paging->msi_cookie = true;
+ }
+ return 0;
+}
+
+/*
+ * Paging-specific part of attaching @idev to @hwpt_paging: enforce the
+ * device's reserved regions on the hwpt's IOAS and, when the group's
+ * device_list is still empty, set up MSI for the group. If the MSI setup
+ * fails the reserved IOVA added for this device is removed again.
+ *
+ * Caller must hold idev->igroup->lock. Returns 0 or a negative errno from
+ * the failing step.
+ */
+static int iommufd_hwpt_paging_attach(struct iommufd_hwpt_paging *hwpt_paging,
+ struct iommufd_device *idev)
+{
+ int rc;
+
+ lockdep_assert_held(&idev->igroup->lock);
+
+ rc = iopt_table_enforce_dev_resv_regions(&hwpt_paging->ioas->iopt,
+ idev->dev,
+ &idev->igroup->sw_msi_start);
+ if (rc)
+ return rc;
+
+ if (list_empty(&idev->igroup->device_list)) {
+ rc = iommufd_group_setup_msi(idev->igroup, hwpt_paging);
+ if (rc) {
+ iopt_remove_reserved_iova(&hwpt_paging->ioas->iopt,
+ idev->dev);
+ return rc;
+ }
}
return 0;
}
@@ -337,18 +362,12 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt,
goto err_unlock;
}
- /* Try to upgrade the domain we have */
- if (idev->enforce_cache_coherency) {
- rc = iommufd_hw_pagetable_enforce_cc(hwpt);
+ if (hwpt_is_paging(hwpt)) {
+ rc = iommufd_hwpt_paging_attach(to_hwpt_paging(hwpt), idev);
if (rc)
goto err_unlock;
}
- rc = iopt_table_enforce_dev_resv_regions(&hwpt->ioas->iopt, idev->dev,
- &idev->igroup->sw_msi_start);
- if (rc)
- goto err_unlock;
-
/*
* Only attach to the group once for the first device that is in the
* group. All the other devices will follow this attachment. The user
@@ -357,10 +376,6 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt,
* attachment.
*/
if (list_empty(&idev->igroup->device_list)) {
- rc = iommufd_group_setup_msi(idev->igroup, hwpt);
- if (rc)
- goto err_unresv;
-
rc = iommu_attach_group(hwpt->domain, idev->igroup->group);
if (rc)
goto err_unresv;
@@ -371,7 +386,9 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt,
mutex_unlock(&idev->igroup->lock);
return 0;
err_unresv:
- iopt_remove_reserved_iova(&hwpt->ioas->iopt, idev->dev);
+ if (hwpt_is_paging(hwpt))
+ iopt_remove_reserved_iova(&to_hwpt_paging(hwpt)->ioas->iopt,
+ idev->dev);
err_unlock:
mutex_unlock(&idev->igroup->lock);
return rc;
@@ -388,7 +405,9 @@ iommufd_hw_pagetable_detach(struct iommufd_device *idev)
iommu_detach_group(hwpt->domain, idev->igroup->group);
idev->igroup->hwpt = NULL;
}
- iopt_remove_reserved_iova(&hwpt->ioas->iopt, idev->dev);
+ if (hwpt_is_paging(hwpt))
+ iopt_remove_reserved_iova(&to_hwpt_paging(hwpt)->ioas->iopt,
+ idev->dev);
mutex_unlock(&idev->igroup->lock);
/* Caller must destroy hwpt */
@@ -407,14 +426,55 @@ iommufd_device_do_attach(struct iommufd_device *idev,
return NULL;
}
+/*
+ * Remove, from the IOAS of @hwpt_paging, the reserved IOVA owned by every
+ * device currently on @igroup's device_list. Caller must hold igroup->lock.
+ */
+static void
+iommufd_group_remove_reserved_iova(struct iommufd_group *igroup,
+ struct iommufd_hwpt_paging *hwpt_paging)
+{
+ struct iommufd_device *cur;
+
+ lockdep_assert_held(&igroup->lock);
+
+ list_for_each_entry(cur, &igroup->device_list, group_item)
+ iopt_remove_reserved_iova(&hwpt_paging->ioas->iopt, cur->dev);
+}
+
+/*
+ * Paging-specific preparation for replacing the group's current hwpt with
+ * @hwpt_paging. When the current hwpt is not a paging one, or uses a
+ * different IOAS, the reserved regions of every device in the group are
+ * enforced on the new IOAS; MSI is then set up for the new hwpt.
+ *
+ * Caller must hold igroup->lock. Returns 0 or the negative errno of the
+ * failing step; on failure the group's reserved IOVA is removed from the
+ * new hwpt's IOAS.
+ *
+ * NOTE(review): the err_unresv path also runs when the enforce loop was
+ * skipped because the new and old hwpt share one IOAS - confirm that
+ * removing the reservations from the shared IOAS is intended in that case.
+ */
+static int
+iommufd_group_do_replace_paging(struct iommufd_group *igroup,
+ struct iommufd_hwpt_paging *hwpt_paging)
+{
+ struct iommufd_hw_pagetable *old_hwpt = igroup->hwpt;
+ struct iommufd_device *cur;
+ int rc;
+
+ lockdep_assert_held(&igroup->lock);
+
+ if (!hwpt_is_paging(old_hwpt) ||
+ hwpt_paging->ioas != to_hwpt_paging(old_hwpt)->ioas) {
+ list_for_each_entry(cur, &igroup->device_list, group_item) {
+ rc = iopt_table_enforce_dev_resv_regions(
+ &hwpt_paging->ioas->iopt, cur->dev, NULL);
+ if (rc)
+ goto err_unresv;
+ }
+ }
+
+ rc = iommufd_group_setup_msi(igroup, hwpt_paging);
+ if (rc)
+ goto err_unresv;
+ return 0;
+
+err_unresv:
+ iommufd_group_remove_reserved_iova(igroup, hwpt_paging);
+ return rc;
+}
+
static struct iommufd_hw_pagetable *
iommufd_device_do_replace(struct iommufd_device *idev,
struct iommufd_hw_pagetable *hwpt)
{
struct iommufd_group *igroup = idev->igroup;
struct iommufd_hw_pagetable *old_hwpt;
- unsigned int num_devices = 0;
- struct iommufd_device *cur;
+ unsigned int num_devices;
int rc;
mutex_lock(&idev->igroup->lock);
@@ -429,42 +489,27 @@ iommufd_device_do_replace(struct iommufd_device *idev,
return NULL;
}
- /* Try to upgrade the domain we have */
- list_for_each_entry(cur, &igroup->device_list, group_item) {
- num_devices++;
- if (cur->enforce_cache_coherency) {
- rc = iommufd_hw_pagetable_enforce_cc(hwpt);
- if (rc)
- goto err_unlock;
- }
- }
-
old_hwpt = igroup->hwpt;
- if (hwpt->ioas != old_hwpt->ioas) {
- list_for_each_entry(cur, &igroup->device_list, group_item) {
- rc = iopt_table_enforce_dev_resv_regions(
- &hwpt->ioas->iopt, cur->dev, NULL);
- if (rc)
- goto err_unresv;
- }
+ if (hwpt_is_paging(hwpt)) {
+ rc = iommufd_group_do_replace_paging(igroup,
+ to_hwpt_paging(hwpt));
+ if (rc)
+ goto err_unlock;
}
- rc = iommufd_group_setup_msi(idev->igroup, hwpt);
- if (rc)
- goto err_unresv;
-
rc = iommu_group_replace_domain(igroup->group, hwpt->domain);
if (rc)
goto err_unresv;
- if (hwpt->ioas != old_hwpt->ioas) {
- list_for_each_entry(cur, &igroup->device_list, group_item)
- iopt_remove_reserved_iova(&old_hwpt->ioas->iopt,
- cur->dev);
- }
+ if (hwpt_is_paging(old_hwpt) &&
+ (!hwpt_is_paging(hwpt) ||
+ to_hwpt_paging(hwpt)->ioas != to_hwpt_paging(old_hwpt)->ioas))
+ iommufd_group_remove_reserved_iova(igroup,
+ to_hwpt_paging(old_hwpt));
igroup->hwpt = hwpt;
+ num_devices = list_count_nodes(&igroup->device_list);
/*
* Move the refcounts held by the device_list to the new hwpt. Retain a
* refcount for this thread as the caller will free it.
@@ -478,8 +523,9 @@ iommufd_device_do_replace(struct iommufd_device *idev,
/* Caller must destroy old_hwpt */
return old_hwpt;
err_unresv:
- list_for_each_entry(cur, &igroup->device_list, group_item)
- iopt_remove_reserved_iova(&hwpt->ioas->iopt, cur->dev);
+ /* Rewind the reservations taken for the new hwpt, never old_hwpt's */
+ if (hwpt_is_paging(hwpt))
+ iommufd_group_remove_reserved_iova(igroup, to_hwpt_paging(hwpt));
err_unlock:
mutex_unlock(&idev->igroup->lock);
return ERR_PTR(rc);
@@ -507,6 +553,7 @@ iommufd_device_auto_get_domain(struct iommufd_device *idev,
*/
bool immediate_attach = do_attach == iommufd_device_do_attach;
struct iommufd_hw_pagetable *destroy_hwpt;
+ struct iommufd_hwpt_paging *hwpt_paging;
struct iommufd_hw_pagetable *hwpt;
/*
@@ -515,10 +562,11 @@ iommufd_device_auto_get_domain(struct iommufd_device *idev,
* other.
*/
mutex_lock(&ioas->mutex);
- list_for_each_entry(hwpt, &ioas->hwpt_list, hwpt_item) {
- if (!hwpt->auto_domain)
+ list_for_each_entry(hwpt_paging, &ioas->hwpt_list, hwpt_item) {
+ if (!hwpt_paging->auto_domain)
continue;
+ hwpt = &hwpt_paging->common;
if (!iommufd_lock_obj(&hwpt->obj))
continue;
destroy_hwpt = (*do_attach)(idev, hwpt);
@@ -539,12 +587,13 @@ iommufd_device_auto_get_domain(struct iommufd_device *idev,
goto out_unlock;
}
- hwpt = iommufd_hw_pagetable_alloc(idev->ictx, ioas, idev,
- immediate_attach);
- if (IS_ERR(hwpt)) {
- destroy_hwpt = ERR_CAST(hwpt);
+ hwpt_paging = iommufd_hwpt_paging_alloc(idev->ictx, ioas, idev, 0,
+ immediate_attach, NULL);
+ if (IS_ERR(hwpt_paging)) {
+ destroy_hwpt = ERR_CAST(hwpt_paging);
goto out_unlock;
}
+ hwpt = &hwpt_paging->common;
if (!immediate_attach) {
destroy_hwpt = (*do_attach)(idev, hwpt);
@@ -554,7 +603,7 @@ iommufd_device_auto_get_domain(struct iommufd_device *idev,
destroy_hwpt = NULL;
}
- hwpt->auto_domain = true;
+ hwpt_paging->auto_domain = true;
*pt_id = hwpt->obj.id;
iommufd_object_finalize(idev->ictx, &hwpt->obj);
@@ -579,7 +628,8 @@ static int iommufd_device_change_pt(struct iommufd_device *idev, u32 *pt_id,
return PTR_ERR(pt_obj);
switch (pt_obj->type) {
- case IOMMUFD_OBJ_HW_PAGETABLE: {
+ case IOMMUFD_OBJ_HWPT_NESTED:
+ case IOMMUFD_OBJ_HWPT_PAGING: {
struct iommufd_hw_pagetable *hwpt =
container_of(pt_obj, struct iommufd_hw_pagetable, obj);
@@ -617,8 +667,8 @@ out_put_pt_obj:
/**
* iommufd_device_attach - Connect a device to an iommu_domain
* @idev: device to attach
- * @pt_id: Input a IOMMUFD_OBJ_IOAS, or IOMMUFD_OBJ_HW_PAGETABLE
- * Output the IOMMUFD_OBJ_HW_PAGETABLE ID
+ * @pt_id: Input a IOMMUFD_OBJ_IOAS, or IOMMUFD_OBJ_HWPT_PAGING
+ * Output the IOMMUFD_OBJ_HWPT_PAGING ID
*
* This connects the device to an iommu_domain, either automatically or manually
* selected. Once this completes the device could do DMA.
@@ -646,8 +696,8 @@ EXPORT_SYMBOL_NS_GPL(iommufd_device_attach, IOMMUFD);
/**
* iommufd_device_replace - Change the device's iommu_domain
* @idev: device to change
- * @pt_id: Input a IOMMUFD_OBJ_IOAS, or IOMMUFD_OBJ_HW_PAGETABLE
- * Output the IOMMUFD_OBJ_HW_PAGETABLE ID
+ * @pt_id: Input a IOMMUFD_OBJ_IOAS, or IOMMUFD_OBJ_HWPT_PAGING
+ * Output the IOMMUFD_OBJ_HWPT_PAGING ID
*
* This is the same as::
*
@@ -1185,6 +1235,10 @@ int iommufd_get_hw_info(struct iommufd_ucmd *ucmd)
*/
cmd->data_len = data_len;
+ cmd->out_capabilities = 0;
+ if (device_iommu_capable(idev->dev, IOMMU_CAP_DIRTY_TRACKING))
+ cmd->out_capabilities |= IOMMU_HW_CAP_DIRTY_TRACKING;
+
rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
out_free:
kfree(data);
diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c
index cf2c1504e20d..2abbeafdbd22 100644
--- a/drivers/iommu/iommufd/hw_pagetable.c
+++ b/drivers/iommu/iommufd/hw_pagetable.c
@@ -5,62 +5,87 @@
#include <linux/iommu.h>
#include <uapi/linux/iommufd.h>
+#include "../iommu-priv.h"
#include "iommufd_private.h"
-void iommufd_hw_pagetable_destroy(struct iommufd_object *obj)
+void iommufd_hwpt_paging_destroy(struct iommufd_object *obj)
{
- struct iommufd_hw_pagetable *hwpt =
- container_of(obj, struct iommufd_hw_pagetable, obj);
+ struct iommufd_hwpt_paging *hwpt_paging =
+ container_of(obj, struct iommufd_hwpt_paging, common.obj);
- if (!list_empty(&hwpt->hwpt_item)) {
- mutex_lock(&hwpt->ioas->mutex);
- list_del(&hwpt->hwpt_item);
- mutex_unlock(&hwpt->ioas->mutex);
+ if (!list_empty(&hwpt_paging->hwpt_item)) {
+ mutex_lock(&hwpt_paging->ioas->mutex);
+ list_del(&hwpt_paging->hwpt_item);
+ mutex_unlock(&hwpt_paging->ioas->mutex);
- iopt_table_remove_domain(&hwpt->ioas->iopt, hwpt->domain);
+ iopt_table_remove_domain(&hwpt_paging->ioas->iopt,
+ hwpt_paging->common.domain);
}
- if (hwpt->domain)
- iommu_domain_free(hwpt->domain);
+ if (hwpt_paging->common.domain)
+ iommu_domain_free(hwpt_paging->common.domain);
- refcount_dec(&hwpt->ioas->obj.users);
+ refcount_dec(&hwpt_paging->ioas->obj.users);
}
-void iommufd_hw_pagetable_abort(struct iommufd_object *obj)
+void iommufd_hwpt_paging_abort(struct iommufd_object *obj)
{
- struct iommufd_hw_pagetable *hwpt =
- container_of(obj, struct iommufd_hw_pagetable, obj);
+ struct iommufd_hwpt_paging *hwpt_paging =
+ container_of(obj, struct iommufd_hwpt_paging, common.obj);
/* The ioas->mutex must be held until finalize is called. */
- lockdep_assert_held(&hwpt->ioas->mutex);
+ lockdep_assert_held(&hwpt_paging->ioas->mutex);
- if (!list_empty(&hwpt->hwpt_item)) {
- list_del_init(&hwpt->hwpt_item);
- iopt_table_remove_domain(&hwpt->ioas->iopt, hwpt->domain);
+ if (!list_empty(&hwpt_paging->hwpt_item)) {
+ list_del_init(&hwpt_paging->hwpt_item);
+ iopt_table_remove_domain(&hwpt_paging->ioas->iopt,
+ hwpt_paging->common.domain);
}
- iommufd_hw_pagetable_destroy(obj);
+ iommufd_hwpt_paging_destroy(obj);
}
-int iommufd_hw_pagetable_enforce_cc(struct iommufd_hw_pagetable *hwpt)
+void iommufd_hwpt_nested_destroy(struct iommufd_object *obj)
{
- if (hwpt->enforce_cache_coherency)
+ struct iommufd_hwpt_nested *hwpt_nested =
+ container_of(obj, struct iommufd_hwpt_nested, common.obj);
+
+ if (hwpt_nested->common.domain)
+ iommu_domain_free(hwpt_nested->common.domain);
+
+ refcount_dec(&hwpt_nested->parent->common.obj.users);
+}
+
+/* Abort handler for a nested hwpt: simply runs the nested destroy handler. */
+void iommufd_hwpt_nested_abort(struct iommufd_object *obj)
+{
+ iommufd_hwpt_nested_destroy(obj);
+}
+
+static int
+iommufd_hwpt_paging_enforce_cc(struct iommufd_hwpt_paging *hwpt_paging)
+{
+ struct iommu_domain *paging_domain = hwpt_paging->common.domain;
+
+ if (hwpt_paging->enforce_cache_coherency)
return 0;
- if (hwpt->domain->ops->enforce_cache_coherency)
- hwpt->enforce_cache_coherency =
- hwpt->domain->ops->enforce_cache_coherency(
- hwpt->domain);
- if (!hwpt->enforce_cache_coherency)
+ if (paging_domain->ops->enforce_cache_coherency)
+ hwpt_paging->enforce_cache_coherency =
+ paging_domain->ops->enforce_cache_coherency(
+ paging_domain);
+ if (!hwpt_paging->enforce_cache_coherency)
return -EINVAL;
return 0;
}
/**
- * iommufd_hw_pagetable_alloc() - Get an iommu_domain for a device
+ * iommufd_hwpt_paging_alloc() - Get a PAGING iommu_domain for a device
* @ictx: iommufd context
* @ioas: IOAS to associate the domain with
* @idev: Device to get an iommu_domain for
+ * @flags: Flags from userspace
* @immediate_attach: True if idev should be attached to the hwpt
+ * @user_data: The user provided driver specific data describing the domain to
+ * create
*
* Allocate a new iommu_domain and return it as a hw_pagetable. The HWPT
* will be linked to the given ioas and upon return the underlying iommu_domain
@@ -70,28 +95,52 @@ int iommufd_hw_pagetable_enforce_cc(struct iommufd_hw_pagetable *hwpt)
* iommufd_object_abort_and_destroy() or iommufd_object_finalize() is called on
* the returned hwpt.
*/
-struct iommufd_hw_pagetable *
-iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas,
- struct iommufd_device *idev, bool immediate_attach)
+struct iommufd_hwpt_paging *
+iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas,
+ struct iommufd_device *idev, u32 flags,
+ bool immediate_attach,
+ const struct iommu_user_data *user_data)
{
+ const u32 valid_flags = IOMMU_HWPT_ALLOC_NEST_PARENT |
+ IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
+ const struct iommu_ops *ops = dev_iommu_ops(idev->dev);
+ struct iommufd_hwpt_paging *hwpt_paging;
struct iommufd_hw_pagetable *hwpt;
int rc;
lockdep_assert_held(&ioas->mutex);
- hwpt = iommufd_object_alloc(ictx, hwpt, IOMMUFD_OBJ_HW_PAGETABLE);
- if (IS_ERR(hwpt))
- return hwpt;
+ if ((flags || user_data) && !ops->domain_alloc_user)
+ return ERR_PTR(-EOPNOTSUPP);
+ if (flags & ~valid_flags)
+ return ERR_PTR(-EOPNOTSUPP);
+
+ hwpt_paging = __iommufd_object_alloc(
+ ictx, hwpt_paging, IOMMUFD_OBJ_HWPT_PAGING, common.obj);
+ if (IS_ERR(hwpt_paging))
+ return ERR_CAST(hwpt_paging);
+ hwpt = &hwpt_paging->common;
- INIT_LIST_HEAD(&hwpt->hwpt_item);
+ INIT_LIST_HEAD(&hwpt_paging->hwpt_item);
/* Pairs with iommufd_hw_pagetable_destroy() */
refcount_inc(&ioas->obj.users);
- hwpt->ioas = ioas;
+ hwpt_paging->ioas = ioas;
+ hwpt_paging->nest_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT;
- hwpt->domain = iommu_domain_alloc(idev->dev->bus);
- if (!hwpt->domain) {
- rc = -ENOMEM;
- goto out_abort;
+ if (ops->domain_alloc_user) {
+ hwpt->domain = ops->domain_alloc_user(idev->dev, flags, NULL,
+ user_data);
+ if (IS_ERR(hwpt->domain)) {
+ rc = PTR_ERR(hwpt->domain);
+ hwpt->domain = NULL;
+ goto out_abort;
+ }
+ } else {
+ hwpt->domain = iommu_domain_alloc(idev->dev->bus);
+ if (!hwpt->domain) {
+ rc = -ENOMEM;
+ goto out_abort;
+ }
}
/*
@@ -100,9 +149,16 @@ iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas,
* doing any maps. It is an iommu driver bug to report
* IOMMU_CAP_ENFORCE_CACHE_COHERENCY but fail enforce_cache_coherency on
* a new domain.
+ *
+ * The cache coherency mode must be configured here and unchanged later.
+ * Note that a HWPT (non-CC) created for a device (non-CC) can be later
+ * reused by another device (either non-CC or CC). However, A HWPT (CC)
+ * created for a device (CC) cannot be reused by another device (non-CC)
+ * but only devices (CC). Instead user space in this case would need to
+ * allocate a separate HWPT (non-CC).
*/
if (idev->enforce_cache_coherency) {
- rc = iommufd_hw_pagetable_enforce_cc(hwpt);
+ rc = iommufd_hwpt_paging_enforce_cc(hwpt_paging);
if (WARN_ON(rc))
goto out_abort;
}
@@ -119,11 +175,11 @@ iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas,
goto out_abort;
}
- rc = iopt_table_add_domain(&hwpt->ioas->iopt, hwpt->domain);
+ rc = iopt_table_add_domain(&ioas->iopt, hwpt->domain);
if (rc)
goto out_detach;
- list_add_tail(&hwpt->hwpt_item, &hwpt->ioas->hwpt_list);
- return hwpt;
+ list_add_tail(&hwpt_paging->hwpt_item, &ioas->hwpt_list);
+ return hwpt_paging;
out_detach:
if (immediate_attach)
@@ -133,32 +189,120 @@ out_abort:
return ERR_PTR(rc);
}
+/**
+ * iommufd_hwpt_nested_alloc() - Get a NESTED iommu_domain for a device
+ * @ictx: iommufd context
+ * @parent: Parent PAGING-type hwpt to associate the domain with
+ * @idev: Device to get an iommu_domain for
+ * @flags: Flags from userspace
+ * @user_data: user_data pointer. Must be valid
+ *
+ * Allocate a new iommu_domain (must be IOMMU_DOMAIN_NESTED) and return it as
+ * a NESTED hw_pagetable. The given parent PAGING-type hwpt must be capable of
+ * being a parent.
+ *
+ * A reference on the parent's object is taken before the driver's
+ * domain_alloc_user() op is called with the parent's domain. On failure
+ * after the object was allocated, it is aborted and destroyed here.
+ *
+ * Return: the new nested hwpt, or an ERR_PTR: -EOPNOTSUPP if @flags is
+ * non-zero, @user_data->len is zero or the driver has no domain_alloc_user
+ * op; -EINVAL if @parent is an auto domain or was not allocated as a nest
+ * parent, or if the driver returned a domain that is not
+ * IOMMU_DOMAIN_NESTED; otherwise the error from the object allocation or
+ * from the driver.
+ */
+static struct iommufd_hwpt_nested *
+iommufd_hwpt_nested_alloc(struct iommufd_ctx *ictx,
+ struct iommufd_hwpt_paging *parent,
+ struct iommufd_device *idev, u32 flags,
+ const struct iommu_user_data *user_data)
+{
+ const struct iommu_ops *ops = dev_iommu_ops(idev->dev);
+ struct iommufd_hwpt_nested *hwpt_nested;
+ struct iommufd_hw_pagetable *hwpt;
+ int rc;
+
+ if (flags || !user_data->len || !ops->domain_alloc_user)
+ return ERR_PTR(-EOPNOTSUPP);
+ if (parent->auto_domain || !parent->nest_parent)
+ return ERR_PTR(-EINVAL);
+
+ hwpt_nested = __iommufd_object_alloc(
+ ictx, hwpt_nested, IOMMUFD_OBJ_HWPT_NESTED, common.obj);
+ if (IS_ERR(hwpt_nested))
+ return ERR_CAST(hwpt_nested);
+ hwpt = &hwpt_nested->common;
+
+ refcount_inc(&parent->common.obj.users);
+ hwpt_nested->parent = parent;
+
+ hwpt->domain = ops->domain_alloc_user(idev->dev, flags,
+ parent->common.domain, user_data);
+ if (IS_ERR(hwpt->domain)) {
+ rc = PTR_ERR(hwpt->domain);
+ hwpt->domain = NULL;
+ goto out_abort;
+ }
+
+ if (WARN_ON_ONCE(hwpt->domain->type != IOMMU_DOMAIN_NESTED)) {
+ rc = -EINVAL;
+ goto out_abort;
+ }
+ return hwpt_nested;
+
+out_abort:
+ iommufd_object_abort_and_destroy(ictx, &hwpt->obj);
+ return ERR_PTR(rc);
+}
+
int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd)
{
struct iommu_hwpt_alloc *cmd = ucmd->cmd;
+ const struct iommu_user_data user_data = {
+ .type = cmd->data_type,
+ .uptr = u64_to_user_ptr(cmd->data_uptr),
+ .len = cmd->data_len,
+ };
struct iommufd_hw_pagetable *hwpt;
+ struct iommufd_ioas *ioas = NULL;
+ struct iommufd_object *pt_obj;
struct iommufd_device *idev;
- struct iommufd_ioas *ioas;
int rc;
- if (cmd->flags || cmd->__reserved)
+ if (cmd->__reserved)
return -EOPNOTSUPP;
+ if (cmd->data_type == IOMMU_HWPT_DATA_NONE && cmd->data_len)
+ return -EINVAL;
idev = iommufd_get_device(ucmd, cmd->dev_id);
if (IS_ERR(idev))
return PTR_ERR(idev);
- ioas = iommufd_get_ioas(ucmd->ictx, cmd->pt_id);
- if (IS_ERR(ioas)) {
- rc = PTR_ERR(ioas);
+ pt_obj = iommufd_get_object(ucmd->ictx, cmd->pt_id, IOMMUFD_OBJ_ANY);
+ if (IS_ERR(pt_obj)) {
+ rc = -EINVAL;
goto out_put_idev;
}
- mutex_lock(&ioas->mutex);
- hwpt = iommufd_hw_pagetable_alloc(ucmd->ictx, ioas, idev, false);
- if (IS_ERR(hwpt)) {
- rc = PTR_ERR(hwpt);
- goto out_unlock;
+ if (pt_obj->type == IOMMUFD_OBJ_IOAS) {
+ struct iommufd_hwpt_paging *hwpt_paging;
+
+ ioas = container_of(pt_obj, struct iommufd_ioas, obj);
+ mutex_lock(&ioas->mutex);
+ hwpt_paging = iommufd_hwpt_paging_alloc(
+ ucmd->ictx, ioas, idev, cmd->flags, false,
+ user_data.len ? &user_data : NULL);
+ if (IS_ERR(hwpt_paging)) {
+ rc = PTR_ERR(hwpt_paging);
+ goto out_unlock;
+ }
+ hwpt = &hwpt_paging->common;
+ } else if (pt_obj->type == IOMMUFD_OBJ_HWPT_PAGING) {
+ struct iommufd_hwpt_nested *hwpt_nested;
+
+ hwpt_nested = iommufd_hwpt_nested_alloc(
+ ucmd->ictx,
+ container_of(pt_obj, struct iommufd_hwpt_paging,
+ common.obj),
+ idev, cmd->flags, &user_data);
+ if (IS_ERR(hwpt_nested)) {
+ rc = PTR_ERR(hwpt_nested);
+ goto out_unlock;
+ }
+ hwpt = &hwpt_nested->common;
+ } else {
+ rc = -EINVAL;
+ goto out_put_pt;
}
cmd->out_hwpt_id = hwpt->obj.id;
@@ -171,9 +315,59 @@ int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd)
out_hwpt:
iommufd_object_abort_and_destroy(ucmd->ictx, &hwpt->obj);
out_unlock:
- mutex_unlock(&ioas->mutex);
- iommufd_put_object(&ioas->obj);
+ if (ioas)
+ mutex_unlock(&ioas->mutex);
+out_put_pt:
+ iommufd_put_object(pt_obj);
out_put_idev:
iommufd_put_object(&idev->obj);
return rc;
}
+
+/*
+ * Command handler that enables or disables dirty tracking on the paging
+ * hwpt identified by cmd->hwpt_id, depending on whether
+ * IOMMU_HWPT_DIRTY_TRACKING_ENABLE is set in cmd->flags.
+ *
+ * Returns -EOPNOTSUPP if any other flag is set, the lookup error if the
+ * hwpt cannot be obtained, otherwise the result of
+ * iopt_set_dirty_tracking() on the hwpt's IOAS and domain. The hwpt object
+ * reference taken by the lookup is dropped before returning.
+ */
+int iommufd_hwpt_set_dirty_tracking(struct iommufd_ucmd *ucmd)
+{
+ struct iommu_hwpt_set_dirty_tracking *cmd = ucmd->cmd;
+ struct iommufd_hwpt_paging *hwpt_paging;
+ struct iommufd_ioas *ioas;
+ int rc = -EOPNOTSUPP;
+ bool enable;
+
+ if (cmd->flags & ~IOMMU_HWPT_DIRTY_TRACKING_ENABLE)
+ return rc;
+
+ hwpt_paging = iommufd_get_hwpt_paging(ucmd, cmd->hwpt_id);
+ if (IS_ERR(hwpt_paging))
+ return PTR_ERR(hwpt_paging);
+
+ ioas = hwpt_paging->ioas;
+ enable = cmd->flags & IOMMU_HWPT_DIRTY_TRACKING_ENABLE;
+
+ rc = iopt_set_dirty_tracking(&ioas->iopt, hwpt_paging->common.domain,
+ enable);
+
+ iommufd_put_object(&hwpt_paging->common.obj);
+ return rc;
+}
+
+/*
+ * Command handler that reads (and, depending on the flags, clears) the
+ * dirty data of the paging hwpt identified by cmd->hwpt_id.
+ *
+ * Returns -EOPNOTSUPP if a flag other than
+ * IOMMU_HWPT_GET_DIRTY_BITMAP_NO_CLEAR is set or cmd->__reserved is
+ * non-zero, the lookup error if the hwpt cannot be obtained, otherwise the
+ * result of iopt_read_and_clear_dirty_data(). The hwpt object reference
+ * taken by the lookup is dropped before returning.
+ */
+int iommufd_hwpt_get_dirty_bitmap(struct iommufd_ucmd *ucmd)
+{
+ struct iommu_hwpt_get_dirty_bitmap *cmd = ucmd->cmd;
+ struct iommufd_hwpt_paging *hwpt_paging;
+ struct iommufd_ioas *ioas;
+ int rc = -EOPNOTSUPP;
+
+ if ((cmd->flags & ~(IOMMU_HWPT_GET_DIRTY_BITMAP_NO_CLEAR)) ||
+ cmd->__reserved)
+ return -EOPNOTSUPP;
+
+ hwpt_paging = iommufd_get_hwpt_paging(ucmd, cmd->hwpt_id);
+ if (IS_ERR(hwpt_paging))
+ return PTR_ERR(hwpt_paging);
+
+ ioas = hwpt_paging->ioas;
+ rc = iopt_read_and_clear_dirty_data(
+ &ioas->iopt, hwpt_paging->common.domain, cmd->flags, cmd);
+
+ iommufd_put_object(&hwpt_paging->common.obj);
+ return rc;
+}
diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c
index 3a598182b761..504ac1b01b2d 100644
--- a/drivers/iommu/iommufd/io_pagetable.c
+++ b/drivers/iommu/iommufd/io_pagetable.c
@@ -15,6 +15,7 @@
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/errno.h>
+#include <uapi/linux/iommufd.h>
#include "io_pagetable.h"
#include "double_span.h"
@@ -221,6 +222,18 @@ static int iopt_insert_area(struct io_pagetable *iopt, struct iopt_area *area,
return 0;
}
+/*
+ * Allocate a zeroed iopt_area (GFP_KERNEL_ACCOUNT) and mark both of its
+ * rb nodes, node.rb and pages_node.rb, as cleared with RB_CLEAR_NODE().
+ * Returns NULL if the allocation fails.
+ */
+static struct iopt_area *iopt_area_alloc(void)
+{
+ struct iopt_area *area;
+
+ area = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT);
+ if (!area)
+ return NULL;
+ RB_CLEAR_NODE(&area->node.rb);
+ RB_CLEAR_NODE(&area->pages_node.rb);
+ return area;
+}
+
static int iopt_alloc_area_pages(struct io_pagetable *iopt,
struct list_head *pages_list,
unsigned long length, unsigned long *dst_iova,
@@ -231,7 +244,7 @@ static int iopt_alloc_area_pages(struct io_pagetable *iopt,
int rc = 0;
list_for_each_entry(elm, pages_list, next) {
- elm->area = kzalloc(sizeof(*elm->area), GFP_KERNEL_ACCOUNT);
+ elm->area = iopt_area_alloc();
if (!elm->area)
return -ENOMEM;
}
@@ -412,6 +425,177 @@ int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
return 0;
}
+/*
+ * Context passed as the opaque argument of iova_bitmap_for_each() to
+ * __iommu_read_and_clear_dirty().
+ */
+struct iova_bitmap_fn_arg {
+ /* flags forwarded to ops->read_and_clear_dirty() */
+ unsigned long flags;
+ /* page table whose areas are walked for each bitmap window */
+ struct io_pagetable *iopt;
+ /* domain whose dirty_ops are invoked */
+ struct iommu_domain *domain;
+ /* dirty bitmap state handed to the driver callback */
+ struct iommu_dirty_bitmap *dirty;
+};
+
+/*
+ * iova_bitmap_for_each() callback. Walks the areas of arg->iopt that cover
+ * [iova, iova + length - 1] and calls the domain's read_and_clear_dirty op
+ * on each piece, clamped to the end of the requested window.
+ *
+ * Returns the first driver error, -EINVAL when the contiguous walk did not
+ * complete, or 0.
+ */
+static int __iommu_read_and_clear_dirty(struct iova_bitmap *bitmap,
+					unsigned long iova, size_t length,
+					void *opaque)
+{
+	struct iova_bitmap_fn_arg *arg = opaque;
+	struct iommu_domain *domain = arg->domain;
+	const struct iommu_dirty_ops *ops = domain->dirty_ops;
+	unsigned long end_iova = iova + length - 1;
+	struct iopt_area_contig_iter iter;
+	struct iopt_area *area;
+
+	iopt_for_each_contig_area(&iter, area, arg->iopt, iova, end_iova) {
+		unsigned long piece_end = min(end_iova,
+					      iopt_area_last_iova(area));
+		int rc;
+
+		rc = ops->read_and_clear_dirty(domain, iter.cur_iova,
+					       piece_end - iter.cur_iova + 1,
+					       arg->flags, arg->dirty);
+		if (rc)
+			return rc;
+	}
+
+	return iopt_area_contig_done(&iter) ? 0 : -EINVAL;
+}
+
+/*
+ * Report the dirty state of @domain for the range described by @bitmap
+ * into the user supplied bitmap, one pinned bitmap window at a time.
+ *
+ * @domain: domain whose dirty_ops are used
+ * @iopt:   page table whose areas must cover the requested range
+ * @flags:  forwarded to the driver; the IOTLB is only synced when
+ *          IOMMU_DIRTY_NO_CLEAR is not set
+ * @bitmap: user request (iova, length, page_size, data)
+ *
+ * Returns 0 on success, -EOPNOTSUPP when the domain has no
+ * read_and_clear_dirty op, the error from iova_bitmap_alloc(), or the
+ * first error hit while iterating.
+ */
+static int
+iommu_read_and_clear_dirty(struct iommu_domain *domain,
+			   struct io_pagetable *iopt, unsigned long flags,
+			   struct iommu_hwpt_get_dirty_bitmap *bitmap)
+{
+	const struct iommu_dirty_ops *ops = domain->dirty_ops;
+	struct iommu_iotlb_gather gather;
+	struct iommu_dirty_bitmap dirty;
+	struct iova_bitmap_fn_arg arg;
+	struct iova_bitmap *iter;
+	int ret;
+
+	if (!ops || !ops->read_and_clear_dirty)
+		return -EOPNOTSUPP;
+
+	iter = iova_bitmap_alloc(bitmap->iova, bitmap->length,
+				 bitmap->page_size,
+				 u64_to_user_ptr(bitmap->data));
+	/* Keep the real cause: pinning the user bitmap can fail with -EFAULT */
+	if (IS_ERR(iter))
+		return PTR_ERR(iter);
+
+	iommu_dirty_bitmap_init(&dirty, iter, &gather);
+
+	arg.flags = flags;
+	arg.iopt = iopt;
+	arg.domain = domain;
+	arg.dirty = &dirty;
+	/* Propagate callback/pinning failures instead of reporting success */
+	ret = iova_bitmap_for_each(iter, &arg, __iommu_read_and_clear_dirty);
+
+	/*
+	 * Sync even when the walk failed part way: windows processed before
+	 * the failure may already have had their dirty bits cleared.
+	 */
+	if (!(flags & IOMMU_DIRTY_NO_CLEAR))
+		iommu_iotlb_sync(domain, &gather);
+
+	iova_bitmap_free(iter);
+
+	return ret;
+}
+
+/*
+ * Validate the IOVA range and bitmap page size of a dirty bitmap request.
+ *
+ * @iopt:   page table providing the IOVA alignment to check against
+ * @bitmap: user request to validate
+ *
+ * Returns 0 when the range is usable, -EOVERFLOW when iova + length wraps
+ * or does not fit in an unsigned long, and -EINVAL for an empty range, a
+ * zero or non power-of-two page_size, or a range that is not aligned to
+ * both the IOMMU alignment and page_size.
+ */
+int iommufd_check_iova_range(struct io_pagetable *iopt,
+			     struct iommu_hwpt_get_dirty_bitmap *bitmap)
+{
+	size_t iommu_pgsize = iopt->iova_alignment;
+	u64 last_iova;
+
+	/* length - 1 below would wrap to U64_MAX for an empty range */
+	if (!bitmap->length)
+		return -EINVAL;
+
+	if (check_add_overflow(bitmap->iova, bitmap->length - 1, &last_iova))
+		return -EOVERFLOW;
+
+	if (bitmap->iova > ULONG_MAX || last_iova > ULONG_MAX)
+		return -EOVERFLOW;
+
+	if ((bitmap->iova & (iommu_pgsize - 1)) ||
+	    ((last_iova + 1) & (iommu_pgsize - 1)))
+		return -EINVAL;
+
+	/*
+	 * The mask test below and the shift derived from page_size are only
+	 * meaningful for a non-zero power of two.
+	 */
+	if (!bitmap->page_size ||
+	    (bitmap->page_size & (bitmap->page_size - 1)))
+		return -EINVAL;
+
+	if ((bitmap->iova & (bitmap->page_size - 1)) ||
+	    ((last_iova + 1) & (bitmap->page_size - 1)))
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Validate the requested range, then read (and optionally clear) the dirty
+ * data of @domain into the user bitmap while holding iova_rwsem for read.
+ *
+ * Returns 0 on success or a negative errno from validation or the walk.
+ */
+int iopt_read_and_clear_dirty_data(struct io_pagetable *iopt,
+				   struct iommu_domain *domain,
+				   unsigned long flags,
+				   struct iommu_hwpt_get_dirty_bitmap *bitmap)
+{
+	int rc = iommufd_check_iova_range(iopt, bitmap);
+
+	if (!rc) {
+		down_read(&iopt->iova_rwsem);
+		rc = iommu_read_and_clear_dirty(domain, iopt, flags, bitmap);
+		up_read(&iopt->iova_rwsem);
+	}
+
+	return rc;
+}
+
+/*
+ * Run the driver's read_and_clear_dirty op over every area of @iopt that
+ * has pages, without a user bitmap to report into, then sync the IOTLB.
+ * The caller must hold iova_rwsem for read.
+ *
+ * Returns 0 or the first driver error; the IOTLB sync happens either way.
+ */
+static int iopt_clear_dirty_data(struct io_pagetable *iopt,
+				 struct iommu_domain *domain)
+{
+	const struct iommu_dirty_ops *ops = domain->dirty_ops;
+	struct iommu_iotlb_gather gather;
+	struct iommu_dirty_bitmap dirty;
+	struct iopt_area *cur;
+	int rc = 0;
+
+	lockdep_assert_held_read(&iopt->iova_rwsem);
+
+	iommu_dirty_bitmap_init(&dirty, NULL, &gather);
+
+	for (cur = iopt_area_iter_first(iopt, 0, ULONG_MAX); cur;
+	     cur = iopt_area_iter_next(cur, 0, ULONG_MAX)) {
+		if (cur->pages) {
+			rc = ops->read_and_clear_dirty(domain,
+						       iopt_area_iova(cur),
+						       iopt_area_length(cur),
+						       0, &dirty);
+			if (rc)
+				break;
+		}
+	}
+
+	iommu_iotlb_sync(domain, &gather);
+	return rc;
+}
+
+/*
+ * Enable or disable dirty tracking on @domain under iova_rwsem (read).
+ * When enabling, existing dirty bits are cleared first; tracking is only
+ * switched if that clearing succeeded.
+ *
+ * Returns -EOPNOTSUPP when the domain has no dirty_ops, otherwise the
+ * result of the clear step or of ops->set_dirty_tracking().
+ */
+int iopt_set_dirty_tracking(struct io_pagetable *iopt,
+			    struct iommu_domain *domain, bool enable)
+{
+	const struct iommu_dirty_ops *ops = domain->dirty_ops;
+	int rc = 0;
+
+	if (!ops)
+		return -EOPNOTSUPP;
+
+	down_read(&iopt->iova_rwsem);
+
+	/* Clear dirty bits from PTEs to ensure a clean snapshot */
+	if (enable)
+		rc = iopt_clear_dirty_data(iopt, domain);
+
+	if (!rc)
+		rc = ops->set_dirty_tracking(domain, enable);
+
+	up_read(&iopt->iova_rwsem);
+	return rc;
+}
+
int iopt_get_pages(struct io_pagetable *iopt, unsigned long iova,
unsigned long length, struct list_head *pages_list)
{
@@ -1005,11 +1189,11 @@ static int iopt_area_split(struct iopt_area *area, unsigned long iova)
iopt_area_start_byte(area, new_start) & (alignment - 1))
return -EINVAL;
- lhs = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT);
+ lhs = iopt_area_alloc();
if (!lhs)
return -ENOMEM;
- rhs = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT);
+ rhs = iopt_area_alloc();
if (!rhs) {
rc = -ENOMEM;
goto err_free_lhs;
@@ -1048,6 +1232,16 @@ static int iopt_area_split(struct iopt_area *area, unsigned long iova)
if (WARN_ON(rc))
goto err_remove_lhs;
+ /*
+ * If the original area has filled a domain, domains_itree has to be
+ * updated.
+ */
+ if (area->storage_domain) {
+ interval_tree_remove(&area->pages_node, &pages->domains_itree);
+ interval_tree_insert(&lhs->pages_node, &pages->domains_itree);
+ interval_tree_insert(&rhs->pages_node, &pages->domains_itree);
+ }
+
lhs->storage_domain = area->storage_domain;
lhs->pages = area->pages;
rhs->storage_domain = area->storage_domain;
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index 2c58670011fe..a74cfefffbc6 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -8,6 +8,9 @@
#include <linux/xarray.h>
#include <linux/refcount.h>
#include <linux/uaccess.h>
+#include <linux/iommu.h>
+#include <linux/iova_bitmap.h>
+#include <uapi/linux/iommufd.h>
struct iommu_domain;
struct iommu_group;
@@ -70,6 +73,13 @@ int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova,
unsigned long length, unsigned long *unmapped);
int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped);
+int iopt_read_and_clear_dirty_data(struct io_pagetable *iopt,
+ struct iommu_domain *domain,
+ unsigned long flags,
+ struct iommu_hwpt_get_dirty_bitmap *bitmap);
+int iopt_set_dirty_tracking(struct io_pagetable *iopt,
+ struct iommu_domain *domain, bool enable);
+
void iommufd_access_notify_unmap(struct io_pagetable *iopt, unsigned long iova,
unsigned long length);
int iopt_table_add_domain(struct io_pagetable *iopt,
@@ -113,7 +123,8 @@ enum iommufd_object_type {
IOMMUFD_OBJ_NONE,
IOMMUFD_OBJ_ANY = IOMMUFD_OBJ_NONE,
IOMMUFD_OBJ_DEVICE,
- IOMMUFD_OBJ_HW_PAGETABLE,
+ IOMMUFD_OBJ_HWPT_PAGING,
+ IOMMUFD_OBJ_HWPT_NESTED,
IOMMUFD_OBJ_IOAS,
IOMMUFD_OBJ_ACCESS,
#ifdef CONFIG_IOMMUFD_TEST
@@ -171,7 +182,7 @@ struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx,
size_t size,
enum iommufd_object_type type);
-#define iommufd_object_alloc(ictx, ptr, type) \
+#define __iommufd_object_alloc(ictx, ptr, type, obj) \
container_of(_iommufd_object_alloc( \
ictx, \
sizeof(*(ptr)) + BUILD_BUG_ON_ZERO( \
@@ -180,6 +191,9 @@ struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx,
type), \
typeof(*(ptr)), obj)
+#define iommufd_object_alloc(ictx, ptr, type) \
+ __iommufd_object_alloc(ictx, ptr, type, obj)
+
/*
* The IO Address Space (IOAS) pagetable is a virtual page table backed by the
* io_pagetable object. It is a user controlled mapping of IOVA -> PFNs. The
@@ -222,6 +236,8 @@ int iommufd_option_rlimit_mode(struct iommu_option *cmd,
struct iommufd_ctx *ictx);
int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd);
+int iommufd_check_iova_range(struct io_pagetable *iopt,
+ struct iommu_hwpt_get_dirty_bitmap *bitmap);
/*
* A HW pagetable is called an iommu_domain inside the kernel. This user object
@@ -231,35 +247,75 @@ int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd);
*/
struct iommufd_hw_pagetable {
struct iommufd_object obj;
- struct iommufd_ioas *ioas;
struct iommu_domain *domain;
+};
+
+struct iommufd_hwpt_paging {
+ struct iommufd_hw_pagetable common;
+ struct iommufd_ioas *ioas;
bool auto_domain : 1;
bool enforce_cache_coherency : 1;
bool msi_cookie : 1;
+ bool nest_parent : 1;
/* Head at iommufd_ioas::hwpt_list */
struct list_head hwpt_item;
};
-struct iommufd_hw_pagetable *
-iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas,
- struct iommufd_device *idev, bool immediate_attach);
-int iommufd_hw_pagetable_enforce_cc(struct iommufd_hw_pagetable *hwpt);
+/*
+ * Nested HW pagetable object: the common HWPT part plus a pointer to the
+ * paging HWPT it is nested on.
+ */
+struct iommufd_hwpt_nested {
+ struct iommufd_hw_pagetable common;
+ struct iommufd_hwpt_paging *parent;
+};
+
+/* Returns true when @hwpt's object type is IOMMUFD_OBJ_HWPT_PAGING */
+static inline bool hwpt_is_paging(struct iommufd_hw_pagetable *hwpt)
+{
+ return hwpt->obj.type == IOMMUFD_OBJ_HWPT_PAGING;
+}
+
+/*
+ * Convert a common HWPT pointer to its containing iommufd_hwpt_paging.
+ * No type check is done here; callers are expected to know @hwpt is a
+ * paging HWPT (see hwpt_is_paging()).
+ */
+static inline struct iommufd_hwpt_paging *
+to_hwpt_paging(struct iommufd_hw_pagetable *hwpt)
+{
+ return container_of(hwpt, struct iommufd_hwpt_paging, common);
+}
+
+/*
+ * Look up object @id of type IOMMUFD_OBJ_HWPT_PAGING in ucmd->ictx and
+ * return it as an iommufd_hwpt_paging. Callers test the result with
+ * IS_ERR() and release it with iommufd_put_object().
+ *
+ * NOTE(review): an error pointer from iommufd_get_object() only survives
+ * container_of() unchanged because common.obj is the first member of
+ * struct iommufd_hwpt_paging - keep it that way.
+ */
+static inline struct iommufd_hwpt_paging *
+iommufd_get_hwpt_paging(struct iommufd_ucmd *ucmd, u32 id)
+{
+ return container_of(iommufd_get_object(ucmd->ictx, id,
+ IOMMUFD_OBJ_HWPT_PAGING),
+ struct iommufd_hwpt_paging, common.obj);
+}
+int iommufd_hwpt_set_dirty_tracking(struct iommufd_ucmd *ucmd);
+int iommufd_hwpt_get_dirty_bitmap(struct iommufd_ucmd *ucmd);
+
+struct iommufd_hwpt_paging *
+iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas,
+ struct iommufd_device *idev, u32 flags,
+ bool immediate_attach,
+ const struct iommu_user_data *user_data);
int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt,
struct iommufd_device *idev);
struct iommufd_hw_pagetable *
iommufd_hw_pagetable_detach(struct iommufd_device *idev);
-void iommufd_hw_pagetable_destroy(struct iommufd_object *obj);
-void iommufd_hw_pagetable_abort(struct iommufd_object *obj);
+void iommufd_hwpt_paging_destroy(struct iommufd_object *obj);
+void iommufd_hwpt_paging_abort(struct iommufd_object *obj);
+void iommufd_hwpt_nested_destroy(struct iommufd_object *obj);
+void iommufd_hwpt_nested_abort(struct iommufd_object *obj);
int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd);
static inline void iommufd_hw_pagetable_put(struct iommufd_ctx *ictx,
struct iommufd_hw_pagetable *hwpt)
{
- lockdep_assert_not_held(&hwpt->ioas->mutex);
- if (hwpt->auto_domain)
- iommufd_object_deref_user(ictx, &hwpt->obj);
- else
- refcount_dec(&hwpt->obj.users);
+ if (hwpt->obj.type == IOMMUFD_OBJ_HWPT_PAGING) {
+ struct iommufd_hwpt_paging *hwpt_paging = to_hwpt_paging(hwpt);
+
+ lockdep_assert_not_held(&hwpt_paging->ioas->mutex);
+
+ if (hwpt_paging->auto_domain) {
+ iommufd_object_deref_user(ictx, &hwpt->obj);
+ return;
+ }
+ }
+ refcount_dec(&hwpt->obj.users);
}
struct iommufd_group {
diff --git a/drivers/iommu/iommufd/iommufd_test.h b/drivers/iommu/iommufd/iommufd_test.h
index 3f3644375bf1..7910fbe1962d 100644
--- a/drivers/iommu/iommufd/iommufd_test.h
+++ b/drivers/iommu/iommufd/iommufd_test.h
@@ -19,6 +19,8 @@ enum {
IOMMU_TEST_OP_SET_TEMP_MEMORY_LIMIT,
IOMMU_TEST_OP_MOCK_DOMAIN_REPLACE,
IOMMU_TEST_OP_ACCESS_REPLACE_IOAS,
+ IOMMU_TEST_OP_MOCK_DOMAIN_FLAGS,
+ IOMMU_TEST_OP_DIRTY,
};
enum {
@@ -40,6 +42,15 @@ enum {
MOCK_FLAGS_ACCESS_CREATE_NEEDS_PIN_PAGES = 1 << 0,
};
+/*
+ * Mock device flags. With MOCK_FLAGS_DEVICE_NO_DIRTY set the mock driver
+ * refuses to attach the device to a domain that has dirty_ops and refuses
+ * to allocate a dirty tracking domain for it.
+ */
+enum {
+ MOCK_FLAGS_DEVICE_NO_DIRTY = 1 << 0,
+};
+
+/*
+ * MOCK_NESTED_DOMAIN_IOTLB_NUM sizes the iotlb[] array of a mock nested
+ * domain; MOCK_NESTED_DOMAIN_IOTLB_ID_MAX is presumably the highest valid
+ * index into it (NUM - 1) - TODO confirm against its users.
+ */
+enum {
+ MOCK_NESTED_DOMAIN_IOTLB_ID_MAX = 3,
+ MOCK_NESTED_DOMAIN_IOTLB_NUM = 4,
+};
+
struct iommu_test_cmd {
__u32 size;
__u32 op;
@@ -57,6 +68,13 @@ struct iommu_test_cmd {
__u32 out_idev_id;
} mock_domain;
struct {
+ __u32 out_stdev_id;
+ __u32 out_hwpt_id;
+ __u32 out_idev_id;
+ /* Expand mock_domain to set mock device flags */
+ __u32 dev_flags;
+ } mock_domain_flags;
+ struct {
__u32 pt_id;
} mock_domain_replace;
struct {
@@ -95,6 +113,14 @@ struct iommu_test_cmd {
struct {
__u32 ioas_id;
} access_replace_ioas;
+ struct {
+ __u32 flags;
+ __aligned_u64 iova;
+ __aligned_u64 length;
+ __aligned_u64 page_size;
+ __aligned_u64 uptr;
+ __aligned_u64 out_nr_dirty;
+ } dirty;
};
__u32 last;
};
@@ -109,4 +135,17 @@ struct iommu_test_hw_info {
__u32 test_reg;
};
+/* Should not be equal to any defined value in enum iommu_hwpt_data_type */
+#define IOMMU_HWPT_DATA_SELFTEST 0xdead
+#define IOMMU_TEST_IOTLB_DEFAULT 0xbadbeef
+
+/**
+ * struct iommu_hwpt_selftest - user configuration for a mock nested domain
+ *
+ * @iotlb: default mock iotlb value, IOMMU_TEST_IOTLB_DEFAULT; the mock
+ *         driver copies it into every iotlb slot of the new nested domain
+ */
+struct iommu_hwpt_selftest {
+ __u32 iotlb;
+};
+
#endif
diff --git a/drivers/iommu/iommufd/iova_bitmap.c b/drivers/iommu/iommufd/iova_bitmap.c
new file mode 100644
index 000000000000..0a92c9eeaf7f
--- /dev/null
+++ b/drivers/iommu/iommufd/iova_bitmap.c
@@ -0,0 +1,426 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2022, Oracle and/or its affiliates.
+ * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved
+ */
+#include <linux/iova_bitmap.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+
+#define BITS_PER_PAGE (PAGE_SIZE * BITS_PER_BYTE)
+
+/*
+ * struct iova_bitmap_map - A bitmap representing an IOVA range
+ *
+ * Main data structure for tracking mapped user pages of bitmap data.
+ *
+ * For example, something recording dirty IOVAs is given a struct
+ * iova_bitmap, the general structure for iterating over the total IOVA
+ * range. The struct iova_bitmap_map, though, represents the subset of
+ * said IOVA space that is currently pinned by its parent structure
+ * (struct iova_bitmap).
+ *
+ * The user does not need to know the exact location of the bits in the
+ * bitmap. From the user's perspective the only API available is
+ * iova_bitmap_set(), which records an IOVA *range* in the bitmap by
+ * setting the corresponding bits.
+ *
+ * The bitmap is an array of u64 where each bit represents an IOVA range
+ * of (1 << pgshift) bytes. With bit = (iova - base_iova) >> pgshift, where
+ * base_iova is the IOVA of bit 0, the bitmap data to be set is:
+ *
+ * data[bit / 64] & (1ULL << (bit % 64))
+ */
+struct iova_bitmap_map {
+ /* base IOVA representing bit 0 of the first page */
+ unsigned long iova;
+
+ /* page size order that each bit granules to */
+ unsigned long pgshift;
+
+ /* page offset of the first user page pinned */
+ unsigned long pgoff;
+
+ /* number of pages pinned */
+ unsigned long npages;
+
+ /* pinned pages representing the bitmap data */
+ struct page **pages;
+};
+
+/*
+ * struct iova_bitmap - The IOVA bitmap object
+ *
+ * Main data structure for iterating over the bitmap data.
+ *
+ * Abstracts the pinning work and iterates in IOVA ranges.
+ * It uses a windowing scheme and pins the bitmap in relatively
+ * big ranges, as described below.
+ *
+ * The bitmap object uses one base page to store all the pinned pages
+ * pointers related to the bitmap. For sizeof(struct page*) == 8 it stores
+ * 512 struct page pointers which, if the base page size is 4K, means
+ * 2M of bitmap data is pinned at a time. If the iova_bitmap page size is
+ * also 4K then the range window to iterate is 64G.
+ *
+ * For example iterating on a total IOVA range of 4G..128G, it will walk
+ * through this set of ranges:
+ *
+ * 4G - 68G-1 (64G)
+ * 68G - 128G-1 (60G)
+ *
+ * An example of the APIs on how to use/iterate over the IOVA bitmap:
+ *
+ * bitmap = iova_bitmap_alloc(iova, length, page_size, data);
+ * if (IS_ERR(bitmap))
+ * return PTR_ERR(bitmap);
+ *
+ * ret = iova_bitmap_for_each(bitmap, arg, dirty_reporter_fn);
+ *
+ * iova_bitmap_free(bitmap);
+ *
+ * Each iteration of the @dirty_reporter_fn is called with a unique @iova
+ * and @length argument, indicating the current range available through the
+ * iova_bitmap. The @dirty_reporter_fn uses iova_bitmap_set() to mark dirty
+ * areas (@iova_length) within that provided range, as follows:
+ *
+ * iova_bitmap_set(bitmap, iova, iova_length);
+ *
+ * The internals of the object use an index @mapped_base_index that indexes
+ * which u64 word of the bitmap is mapped, up to @mapped_total_index.
+ * It keeps being incremented until @mapped_total_index is reached, while
+ * mapping at most PAGE_SIZE / sizeof(struct page*) pages at a time.
+ *
+ * The IOVA bitmap is usually located on what tracks DMA mapped ranges or
+ * some form of IOVA range tracking that correlates to the user passed
+ * bitmap.
+ */
+struct iova_bitmap {
+ /* IOVA range representing the currently mapped bitmap data */
+ struct iova_bitmap_map mapped;
+
+ /* userspace address of the bitmap */
+ u64 __user *bitmap;
+
+ /* u64 index that @mapped points to */
+ unsigned long mapped_base_index;
+
+ /* how many u64 can we walk in total */
+ unsigned long mapped_total_index;
+
+ /* base IOVA of the whole bitmap */
+ unsigned long iova;
+
+ /* length of the IOVA range for the whole bitmap */
+ size_t length;
+};
+
+/*
+ * Converts a relative IOVA to a bitmap index.
+ * This function provides the index into the u64 array (bitmap::bitmap)
+ * for a given IOVA offset.
+ * Relative IOVA means relative to the bitmap::mapped base IOVA
+ * (stored in mapped::iova). All computations in this file are done using
+ * relative IOVAs and thus avoid an extra subtraction against mapped::iova.
+ * The user API iova_bitmap_set() always uses regular absolute IOVAs.
+ */
+static unsigned long iova_bitmap_offset_to_index(struct iova_bitmap *bitmap,
+						 unsigned long iova)
+{
+	/*
+	 * Shift the IOVA down instead of dividing by (1 << pgshift): the
+	 * literal 1 is an int, so building the page size that way is
+	 * undefined (or sign-extends) once pgshift reaches 31.
+	 */
+	return (iova >> bitmap->mapped.pgshift) /
+	       BITS_PER_TYPE(*bitmap->bitmap);
+}
+
+/*
+ * Converts a u64 word index of the bitmap into the *relative* IOVA that
+ * the first bit of that word stands for.
+ */
+static unsigned long iova_bitmap_index_to_offset(struct iova_bitmap *bitmap,
+						 unsigned long index)
+{
+	unsigned long first_bit = index * BITS_PER_TYPE(*bitmap->bitmap);
+
+	return first_bit << bitmap->mapped.pgshift;
+}
+
+/*
+ * Returns the absolute IOVA at which the currently mapped window starts,
+ * i.e. the bitmap base IOVA plus the offset of @mapped_base_index.
+ */
+static unsigned long iova_bitmap_mapped_iova(struct iova_bitmap *bitmap)
+{
+	return bitmap->iova +
+	       iova_bitmap_index_to_offset(bitmap, bitmap->mapped_base_index);
+}
+
+/*
+ * Pins the user pages backing the bitmap for the current range window.
+ * Internal to the IOVA bitmap; used when allocating the bitmap and when
+ * advancing @mapped_base_index.
+ *
+ * Returns 0 on success or -EFAULT when no page could be pinned.
+ */
+static int iova_bitmap_get(struct iova_bitmap *bitmap)
+{
+	struct iova_bitmap_map *mapped = &bitmap->mapped;
+	unsigned long words_left, want;
+	u64 __user *uaddr;
+	long pinned;
+
+	/*
+	 * Words before @mapped_base_index have been consumed already. What
+	 * is left, @mapped_base_index .. @mapped_total_index-1, is pinned
+	 * now, subject to the page cap below.
+	 */
+	words_left = bitmap->mapped_total_index - bitmap->mapped_base_index;
+	want = DIV_ROUND_UP(words_left * sizeof(*bitmap->bitmap), PAGE_SIZE);
+
+	/*
+	 * Never pin more pages than 'struct page' pointers fit in the one
+	 * base page backing mapped->pages (2M of bitmap data on x86).
+	 */
+	want = min(want, PAGE_SIZE / sizeof(struct page *));
+
+	/* u64 pointer arithmetic: user address of the first word to pin */
+	uaddr = bitmap->bitmap + bitmap->mapped_base_index;
+
+	pinned = pin_user_pages_fast((unsigned long)uaddr, want,
+				     FOLL_WRITE, mapped->pages);
+	if (pinned <= 0)
+		return -EFAULT;
+
+	mapped->npages = (unsigned long)pinned;
+	/* IOVA represented by bit 0 of the first pinned word */
+	mapped->iova = iova_bitmap_mapped_iova(bitmap);
+
+	/*
+	 * Byte offset of that first word inside the first pinned page;
+	 * non-zero when the user bitmap is not PAGE_SIZE aligned.
+	 */
+	mapped->pgoff = offset_in_page(uaddr);
+	return 0;
+}
+
+/*
+ * Unpins the currently pinned bitmap user pages, if any, and resets
+ * @npages. Pinning and unpinning are hidden from API users; this runs when
+ * the index advances or the bitmap is freed.
+ */
+static void iova_bitmap_put(struct iova_bitmap *bitmap)
+{
+	struct iova_bitmap_map *mapped = &bitmap->mapped;
+
+	if (!mapped->npages)
+		return;
+
+	unpin_user_pages(mapped->pages, mapped->npages);
+	mapped->npages = 0;
+}
+
+/**
+ * iova_bitmap_alloc() - Allocates an IOVA bitmap object
+ * @iova: Start address of the IOVA range
+ * @length: Length of the IOVA range
+ * @page_size: Page size of the IOVA bitmap. It defines the granularity
+ *             that each bit represents
+ * @data: Userspace address of the bitmap
+ *
+ * Allocates the object, fills in its fields, allocates the page that
+ * holds the pinned page pointers and pins the first window of @data.
+ *
+ * Return: A pointer to a newly allocated struct iova_bitmap
+ * or ERR_PTR() on error.
+ */
+struct iova_bitmap *iova_bitmap_alloc(unsigned long iova, size_t length,
+				      unsigned long page_size, u64 __user *data)
+{
+	struct iova_bitmap *bitmap;
+	struct iova_bitmap_map *mapped;
+	int rc;
+
+	bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL);
+	if (!bitmap)
+		return ERR_PTR(-ENOMEM);
+
+	mapped = &bitmap->mapped;
+	bitmap->bitmap = data;
+	bitmap->iova = iova;
+	bitmap->length = length;
+	mapped->iova = iova;
+
+	/* pgshift must be set before any offset/index conversion */
+	mapped->pgshift = __ffs(page_size);
+	bitmap->mapped_total_index =
+		iova_bitmap_offset_to_index(bitmap, length - 1) + 1;
+
+	mapped->pages = (struct page **)__get_free_page(GFP_KERNEL);
+	rc = mapped->pages ? iova_bitmap_get(bitmap) : -ENOMEM;
+	if (!rc)
+		return bitmap;
+
+	iova_bitmap_free(bitmap);
+	return ERR_PTR(rc);
+}
+EXPORT_SYMBOL_NS_GPL(iova_bitmap_alloc, IOMMUFD);
+
+/**
+ * iova_bitmap_free() - Frees an IOVA bitmap object
+ * @bitmap: IOVA bitmap to free
+ *
+ * Unpins any pinned user pages, releases the page holding the page
+ * pointer array and frees the object itself.
+ */
+void iova_bitmap_free(struct iova_bitmap *bitmap)
+{
+	iova_bitmap_put(bitmap);
+
+	if (bitmap->mapped.pages) {
+		free_page((unsigned long)bitmap->mapped.pages);
+		bitmap->mapped.pages = NULL;
+	}
+
+	kfree(bitmap);
+}
+EXPORT_SYMBOL_NS_GPL(iova_bitmap_free, IOMMUFD);
+
+/*
+ * Returns how many u64 bitmap words can be processed with the currently
+ * pinned pages: the words left up to @mapped_total_index, capped by the
+ * number of words the pinned pages hold past the first page's offset.
+ */
+static unsigned long iova_bitmap_mapped_remaining(struct iova_bitmap *bitmap)
+{
+	unsigned long pinned_bytes, pinned_words, words_left;
+
+	pinned_bytes = (bitmap->mapped.npages << PAGE_SHIFT) -
+		       bitmap->mapped.pgoff;
+	pinned_words = pinned_bytes / sizeof(*bitmap->bitmap);
+	words_left = bitmap->mapped_total_index - bitmap->mapped_base_index;
+
+	return min_t(unsigned long, words_left, pinned_words);
+}
+
+/*
+ * Returns the length, in bytes of IOVA space, of the currently mapped
+ * window, clamped so that it does not extend past the last IOVA of the
+ * whole bitmap.
+ */
+static unsigned long iova_bitmap_mapped_length(struct iova_bitmap *bitmap)
+{
+ unsigned long max_iova = bitmap->iova + bitmap->length - 1;
+ unsigned long iova = iova_bitmap_mapped_iova(bitmap);
+ unsigned long remaining;
+
+ /*
+ * iova_bitmap_mapped_remaining() returns a number of indexes which
+ * when converted to IOVA gives us a max length that the bitmap
+ * pinned data can cover. Afterwards, that is capped to
+ * only cover the IOVA range @bitmap::iova ..
+ * @bitmap::iova + @bitmap::length - 1.
+ */
+ remaining = iova_bitmap_index_to_offset(bitmap,
+ iova_bitmap_mapped_remaining(bitmap));
+
+ /* trim whatever part of the last u64 word lies beyond max_iova */
+ if (iova + remaining - 1 > max_iova)
+ remaining -= ((iova + remaining - 1) - max_iova);
+
+ return remaining;
+}
+
+/*
+ * Returns true if there is no more data to iterate, i.e. once
+ * @mapped_base_index has reached @mapped_total_index.
+ */
+static bool iova_bitmap_done(struct iova_bitmap *bitmap)
+{
+ return bitmap->mapped_base_index >= bitmap->mapped_total_index;
+}
+
+/*
+ * Advances to the next range, releases the current pinned
+ * pages and pins the next set of bitmap pages.
+ * Returns 0 on success or otherwise errno.
+ */
+static int iova_bitmap_advance(struct iova_bitmap *bitmap)
+{
+ /*
+ * Both values must be computed before iova_bitmap_put(): the mapped
+ * length depends on mapped.npages, which the put resets to 0.
+ */
+ unsigned long iova = iova_bitmap_mapped_length(bitmap) - 1;
+ unsigned long count = iova_bitmap_offset_to_index(bitmap, iova) + 1;
+
+ /* skip the u64 words covered by the window just processed */
+ bitmap->mapped_base_index += count;
+
+ iova_bitmap_put(bitmap);
+ if (iova_bitmap_done(bitmap))
+ return 0;
+
+ /* When advancing the index we pin the next set of bitmap pages */
+ return iova_bitmap_get(bitmap);
+}
+
+/**
+ * iova_bitmap_for_each() - Iterates over the bitmap
+ * @bitmap: IOVA bitmap to iterate
+ * @opaque: Additional argument to pass to the callback
+ * @fn: Function that gets called for each IOVA range
+ *
+ * Calls @fn once per mapped window with that window's base IOVA and
+ * length, then moves on to the next window, hiding the pinning of bitmap
+ * user pages and their translation into IOVA ranges from the caller.
+ *
+ * Return: 0 on success, and an error on failure either upon
+ * iteration or when the callback returns an error.
+ */
+int iova_bitmap_for_each(struct iova_bitmap *bitmap, void *opaque,
+			 iova_bitmap_fn_t fn)
+{
+	int rc = 0;
+
+	while (!iova_bitmap_done(bitmap)) {
+		rc = fn(bitmap, iova_bitmap_mapped_iova(bitmap),
+			iova_bitmap_mapped_length(bitmap), opaque);
+		if (rc)
+			break;
+
+		rc = iova_bitmap_advance(bitmap);
+		if (rc)
+			break;
+	}
+
+	return rc;
+}
+EXPORT_SYMBOL_NS_GPL(iova_bitmap_for_each, IOMMUFD);
+
+/**
+ * iova_bitmap_set() - Records an IOVA range in bitmap
+ * @bitmap: IOVA bitmap
+ * @iova: IOVA to start
+ * @length: IOVA range length
+ *
+ * Set the bits corresponding to the range [iova .. iova+length-1] in
+ * the user bitmap.
+ *
+ * NOTE(review): no bounds check is done against the pinned window here;
+ * the range is presumably expected to lie within the window handed to the
+ * iova_bitmap_for_each() callback - confirm with callers.
+ */
+void iova_bitmap_set(struct iova_bitmap *bitmap,
+ unsigned long iova, size_t length)
+{
+ struct iova_bitmap_map *mapped = &bitmap->mapped;
+ /*
+ * Bit positions are counted from the start of the first pinned page:
+ * the IOVA's page number relative to mapped->iova, plus the bits
+ * skipped because the bitmap starts pgoff bytes into that page.
+ */
+ unsigned long cur_bit = ((iova - mapped->iova) >>
+ mapped->pgshift) + mapped->pgoff * BITS_PER_BYTE;
+ unsigned long last_bit = (((iova + length - 1) - mapped->iova) >>
+ mapped->pgshift) + mapped->pgoff * BITS_PER_BYTE;
+
+ do {
+ /* set at most the bits left in the current pinned page */
+ unsigned int page_idx = cur_bit / BITS_PER_PAGE;
+ unsigned int offset = cur_bit % BITS_PER_PAGE;
+ unsigned int nbits = min(BITS_PER_PAGE - offset,
+ last_bit - cur_bit + 1);
+ void *kaddr;
+
+ kaddr = kmap_local_page(mapped->pages[page_idx]);
+ bitmap_set(kaddr, offset, nbits);
+ kunmap_local(kaddr);
+ cur_bit += nbits;
+ } while (cur_bit <= last_bit);
+}
+EXPORT_SYMBOL_NS_GPL(iova_bitmap_set, IOMMUFD);
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index e71523cbd0de..45b9d40773b1 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -307,6 +307,8 @@ union ucmd_buffer {
struct iommu_destroy destroy;
struct iommu_hw_info info;
struct iommu_hwpt_alloc hwpt;
+ struct iommu_hwpt_get_dirty_bitmap get_dirty_bitmap;
+ struct iommu_hwpt_set_dirty_tracking set_dirty_tracking;
struct iommu_ioas_alloc alloc;
struct iommu_ioas_allow_iovas allow_iovas;
struct iommu_ioas_copy ioas_copy;
@@ -342,6 +344,10 @@ static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = {
__reserved),
IOCTL_OP(IOMMU_HWPT_ALLOC, iommufd_hwpt_alloc, struct iommu_hwpt_alloc,
__reserved),
+ IOCTL_OP(IOMMU_HWPT_GET_DIRTY_BITMAP, iommufd_hwpt_get_dirty_bitmap,
+ struct iommu_hwpt_get_dirty_bitmap, data),
+ IOCTL_OP(IOMMU_HWPT_SET_DIRTY_TRACKING, iommufd_hwpt_set_dirty_tracking,
+ struct iommu_hwpt_set_dirty_tracking, __reserved),
IOCTL_OP(IOMMU_IOAS_ALLOC, iommufd_ioas_alloc_ioctl,
struct iommu_ioas_alloc, out_ioas_id),
IOCTL_OP(IOMMU_IOAS_ALLOW_IOVAS, iommufd_ioas_allow_iovas,
@@ -482,9 +488,13 @@ static const struct iommufd_object_ops iommufd_object_ops[] = {
[IOMMUFD_OBJ_IOAS] = {
.destroy = iommufd_ioas_destroy,
},
- [IOMMUFD_OBJ_HW_PAGETABLE] = {
- .destroy = iommufd_hw_pagetable_destroy,
- .abort = iommufd_hw_pagetable_abort,
+ [IOMMUFD_OBJ_HWPT_PAGING] = {
+ .destroy = iommufd_hwpt_paging_destroy,
+ .abort = iommufd_hwpt_paging_abort,
+ },
+ [IOMMUFD_OBJ_HWPT_NESTED] = {
+ .destroy = iommufd_hwpt_nested_destroy,
+ .abort = iommufd_hwpt_nested_abort,
},
#ifdef CONFIG_IOMMUFD_TEST
[IOMMUFD_OBJ_SELFTEST] = {
@@ -552,5 +562,6 @@ MODULE_ALIAS_MISCDEV(VFIO_MINOR);
MODULE_ALIAS("devname:vfio/vfio");
#endif
MODULE_IMPORT_NS(IOMMUFD_INTERNAL);
+MODULE_IMPORT_NS(IOMMUFD);
MODULE_DESCRIPTION("I/O Address Space Management for passthrough devices");
MODULE_LICENSE("GPL");
diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c
index 8d9aa297c117..528f356238b3 100644
--- a/drivers/iommu/iommufd/pages.c
+++ b/drivers/iommu/iommufd/pages.c
@@ -1507,6 +1507,8 @@ void iopt_area_unfill_domains(struct iopt_area *area, struct iopt_pages *pages)
area, domain, iopt_area_index(area),
iopt_area_last_index(area));
+ if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
+ WARN_ON(RB_EMPTY_NODE(&area->pages_node.rb));
interval_tree_remove(&area->pages_node, &pages->domains_itree);
iopt_area_unfill_domain(area, pages, area->storage_domain);
area->storage_domain = NULL;
diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c
index 56506d5753f1..d43a87737c1e 100644
--- a/drivers/iommu/iommufd/selftest.c
+++ b/drivers/iommu/iommufd/selftest.c
@@ -20,10 +20,13 @@
static DECLARE_FAULT_ATTR(fail_iommufd);
static struct dentry *dbgfs_root;
static struct platform_device *selftest_iommu_dev;
+static const struct iommu_ops mock_ops;
+static struct iommu_domain_ops domain_nested_ops;
size_t iommufd_test_memory_limit = 65536;
enum {
+ MOCK_DIRTY_TRACK = 1,
MOCK_IO_PAGE_SIZE = PAGE_SIZE / 2,
/*
@@ -36,6 +39,7 @@ enum {
_MOCK_PFN_START = MOCK_PFN_MASK + 1,
MOCK_PFN_START_IOVA = _MOCK_PFN_START,
MOCK_PFN_LAST_IOVA = _MOCK_PFN_START,
+ MOCK_PFN_DIRTY_IOVA = _MOCK_PFN_START << 1,
};
/*
@@ -86,16 +90,24 @@ void iommufd_test_syz_conv_iova_id(struct iommufd_ucmd *ucmd,
}
struct mock_iommu_domain {
+ unsigned long flags;
struct iommu_domain domain;
struct xarray pfns;
};
+/* Mock nested domain: points at its parent mock domain and has a mock IOTLB */
+struct mock_iommu_domain_nested {
+ struct iommu_domain domain;
+ struct mock_iommu_domain *parent;
+ /* every slot is initialised from the user supplied iotlb value */
+ u32 iotlb[MOCK_NESTED_DOMAIN_IOTLB_NUM];
+};
+
enum selftest_obj_type {
TYPE_IDEV,
};
struct mock_dev {
struct device dev;
+ unsigned long flags;
};
struct selftest_obj {
@@ -118,6 +130,11 @@ static void mock_domain_blocking_free(struct iommu_domain *domain)
static int mock_domain_nop_attach(struct iommu_domain *domain,
struct device *dev)
{
+ struct mock_dev *mdev = container_of(dev, struct mock_dev, dev);
+
+ if (domain->dirty_ops && (mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY))
+ return -EINVAL;
+
return 0;
}
@@ -146,15 +163,70 @@ static void *mock_domain_hw_info(struct device *dev, u32 *length, u32 *type)
return info;
}
-static struct iommu_domain *mock_domain_alloc(unsigned int iommu_domain_type)
+static int mock_domain_set_dirty_tracking(struct iommu_domain *domain,
+ bool enable)
{
- struct mock_iommu_domain *mock;
+ struct mock_iommu_domain *mock =
+ container_of(domain, struct mock_iommu_domain, domain);
+ unsigned long flags = mock->flags;
- if (iommu_domain_type == IOMMU_DOMAIN_BLOCKED)
- return &mock_blocking_domain;
+ if (enable && !domain->dirty_ops)
+ return -EINVAL;
- if (iommu_domain_type != IOMMU_DOMAIN_UNMANAGED)
- return NULL;
+ /* No change? */
+ if (!(enable ^ !!(flags & MOCK_DIRTY_TRACK)))
+ return 0;
+
+ flags = (enable ? flags | MOCK_DIRTY_TRACK : flags & ~MOCK_DIRTY_TRACK);
+
+ mock->flags = flags;
+ return 0;
+}
+
+/*
+ * Mock read_and_clear_dirty op. Visits every MOCK_IO_PAGE_SIZE page in
+ * [iova, iova + size) and, for each pfn entry carrying
+ * MOCK_PFN_DIRTY_IOVA, records it in @dirty; the dirty marker is removed
+ * from the entry unless IOMMU_DIRTY_NO_CLEAR is set.
+ *
+ * Returns -EINVAL when a bitmap is supplied while dirty tracking is off,
+ * otherwise 0.
+ */
+static int mock_domain_read_and_clear_dirty(struct iommu_domain *domain,
+					    unsigned long iova, size_t size,
+					    unsigned long flags,
+					    struct iommu_dirty_bitmap *dirty)
+{
+	struct mock_iommu_domain *mock =
+		container_of(domain, struct mock_iommu_domain, domain);
+	unsigned long npages = size / MOCK_IO_PAGE_SIZE;
+	unsigned long n;
+
+	if (!(mock->flags & MOCK_DIRTY_TRACK) && dirty->bitmap)
+		return -EINVAL;
+
+	for (n = 0; n < npages; n++) {
+		unsigned long cur = iova + n * MOCK_IO_PAGE_SIZE;
+		unsigned long slot = cur / MOCK_IO_PAGE_SIZE;
+		void *ent = xa_load(&mock->pfns, slot);
+
+		if (!ent || !(xa_to_value(ent) & MOCK_PFN_DIRTY_IOVA))
+			continue;
+
+		/* Clear dirty */
+		if (!(flags & IOMMU_DIRTY_NO_CLEAR)) {
+			unsigned long val;
+			void *old;
+
+			val = xa_to_value(ent) & ~MOCK_PFN_DIRTY_IOVA;
+			old = xa_store(&mock->pfns, slot, xa_mk_value(val),
+				       GFP_KERNEL);
+			WARN_ON_ONCE(ent != old);
+		}
+		iommu_dirty_bitmap_record(dirty, cur, MOCK_IO_PAGE_SIZE);
+	}
+
+	return 0;
+}
+
+/*
+ * Dirty tracking ops of the mock driver. Kept file-local so the generic
+ * name does not leak into the kernel's global symbol namespace.
+ */
+static const struct iommu_dirty_ops dirty_ops = {
+	.set_dirty_tracking = mock_domain_set_dirty_tracking,
+	.read_and_clear_dirty = mock_domain_read_and_clear_dirty,
+};
+
+static struct iommu_domain *mock_domain_alloc_paging(struct device *dev)
+{
+ struct mock_iommu_domain *mock;
mock = kzalloc(sizeof(*mock), GFP_KERNEL);
if (!mock)
@@ -162,10 +234,87 @@ static struct iommu_domain *mock_domain_alloc(unsigned int iommu_domain_type)
mock->domain.geometry.aperture_start = MOCK_APERTURE_START;
mock->domain.geometry.aperture_end = MOCK_APERTURE_LAST;
mock->domain.pgsize_bitmap = MOCK_IO_PAGE_SIZE;
+ mock->domain.ops = mock_ops.default_domain_ops;
+ mock->domain.type = IOMMU_DOMAIN_UNMANAGED;
xa_init(&mock->pfns);
return &mock->domain;
}
+/*
+ * Allocate a mock nested domain stacked on @mock_parent. Every one of the
+ * MOCK_NESTED_DOMAIN_IOTLB_NUM iotlb slots starts out with the value taken
+ * from @user_cfg->iotlb.
+ *
+ * Returns the embedded iommu_domain, or ERR_PTR(-ENOMEM).
+ */
+static struct iommu_domain *
+__mock_domain_alloc_nested(struct mock_iommu_domain *mock_parent,
+			   const struct iommu_hwpt_selftest *user_cfg)
+{
+	struct mock_iommu_domain_nested *nested;
+	int slot = 0;
+
+	nested = kzalloc(sizeof(*nested), GFP_KERNEL);
+	if (!nested)
+		return ERR_PTR(-ENOMEM);
+
+	nested->domain.type = IOMMU_DOMAIN_NESTED;
+	nested->domain.ops = &domain_nested_ops;
+	nested->parent = mock_parent;
+	while (slot < MOCK_NESTED_DOMAIN_IOTLB_NUM)
+		nested->iotlb[slot++] = user_cfg->iotlb;
+
+	return &nested->domain;
+}
+
+/*
+ * domain_alloc op: hand out the shared blocking domain for
+ * IOMMU_DOMAIN_BLOCKED, a fresh paging domain for IOMMU_DOMAIN_UNMANAGED,
+ * and NULL for every other domain type.
+ */
+static struct iommu_domain *mock_domain_alloc(unsigned int iommu_domain_type)
+{
+	switch (iommu_domain_type) {
+	case IOMMU_DOMAIN_BLOCKED:
+		return &mock_blocking_domain;
+	case IOMMU_DOMAIN_UNMANAGED:
+		return mock_domain_alloc_paging(NULL);
+	default:
+		return NULL;
+	}
+}
+
+/*
+ * domain_alloc_user op of the mock driver.
+ *
+ * Without @parent a paging domain is allocated. Only
+ * IOMMU_HWPT_ALLOC_NEST_PARENT and IOMMU_HWPT_ALLOC_DIRTY_TRACKING are
+ * accepted in @flags, @user_data must be NULL, and dirty tracking is refused
+ * for devices flagged MOCK_FLAGS_DEVICE_NO_DIRTY. When dirty tracking is
+ * requested the new domain gets dirty_ops installed.
+ *
+ * With @parent a nested domain is allocated on top of it. @flags must be 0,
+ * @user_data must be of type IOMMU_HWPT_DATA_SELFTEST and @parent must be a
+ * mock paging domain.
+ *
+ * Returns the new domain or an ERR_PTR(): -EOPNOTSUPP for unsupported
+ * flags/user data, -EINVAL for a foreign parent, -ENOMEM on allocation
+ * failure, or the error from copying the user structure.
+ */
+static struct iommu_domain *
+mock_domain_alloc_user(struct device *dev, u32 flags,
+		       struct iommu_domain *parent,
+		       const struct iommu_user_data *user_data)
+{
+	struct mock_iommu_domain *mock_parent;
+	struct iommu_hwpt_selftest user_cfg;
+	int rc;
+
+	/* must be mock_domain */
+	if (!parent) {
+		struct mock_dev *mdev = container_of(dev, struct mock_dev, dev);
+		bool has_dirty_flag = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
+		bool no_dirty_ops = mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY;
+		struct iommu_domain *domain;
+
+		if (flags & (~(IOMMU_HWPT_ALLOC_NEST_PARENT |
+			       IOMMU_HWPT_ALLOC_DIRTY_TRACKING)))
+			return ERR_PTR(-EOPNOTSUPP);
+		if (user_data || (has_dirty_flag && no_dirty_ops))
+			return ERR_PTR(-EOPNOTSUPP);
+		domain = mock_domain_alloc_paging(NULL);
+		if (!domain)
+			return ERR_PTR(-ENOMEM);
+		if (has_dirty_flag)
+			domain->dirty_ops = &dirty_ops;
+		return domain;
+	}
+
+	/*
+	 * must be mock_domain_nested. user_data may legitimately be NULL (the
+	 * paging branch above tests for it), so check before dereferencing.
+	 */
+	if (!user_data || user_data->type != IOMMU_HWPT_DATA_SELFTEST || flags)
+		return ERR_PTR(-EOPNOTSUPP);
+	/* parent is known to be non-NULL here; only its ops need checking */
+	if (parent->ops != mock_ops.default_domain_ops)
+		return ERR_PTR(-EINVAL);
+
+	mock_parent = container_of(parent, struct mock_iommu_domain, domain);
+
+	rc = iommu_copy_struct_from_user(&user_cfg, user_data,
+					 IOMMU_HWPT_DATA_SELFTEST, iotlb);
+	if (rc)
+		return ERR_PTR(rc);
+
+	return __mock_domain_alloc_nested(mock_parent, &user_cfg);
+}
+
static void mock_domain_free(struct iommu_domain *domain)
{
struct mock_iommu_domain *mock =
@@ -243,7 +392,7 @@ static size_t mock_domain_unmap_pages(struct iommu_domain *domain,
for (cur = 0; cur != pgsize; cur += MOCK_IO_PAGE_SIZE) {
ent = xa_erase(&mock->pfns, iova / MOCK_IO_PAGE_SIZE);
- WARN_ON(!ent);
+
/*
* iommufd generates unmaps that must be a strict
* superset of the map's performed. So every starting
@@ -253,13 +402,13 @@ static size_t mock_domain_unmap_pages(struct iommu_domain *domain,
* passed to map_pages
*/
if (first) {
- WARN_ON(!(xa_to_value(ent) &
- MOCK_PFN_START_IOVA));
+ WARN_ON(ent && !(xa_to_value(ent) &
+ MOCK_PFN_START_IOVA));
first = false;
}
if (pgcount == 1 && cur + MOCK_IO_PAGE_SIZE == pgsize)
- WARN_ON(!(xa_to_value(ent) &
- MOCK_PFN_LAST_IOVA));
+ WARN_ON(ent && !(xa_to_value(ent) &
+ MOCK_PFN_LAST_IOVA));
iova += MOCK_IO_PAGE_SIZE;
ret += MOCK_IO_PAGE_SIZE;
@@ -283,7 +432,18 @@ static phys_addr_t mock_domain_iova_to_phys(struct iommu_domain *domain,
+/*
+ * capable op: cache coherency is always reported; dirty tracking is reported
+ * unless the mock device was created with MOCK_FLAGS_DEVICE_NO_DIRTY. Any
+ * other capability is reported as unsupported.
+ */
static bool mock_domain_capable(struct device *dev, enum iommu_cap cap)
{
- return cap == IOMMU_CAP_CACHE_COHERENCY;
+	if (cap == IOMMU_CAP_CACHE_COHERENCY)
+		return true;
+
+	if (cap == IOMMU_CAP_DIRTY_TRACKING) {
+		struct mock_dev *mdev = container_of(dev, struct mock_dev, dev);
+
+		return !(mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY);
+	}
+
+	return false;
}
static void mock_domain_set_plaform_dma_ops(struct device *dev)
@@ -307,6 +467,7 @@ static const struct iommu_ops mock_ops = {
.pgsize_bitmap = MOCK_IO_PAGE_SIZE,
.hw_info = mock_domain_hw_info,
.domain_alloc = mock_domain_alloc,
+ .domain_alloc_user = mock_domain_alloc_user,
.capable = mock_domain_capable,
.set_platform_dma_ops = mock_domain_set_plaform_dma_ops,
.device_group = generic_device_group,
@@ -321,19 +482,41 @@ static const struct iommu_ops mock_ops = {
},
};
+/* free op for nested domains: release the containing mock_iommu_domain_nested */
+static void mock_domain_free_nested(struct iommu_domain *domain)
+{
+	kfree(container_of(domain, struct mock_iommu_domain_nested, domain));
+}
+
+/*
+ * Domain ops installed on mock nested domains (see
+ * __mock_domain_alloc_nested()); the ops pointer is also what
+ * get_md_pagetable_nested() compares against to recognise such a domain.
+ */
+static struct iommu_domain_ops domain_nested_ops = {
+ .free = mock_domain_free_nested,
+ .attach_dev = mock_domain_nop_attach,
+};
+
static inline struct iommufd_hw_pagetable *
-get_md_pagetable(struct iommufd_ucmd *ucmd, u32 mockpt_id,
- struct mock_iommu_domain **mock)
+__get_md_pagetable(struct iommufd_ucmd *ucmd, u32 mockpt_id, u32 hwpt_type)
{
- struct iommufd_hw_pagetable *hwpt;
struct iommufd_object *obj;
- obj = iommufd_get_object(ucmd->ictx, mockpt_id,
- IOMMUFD_OBJ_HW_PAGETABLE);
+ obj = iommufd_get_object(ucmd->ictx, mockpt_id, hwpt_type);
if (IS_ERR(obj))
return ERR_CAST(obj);
- hwpt = container_of(obj, struct iommufd_hw_pagetable, obj);
- if (hwpt->domain->ops != mock_ops.default_domain_ops) {
+ return container_of(obj, struct iommufd_hw_pagetable, obj);
+}
+
+static inline struct iommufd_hw_pagetable *
+get_md_pagetable(struct iommufd_ucmd *ucmd, u32 mockpt_id,
+ struct mock_iommu_domain **mock)
+{
+ struct iommufd_hw_pagetable *hwpt;
+
+ hwpt = __get_md_pagetable(ucmd, mockpt_id, IOMMUFD_OBJ_HWPT_PAGING);
+ if (IS_ERR(hwpt))
+ return hwpt;
+ if (hwpt->domain->type != IOMMU_DOMAIN_UNMANAGED ||
+ hwpt->domain->ops != mock_ops.default_domain_ops) {
iommufd_put_object(&hwpt->obj);
return ERR_PTR(-EINVAL);
}
@@ -341,6 +524,25 @@ get_md_pagetable(struct iommufd_ucmd *ucmd, u32 mockpt_id,
return hwpt;
}
+/*
+ * Look up @mockpt_id as an IOMMUFD_OBJ_HWPT_NESTED object and verify that it
+ * is backed by a mock nested domain. On success the object is returned with
+ * its reference held and *@mock_nested points at the mock domain. If the
+ * domain is not a mock nested domain the reference is dropped and
+ * ERR_PTR(-EINVAL) is returned; lookup errors are passed through.
+ */
+static inline struct iommufd_hw_pagetable *
+get_md_pagetable_nested(struct iommufd_ucmd *ucmd, u32 mockpt_id,
+			struct mock_iommu_domain_nested **mock_nested)
+{
+	struct iommufd_hw_pagetable *hwpt =
+		__get_md_pagetable(ucmd, mockpt_id, IOMMUFD_OBJ_HWPT_NESTED);
+	struct iommu_domain *domain;
+
+	if (IS_ERR(hwpt))
+		return hwpt;
+
+	domain = hwpt->domain;
+	if (domain->type == IOMMU_DOMAIN_NESTED &&
+	    domain->ops == &domain_nested_ops) {
+		*mock_nested = container_of(domain,
+					    struct mock_iommu_domain_nested,
+					    domain);
+		return hwpt;
+	}
+
+	iommufd_put_object(&hwpt->obj);
+	return ERR_PTR(-EINVAL);
+}
+
struct mock_bus_type {
struct bus_type bus;
struct notifier_block nb;
@@ -362,16 +564,20 @@ static void mock_dev_release(struct device *dev)
kfree(mdev);
}
-static struct mock_dev *mock_dev_create(void)
+static struct mock_dev *mock_dev_create(unsigned long dev_flags)
{
struct mock_dev *mdev;
int rc;
+ if (dev_flags & ~(MOCK_FLAGS_DEVICE_NO_DIRTY))
+ return ERR_PTR(-EINVAL);
+
mdev = kzalloc(sizeof(*mdev), GFP_KERNEL);
if (!mdev)
return ERR_PTR(-ENOMEM);
device_initialize(&mdev->dev);
+ mdev->flags = dev_flags;
mdev->dev.release = mock_dev_release;
mdev->dev.bus = &iommufd_mock_bus_type.bus;
@@ -407,6 +613,7 @@ static int iommufd_test_mock_domain(struct iommufd_ucmd *ucmd,
struct iommufd_device *idev;
struct selftest_obj *sobj;
u32 pt_id = cmd->id;
+ u32 dev_flags = 0;
u32 idev_id;
int rc;
@@ -417,7 +624,10 @@ static int iommufd_test_mock_domain(struct iommufd_ucmd *ucmd,
sobj->idev.ictx = ucmd->ictx;
sobj->type = TYPE_IDEV;
- sobj->idev.mock_dev = mock_dev_create();
+ if (cmd->op == IOMMU_TEST_OP_MOCK_DOMAIN_FLAGS)
+ dev_flags = cmd->mock_domain_flags.dev_flags;
+
+ sobj->idev.mock_dev = mock_dev_create(dev_flags);
if (IS_ERR(sobj->idev.mock_dev)) {
rc = PTR_ERR(sobj->idev.mock_dev);
goto out_sobj;
@@ -977,6 +1187,73 @@ static_assert((unsigned int)MOCK_ACCESS_RW_WRITE == IOMMUFD_ACCESS_RW_WRITE);
static_assert((unsigned int)MOCK_ACCESS_RW_SLOW_PATH ==
__IOMMUFD_ACCESS_RW_SLOW_PATH);
+/*
+ * IOMMU_TEST_OP_DIRTY handler: mark pages of a mock paging domain dirty.
+ *
+ * @uptr is a user bitmap with one bit per @page_size page of
+ * [@iova, @iova + @length). For every set bit whose page has an entry in the
+ * mock xarray, MOCK_PFN_DIRTY_IOVA is set on that entry. The number of
+ * entries marked is returned to userspace in cmd->dirty.out_nr_dirty.
+ * @flags is currently not consulted.
+ *
+ * Returns 0 on success, -EINVAL for bad arguments or when dirty tracking is
+ * not enabled on the domain, -ENOMEM / -EFAULT on allocation or copy
+ * failure, or the error from responding to the command.
+ */
+static int iommufd_test_dirty(struct iommufd_ucmd *ucmd, unsigned int mockpt_id,
+			      unsigned long iova, size_t length,
+			      unsigned long page_size, void __user *uptr,
+			      u32 flags)
+{
+	unsigned long bitmap_size, i, max;
+	struct iommu_test_cmd *cmd = ucmd->cmd;
+	struct iommufd_hw_pagetable *hwpt;
+	struct mock_iommu_domain *mock;
+	int rc, count = 0;
+	void *tmp;
+
+	if (!page_size || !length || iova % page_size || length % page_size ||
+	    !uptr)
+		return -EINVAL;
+
+	hwpt = get_md_pagetable(ucmd, mockpt_id, &mock);
+	if (IS_ERR(hwpt))
+		return PTR_ERR(hwpt);
+
+	if (!(mock->flags & MOCK_DIRTY_TRACK)) {
+		rc = -EINVAL;
+		goto out_put;
+	}
+
+	max = length / page_size;
+	/*
+	 * One bit per page. Round up: a plain division yields 0 bytes for
+	 * fewer than 8 pages and drops the trailing bits otherwise.
+	 */
+	bitmap_size = DIV_ROUND_UP(max, BITS_PER_BYTE);
+
+	/*
+	 * test_bit() loads whole longs, so pad the zeroed buffer up to a
+	 * multiple of sizeof(unsigned long); only bitmap_size bytes are
+	 * copied in from userspace.
+	 */
+	tmp = kvzalloc(ALIGN(bitmap_size, sizeof(unsigned long)),
+		       GFP_KERNEL_ACCOUNT);
+	if (!tmp) {
+		rc = -ENOMEM;
+		goto out_put;
+	}
+
+	if (copy_from_user(tmp, uptr, bitmap_size)) {
+		rc = -EFAULT;
+		goto out_free;
+	}
+
+	for (i = 0; i < max; i++) {
+		unsigned long cur = iova + i * page_size;
+		void *ent, *old;
+
+		if (!test_bit(i, (unsigned long *)tmp))
+			continue;
+
+		/*
+		 * NOTE(review): other handlers index mock->pfns by
+		 * iova / MOCK_IO_PAGE_SIZE; this lookup presumably relies on
+		 * page_size == MOCK_IO_PAGE_SIZE - confirm with callers.
+		 */
+		ent = xa_load(&mock->pfns, cur / page_size);
+		if (ent) {
+			unsigned long val;
+
+			val = xa_to_value(ent) | MOCK_PFN_DIRTY_IOVA;
+			old = xa_store(&mock->pfns, cur / page_size,
+				       xa_mk_value(val), GFP_KERNEL);
+			WARN_ON_ONCE(ent != old);
+			count++;
+		}
+	}
+
+	cmd->dirty.out_nr_dirty = count;
+	rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+out_free:
+	kvfree(tmp);
+out_put:
+	iommufd_put_object(&hwpt->obj);
+	return rc;
+}
+
void iommufd_selftest_destroy(struct iommufd_object *obj)
{
struct selftest_obj *sobj = container_of(obj, struct selftest_obj, obj);
@@ -1000,6 +1277,7 @@ int iommufd_test(struct iommufd_ucmd *ucmd)
cmd->add_reserved.start,
cmd->add_reserved.length);
case IOMMU_TEST_OP_MOCK_DOMAIN:
+ case IOMMU_TEST_OP_MOCK_DOMAIN_FLAGS:
return iommufd_test_mock_domain(ucmd, cmd);
case IOMMU_TEST_OP_MOCK_DOMAIN_REPLACE:
return iommufd_test_mock_domain_replace(
@@ -1041,6 +1319,12 @@ int iommufd_test(struct iommufd_ucmd *ucmd)
return -EINVAL;
iommufd_test_memory_limit = cmd->memory_limit.limit;
return 0;
+ case IOMMU_TEST_OP_DIRTY:
+ return iommufd_test_dirty(ucmd, cmd->id, cmd->dirty.iova,
+ cmd->dirty.length,
+ cmd->dirty.page_size,
+ u64_to_user_ptr(cmd->dirty.uptr),
+ cmd->dirty.flags);
default:
return -EOPNOTSUPP;
}
diff --git a/drivers/iommu/iommufd/vfio_compat.c b/drivers/iommu/iommufd/vfio_compat.c
index 6c810bf80f99..538fbf76354d 100644
--- a/drivers/iommu/iommufd/vfio_compat.c
+++ b/drivers/iommu/iommufd/vfio_compat.c
@@ -255,7 +255,7 @@ err_put:
static int iommufd_vfio_cc_iommu(struct iommufd_ctx *ictx)
{
- struct iommufd_hw_pagetable *hwpt;
+ struct iommufd_hwpt_paging *hwpt_paging;
struct iommufd_ioas *ioas;
int rc = 1;
@@ -264,8 +264,8 @@ static int iommufd_vfio_cc_iommu(struct iommufd_ctx *ictx)
return PTR_ERR(ioas);
mutex_lock(&ioas->mutex);
- list_for_each_entry(hwpt, &ioas->hwpt_list, hwpt_item) {
- if (!hwpt->enforce_cache_coherency) {
+ list_for_each_entry(hwpt_paging, &ioas->hwpt_list, hwpt_item) {
+ if (!hwpt_paging->enforce_cache_coherency) {
rc = 0;
break;
}