diff options
Diffstat (limited to 'drivers')
60 files changed, 9232 insertions, 1417 deletions
diff --git a/drivers/dma/idxd/cdev.c b/drivers/dma/idxd/cdev.c index a9b96b18772f..e13e92609943 100644 --- a/drivers/dma/idxd/cdev.c +++ b/drivers/dma/idxd/cdev.c @@ -6,7 +6,6 @@ #include <linux/pci.h> #include <linux/device.h> #include <linux/sched/task.h> -#include <linux/intel-svm.h> #include <linux/io-64-nonatomic-lo-hi.h> #include <linux/cdev.h> #include <linux/fs.h> @@ -100,7 +99,7 @@ static int idxd_cdev_open(struct inode *inode, struct file *filp) filp->private_data = ctx; if (device_user_pasid_enabled(idxd)) { - sva = iommu_sva_bind_device(dev, current->mm, NULL); + sva = iommu_sva_bind_device(dev, current->mm); if (IS_ERR(sva)) { rc = PTR_ERR(sva); dev_err(dev, "pasid allocation failed: %d\n", rc); diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 09cbf0c179ba..529ea09c9094 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -14,7 +14,6 @@ #include <linux/io-64-nonatomic-lo-hi.h> #include <linux/device.h> #include <linux/idr.h> -#include <linux/intel-svm.h> #include <linux/iommu.h> #include <uapi/linux/idxd.h> #include <linux/dmaengine.h> @@ -502,29 +501,7 @@ static struct idxd_device *idxd_alloc(struct pci_dev *pdev, struct idxd_driver_d static int idxd_enable_system_pasid(struct idxd_device *idxd) { - int flags; - unsigned int pasid; - struct iommu_sva *sva; - - flags = SVM_FLAG_SUPERVISOR_MODE; - - sva = iommu_sva_bind_device(&idxd->pdev->dev, NULL, &flags); - if (IS_ERR(sva)) { - dev_warn(&idxd->pdev->dev, - "iommu sva bind failed: %ld\n", PTR_ERR(sva)); - return PTR_ERR(sva); - } - - pasid = iommu_sva_get_pasid(sva); - if (pasid == IOMMU_PASID_INVALID) { - iommu_sva_unbind_device(sva); - return -ENODEV; - } - - idxd->sva = sva; - idxd->pasid = pasid; - dev_dbg(&idxd->pdev->dev, "system pasid: %u\n", pasid); - return 0; + return -EOPNOTSUPP; } static void idxd_disable_system_pasid(struct idxd_device *idxd) diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c index 077892a9aa8f..539447eda665 100644 --- a/drivers/gpu/drm/i915/gvt/kvmgt.c +++ b/drivers/gpu/drm/i915/gvt/kvmgt.c @@ -669,9 +669,6 @@ static int intel_vgpu_open_device(struct vfio_device *vfio_dev) vgpu->attached = true; - kvmgt_protect_table_init(vgpu); - gvt_cache_init(vgpu); - vgpu->track_node.track_write = kvmgt_page_track_write; vgpu->track_node.track_flush_slot = kvmgt_page_track_flush_slot; kvm_get_kvm(vgpu->vfio_device.kvm); @@ -715,6 +712,11 @@ static void intel_vgpu_close_device(struct vfio_device *vfio_dev) kvmgt_protect_table_destroy(vgpu); gvt_cache_destroy(vgpu); + WARN_ON(vgpu->nr_cache_entries); + + vgpu->gfn_cache = RB_ROOT; + vgpu->dma_addr_cache = RB_ROOT; + intel_vgpu_release_msi_eventfd_ctx(vgpu); vgpu->attached = false; @@ -1445,9 +1447,17 @@ static int intel_vgpu_init_dev(struct vfio_device *vfio_dev) struct intel_vgpu *vgpu = vfio_dev_to_vgpu(vfio_dev); struct intel_vgpu_type *type = container_of(mdev->type, struct intel_vgpu_type, type); + int ret; vgpu->gvt = kdev_to_i915(mdev->type->parent->dev)->gvt; - return intel_gvt_create_vgpu(vgpu, type->conf); + ret = intel_gvt_create_vgpu(vgpu, type->conf); + if (ret) + return ret; + + kvmgt_protect_table_init(vgpu); + gvt_cache_init(vgpu); + + return 0; } static void intel_vgpu_release_dev(struct vfio_device *vfio_dev) @@ -1468,6 +1478,9 @@ static const struct vfio_device_ops intel_vgpu_dev_ops = { .mmap = intel_vgpu_mmap, .ioctl = intel_vgpu_ioctl, .dma_unmap = intel_vgpu_dma_unmap, + .bind_iommufd = vfio_iommufd_emulated_bind, + .unbind_iommufd = vfio_iommufd_emulated_unbind, + .attach_ioas = vfio_iommufd_emulated_attach_ioas, }; static int intel_vgpu_probe(struct mdev_device *mdev) diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index 453da65262af..79707685d54a 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -188,6 +188,7 @@ config MSM_IOMMU source "drivers/iommu/amd/Kconfig" source "drivers/iommu/intel/Kconfig" +source "drivers/iommu/iommufd/Kconfig" config IRQ_REMAP bool "Support for Interrupt Remapping" diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile index cc9f381013c3..f461d0651385 100644 --- a/drivers/iommu/Makefile +++ b/drivers/iommu/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -obj-y += amd/ intel/ arm/ +obj-y += amd/ intel/ arm/ iommufd/ obj-$(CONFIG_IOMMU_API) += iommu.o obj-$(CONFIG_IOMMU_API) += iommu-traces.o obj-$(CONFIG_IOMMU_API) += iommu-sysfs.o @@ -28,6 +28,6 @@ obj-$(CONFIG_FSL_PAMU) += fsl_pamu.o fsl_pamu_domain.o obj-$(CONFIG_S390_IOMMU) += s390-iommu.o obj-$(CONFIG_HYPERV_IOMMU) += hyperv-iommu.o obj-$(CONFIG_VIRTIO_IOMMU) += virtio-iommu.o -obj-$(CONFIG_IOMMU_SVA) += iommu-sva-lib.o io-pgfault.o +obj-$(CONFIG_IOMMU_SVA) += iommu-sva.o io-pgfault.o obj-$(CONFIG_SPRD_IOMMU) += sprd-iommu.o obj-$(CONFIG_APPLE_DART) += apple-dart.o diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 4d28967f910d..8d37d9087fab 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -2155,21 +2155,13 @@ static void amd_iommu_detach_device(struct iommu_domain *dom, static int amd_iommu_attach_device(struct iommu_domain *dom, struct device *dev) { + struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); struct protection_domain *domain = to_pdomain(dom); - struct iommu_dev_data *dev_data; - struct amd_iommu *iommu; + struct amd_iommu *iommu = rlookup_amd_iommu(dev); int ret; - if (!check_device(dev)) - return -EINVAL; - - dev_data = dev_iommu_priv_get(dev); dev_data->defer_attach = false; - iommu = rlookup_amd_iommu(dev); - if (!iommu) - return -EINVAL; - if (dev_data->domain) detach_device(dev); @@ -2286,6 +2278,8 @@ static bool amd_iommu_capable(struct device *dev, enum iommu_cap cap) return false; case IOMMU_CAP_PRE_BOOT_PROTECTION: return amdr_ivrs_remap_support; + case IOMMU_CAP_ENFORCE_CACHE_COHERENCY: + return true; default: break; } diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c index 5968a568aae2..a5a63b1c947e 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c @@ -10,7 +10,7 @@ #include <linux/slab.h> #include "arm-smmu-v3.h" -#include "../../iommu-sva-lib.h" +#include "../../iommu-sva.h" #include "../../io-pgtable-arm.h" struct arm_smmu_mmu_notifier { @@ -344,11 +344,6 @@ __arm_smmu_sva_bind(struct device *dev, struct mm_struct *mm) if (!bond) return ERR_PTR(-ENOMEM); - /* Allocate a PASID for this mm if necessary */ - ret = iommu_sva_alloc_pasid(mm, 1, (1U << master->ssid_bits) - 1); - if (ret) - goto err_free_bond; - bond->mm = mm; bond->sva.dev = dev; refcount_set(&bond->refs, 1); @@ -367,42 +362,6 @@ err_free_bond: return ERR_PTR(ret); } -struct iommu_sva * -arm_smmu_sva_bind(struct device *dev, struct mm_struct *mm, void *drvdata) -{ - struct iommu_sva *handle; - struct iommu_domain *domain = iommu_get_domain_for_dev(dev); - struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); - - if (smmu_domain->stage != ARM_SMMU_DOMAIN_S1) - return ERR_PTR(-EINVAL); - - mutex_lock(&sva_lock); - handle = __arm_smmu_sva_bind(dev, mm); - mutex_unlock(&sva_lock); - return handle; -} - -void arm_smmu_sva_unbind(struct iommu_sva *handle) -{ - struct arm_smmu_bond *bond = sva_to_bond(handle); - - mutex_lock(&sva_lock); - if (refcount_dec_and_test(&bond->refs)) { - list_del(&bond->list); - arm_smmu_mmu_notifier_put(bond->smmu_mn); - kfree(bond); - } - mutex_unlock(&sva_lock); -} - -u32 arm_smmu_sva_get_pasid(struct iommu_sva *handle) -{ - struct arm_smmu_bond *bond = sva_to_bond(handle); - - return bond->mm->pasid; -} - bool arm_smmu_sva_supported(struct arm_smmu_device *smmu) { unsigned long reg, fld; @@ -550,3 +509,64 @@ void arm_smmu_sva_notifier_synchronize(void) */ mmu_notifier_synchronize(); } + +void arm_smmu_sva_remove_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t id) +{ + struct mm_struct *mm = domain->mm; + struct arm_smmu_bond *bond = NULL, *t; + struct arm_smmu_master *master = dev_iommu_priv_get(dev); + + mutex_lock(&sva_lock); + list_for_each_entry(t, &master->bonds, list) { + if (t->mm == mm) { + bond = t; + break; + } + } + + if (!WARN_ON(!bond) && refcount_dec_and_test(&bond->refs)) { + list_del(&bond->list); + arm_smmu_mmu_notifier_put(bond->smmu_mn); + kfree(bond); + } + mutex_unlock(&sva_lock); +} + +static int arm_smmu_sva_set_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t id) +{ + int ret = 0; + struct iommu_sva *handle; + struct mm_struct *mm = domain->mm; + + mutex_lock(&sva_lock); + handle = __arm_smmu_sva_bind(dev, mm); + if (IS_ERR(handle)) + ret = PTR_ERR(handle); + mutex_unlock(&sva_lock); + + return ret; +} + +static void arm_smmu_sva_domain_free(struct iommu_domain *domain) +{ + kfree(domain); +} + +static const struct iommu_domain_ops arm_smmu_sva_domain_ops = { + .set_dev_pasid = arm_smmu_sva_set_dev_pasid, + .free = arm_smmu_sva_domain_free +}; + +struct iommu_domain *arm_smmu_sva_domain_alloc(void) +{ + struct iommu_domain *domain; + + domain = kzalloc(sizeof(*domain), GFP_KERNEL); + if (!domain) + return NULL; + domain->ops = &arm_smmu_sva_domain_ops; + + return domain; +} diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 6d5df91c5c46..ab160198edd6 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -29,7 +29,7 @@ #include "arm-smmu-v3.h" #include "../../dma-iommu.h" -#include "../../iommu-sva-lib.h" +#include "../../iommu-sva.h" static bool disable_bypass = true; module_param(disable_bypass, bool, 0444); @@ -2009,6 +2009,9 @@ static struct iommu_domain *arm_smmu_domain_alloc(unsigned type) { struct arm_smmu_domain *smmu_domain; + if (type == IOMMU_DOMAIN_SVA) + return arm_smmu_sva_domain_alloc(); + if (type != IOMMU_DOMAIN_UNMANAGED && type != IOMMU_DOMAIN_DMA && type != IOMMU_DOMAIN_DMA_FQ && @@ -2430,23 +2433,14 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) goto out_unlock; } } else if (smmu_domain->smmu != smmu) { - dev_err(dev, - "cannot attach to SMMU %s (upstream of %s)\n", - dev_name(smmu_domain->smmu->dev), - dev_name(smmu->dev)); - ret = -ENXIO; + ret = -EINVAL; goto out_unlock; } else if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1 && master->ssid_bits != smmu_domain->s1_cfg.s1cdmax) { - dev_err(dev, - "cannot attach to incompatible domain (%u SSID bits != %u)\n", - smmu_domain->s1_cfg.s1cdmax, master->ssid_bits); ret = -EINVAL; goto out_unlock; } else if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1 && smmu_domain->stall_enabled != master->stall_enabled) { - dev_err(dev, "cannot attach to stall-%s domain\n", - smmu_domain->stall_enabled ? "enabled" : "disabled"); ret = -EINVAL; goto out_unlock; } @@ -2838,6 +2832,17 @@ static int arm_smmu_def_domain_type(struct device *dev) return 0; } +static void arm_smmu_remove_dev_pasid(struct device *dev, ioasid_t pasid) +{ + struct iommu_domain *domain; + + domain = iommu_get_domain_for_dev_pasid(dev, pasid, IOMMU_DOMAIN_SVA); + if (WARN_ON(IS_ERR(domain)) || !domain) + return; + + arm_smmu_sva_remove_dev_pasid(domain, dev, pasid); +} + static struct iommu_ops arm_smmu_ops = { .capable = arm_smmu_capable, .domain_alloc = arm_smmu_domain_alloc, @@ -2846,11 +2851,9 @@ static struct iommu_ops arm_smmu_ops = { .device_group = arm_smmu_device_group, .of_xlate = arm_smmu_of_xlate, .get_resv_regions = arm_smmu_get_resv_regions, + .remove_dev_pasid = arm_smmu_remove_dev_pasid, .dev_enable_feat = arm_smmu_dev_enable_feature, .dev_disable_feat = arm_smmu_dev_disable_feature, - .sva_bind = arm_smmu_sva_bind, - .sva_unbind = arm_smmu_sva_unbind, - .sva_get_pasid = arm_smmu_sva_get_pasid, .page_response = arm_smmu_page_response, .def_domain_type = arm_smmu_def_domain_type, .pgsize_bitmap = -1UL, /* Restricted during device attach */ @@ -3543,6 +3546,7 @@ static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu) /* SID/SSID sizes */ smmu->ssid_bits = FIELD_GET(IDR1_SSIDSIZE, reg); smmu->sid_bits = FIELD_GET(IDR1_SIDSIZE, reg); + smmu->iommu.max_pasids = 1UL << smmu->ssid_bits; /* * If the SMMU supports fewer bits than would fill a single L2 stream diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index cd48590ada30..8d772ea8a583 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -754,11 +754,10 @@ bool arm_smmu_master_sva_enabled(struct arm_smmu_master *master); int arm_smmu_master_enable_sva(struct arm_smmu_master *master); int arm_smmu_master_disable_sva(struct arm_smmu_master *master); bool arm_smmu_master_iopf_supported(struct arm_smmu_master *master); -struct iommu_sva *arm_smmu_sva_bind(struct device *dev, struct mm_struct *mm, - void *drvdata); -void arm_smmu_sva_unbind(struct iommu_sva *handle); -u32 arm_smmu_sva_get_pasid(struct iommu_sva *handle); void arm_smmu_sva_notifier_synchronize(void); +struct iommu_domain *arm_smmu_sva_domain_alloc(void); +void arm_smmu_sva_remove_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t id); #else /* CONFIG_ARM_SMMU_V3_SVA */ static inline bool arm_smmu_sva_supported(struct arm_smmu_device *smmu) { @@ -790,19 +789,17 @@ static inline bool arm_smmu_master_iopf_supported(struct arm_smmu_master *master return false; } -static inline struct iommu_sva * -arm_smmu_sva_bind(struct device *dev, struct mm_struct *mm, void *drvdata) +static inline void arm_smmu_sva_notifier_synchronize(void) {} + +static inline struct iommu_domain *arm_smmu_sva_domain_alloc(void) { - return ERR_PTR(-ENODEV); + return NULL; } -static inline void arm_smmu_sva_unbind(struct iommu_sva *handle) {} - -static inline u32 arm_smmu_sva_get_pasid(struct iommu_sva *handle) +static inline void arm_smmu_sva_remove_dev_pasid(struct iommu_domain *domain, + struct device *dev, + ioasid_t id) { - return IOMMU_PASID_INVALID; } - -static inline void arm_smmu_sva_notifier_synchronize(void) {} #endif /* CONFIG_ARM_SMMU_V3_SVA */ #endif /* _ARM_SMMU_V3_H */ diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index 30dab1418e3f..719fbca1fe52 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -1150,9 +1150,6 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) * different SMMUs. */ if (smmu_domain->smmu != smmu) { - dev_err(dev, - "cannot attach to SMMU %s whilst already attached to domain on SMMU %s\n", - dev_name(smmu_domain->smmu->dev), dev_name(smmu->dev)); ret = -EINVAL; goto rpm_put; } diff --git a/drivers/iommu/arm/arm-smmu/qcom_iommu.c b/drivers/iommu/arm/arm-smmu/qcom_iommu.c index 3869c3ecda8c..bfd7b51eb5db 100644 --- a/drivers/iommu/arm/arm-smmu/qcom_iommu.c +++ b/drivers/iommu/arm/arm-smmu/qcom_iommu.c @@ -381,13 +381,8 @@ static int qcom_iommu_attach_dev(struct iommu_domain *domain, struct device *dev * Sanity check the domain. We don't support domains across * different IOMMUs. */ - if (qcom_domain->iommu != qcom_iommu) { - dev_err(dev, "cannot attach to IOMMU %s while already " - "attached to domain on IOMMU %s\n", - dev_name(qcom_domain->iommu->dev), - dev_name(qcom_iommu->dev)); + if (qcom_domain->iommu != qcom_iommu) return -EINVAL; - } return 0; } diff --git a/drivers/iommu/fsl_pamu.c b/drivers/iommu/fsl_pamu.c index 0d03f837a5d4..2eb3211c8167 100644 --- a/drivers/iommu/fsl_pamu.c +++ b/drivers/iommu/fsl_pamu.c @@ -211,7 +211,7 @@ int pamu_config_ppaace(int liodn, u32 omi, u32 stashid, int prot) ppaace->op_encode.index_ot.omi = omi; } else if (~omi != 0) { pr_debug("bad operation mapping index: %d\n", omi); - return -EINVAL; + return -ENODEV; } /* configure stash id */ diff --git a/drivers/iommu/fsl_pamu_domain.c b/drivers/iommu/fsl_pamu_domain.c index fa20f4b03e12..4408ac3c49b6 100644 --- a/drivers/iommu/fsl_pamu_domain.c +++ b/drivers/iommu/fsl_pamu_domain.c @@ -258,7 +258,7 @@ static int fsl_pamu_attach_device(struct iommu_domain *domain, liodn = of_get_property(dev->of_node, "fsl,liodn", &len); if (!liodn) { pr_debug("missing fsl,liodn property at %pOF\n", dev->of_node); - return -EINVAL; + return -ENODEV; } spin_lock_irqsave(&dma_domain->domain_lock, flags); @@ -267,7 +267,7 @@ static int fsl_pamu_attach_device(struct iommu_domain *domain, if (liodn[i] >= PAACE_NUMBER_ENTRIES) { pr_debug("Invalid liodn %d, attach device failed for %pOF\n", liodn[i], dev->of_node); - ret = -EINVAL; + ret = -ENODEV; break; } diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c index bc94059a5b87..b00a0ceb2d13 100644 --- a/drivers/iommu/intel/dmar.c +++ b/drivers/iommu/intel/dmar.c @@ -1106,6 +1106,13 @@ static int alloc_iommu(struct dmar_drhd_unit *drhd) raw_spin_lock_init(&iommu->register_lock); /* + * A value of N in PSS field of eCap register indicates hardware + * supports PASID field of N+1 bits. + */ + if (pasid_supported(iommu)) + iommu->iommu.max_pasids = 2UL << ecap_pss(iommu->ecap); + + /* * This is only for hotplug; at boot time intel_iommu_enabled won't * be set yet. When intel_iommu_init() runs, it registers the units * present at boot time, then sets intel_iommu_enabled. diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 644ca49e8cf8..bef8e8f7ca25 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -27,7 +27,7 @@ #include "iommu.h" #include "../dma-iommu.h" #include "../irq_remapping.h" -#include "../iommu-sva-lib.h" +#include "../iommu-sva.h" #include "pasid.h" #include "cap_audit.h" @@ -4188,6 +4188,8 @@ static struct iommu_domain *intel_iommu_domain_alloc(unsigned type) return domain; case IOMMU_DOMAIN_IDENTITY: return &si_domain->domain; + case IOMMU_DOMAIN_SVA: + return intel_svm_domain_alloc(); default: return NULL; } @@ -4213,19 +4215,15 @@ static int prepare_domain_attach_device(struct iommu_domain *domain, return -ENODEV; if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap)) - return -EOPNOTSUPP; + return -EINVAL; /* check if this iommu agaw is sufficient for max mapped address */ addr_width = agaw_to_width(iommu->agaw); if (addr_width > cap_mgaw(iommu->cap)) addr_width = cap_mgaw(iommu->cap); - if (dmar_domain->max_addr > (1LL << addr_width)) { - dev_err(dev, "%s: iommu width (%d) is not " - "sufficient for the mapped address (%llx)\n", - __func__, addr_width, dmar_domain->max_addr); - return -EFAULT; - } + if (dmar_domain->max_addr > (1LL << addr_width)) + return -EINVAL; dmar_domain->gaw = addr_width; /* @@ -4471,14 +4469,20 @@ static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain) static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap) { - if (cap == IOMMU_CAP_CACHE_COHERENCY) + struct device_domain_info *info = dev_iommu_priv_get(dev); + + switch (cap) { + case IOMMU_CAP_CACHE_COHERENCY: return true; - if (cap == IOMMU_CAP_INTR_REMAP) + case IOMMU_CAP_INTR_REMAP: return irq_remapping_enabled == 1; - if (cap == IOMMU_CAP_PRE_BOOT_PROTECTION) + case IOMMU_CAP_PRE_BOOT_PROTECTION: return dmar_platform_optin(); - - return false; + case IOMMU_CAP_ENFORCE_CACHE_COHERENCY: + return ecap_sc_support(info->iommu->ecap); + default: + return false; + } } static struct iommu_device *intel_iommu_probe_device(struct device *dev) @@ -4732,6 +4736,28 @@ static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain, __mapping_notify_one(info->iommu, dmar_domain, pfn, pages); } +static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid) +{ + struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL); + struct iommu_domain *domain; + + /* Domain type specific cleanup: */ + domain = iommu_get_domain_for_dev_pasid(dev, pasid, 0); + if (domain) { + switch (domain->type) { + case IOMMU_DOMAIN_SVA: + intel_svm_remove_dev_pasid(dev, pasid); + break; + default: + /* should never reach here */ + WARN_ON(1); + break; + } + } + + intel_pasid_tear_down_entry(iommu, dev, pasid, false); +} + const struct iommu_ops intel_iommu_ops = { .capable = intel_iommu_capable, .domain_alloc = intel_iommu_domain_alloc, @@ -4744,11 +4770,9 @@ const struct iommu_ops intel_iommu_ops = { .dev_disable_feat = intel_iommu_dev_disable_feat, .is_attach_deferred = intel_iommu_is_attach_deferred, .def_domain_type = device_def_domain_type, + .remove_dev_pasid = intel_iommu_remove_dev_pasid, .pgsize_bitmap = SZ_4K, #ifdef CONFIG_INTEL_IOMMU_SVM - .sva_bind = intel_svm_bind, - .sva_unbind = intel_svm_unbind, - .sva_get_pasid = intel_svm_get_pasid, .page_response = intel_svm_page_response, #endif .default_domain_ops = &(const struct iommu_domain_ops) { diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index b3d82d7093e6..f83ad8ddcf4d 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -480,8 +480,6 @@ enum { #define VTD_FLAG_IRQ_REMAP_PRE_ENABLED (1 << 1) #define VTD_FLAG_SVM_CAPABLE (1 << 2) -extern int intel_iommu_sm; - #define sm_supported(iommu) (intel_iommu_sm && ecap_smts((iommu)->ecap)) #define pasid_supported(iommu) (sm_supported(iommu) && \ ecap_pasid((iommu)->ecap)) @@ -753,12 +751,10 @@ struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn); extern void intel_svm_check(struct intel_iommu *iommu); extern int intel_svm_enable_prq(struct intel_iommu *iommu); extern int intel_svm_finish_prq(struct intel_iommu *iommu); -struct iommu_sva *intel_svm_bind(struct device *dev, struct mm_struct *mm, - void *drvdata); -void intel_svm_unbind(struct iommu_sva *handle); -u32 intel_svm_get_pasid(struct iommu_sva *handle); int intel_svm_page_response(struct device *dev, struct iommu_fault_event *evt, struct iommu_page_response *msg); +struct iommu_domain *intel_svm_domain_alloc(void); +void intel_svm_remove_dev_pasid(struct device *dev, ioasid_t pasid); struct intel_svm_dev { struct list_head list; @@ -783,6 +779,14 @@ struct intel_svm { }; #else static inline void intel_svm_check(struct intel_iommu *iommu) {} +static inline struct iommu_domain *intel_svm_domain_alloc(void) +{ + return NULL; +} + +static inline void intel_svm_remove_dev_pasid(struct device *dev, ioasid_t pasid) +{ +} #endif #ifdef CONFIG_INTEL_IOMMU_DEBUGFS @@ -798,6 +802,7 @@ struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus, extern const struct iommu_ops intel_iommu_ops; #ifdef CONFIG_INTEL_IOMMU +extern int intel_iommu_sm; extern int iommu_calculate_agaw(struct intel_iommu *iommu); extern int iommu_calculate_max_sagaw(struct intel_iommu *iommu); extern int dmar_disabled; @@ -813,6 +818,7 @@ static inline int iommu_calculate_max_sagaw(struct intel_iommu *iommu) } #define dmar_disabled (1) #define intel_iommu_enabled (0) +#define intel_iommu_sm (0) #endif static inline const char *decode_prq_descriptor(char *str, size_t size, diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index e13d7e5273e1..fb3c7020028d 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -101,8 +101,10 @@ int intel_pasid_alloc_table(struct device *dev) might_sleep(); info = dev_iommu_priv_get(dev); - if (WARN_ON(!info || !dev_is_pci(dev) || info->pasid_table)) - return -EINVAL; + if (WARN_ON(!info || !dev_is_pci(dev))) + return -ENODEV; + if (WARN_ON(info->pasid_table)) + return -EEXIST; pasid_table = kzalloc(sizeof(*pasid_table), GFP_KERNEL); if (!pasid_table) diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 03b25358946c..c76b66263467 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -24,7 +24,7 @@ #include "iommu.h" #include "pasid.h" #include "perf.h" -#include "../iommu-sva-lib.h" +#include "../iommu-sva.h" #include "trace.h" static irqreturn_t prq_event_thread(int irq, void *d); @@ -299,19 +299,9 @@ out: return 0; } -static int intel_svm_alloc_pasid(struct device *dev, struct mm_struct *mm, - unsigned int flags) -{ - ioasid_t max_pasid = dev_is_pci(dev) ? - pci_max_pasids(to_pci_dev(dev)) : intel_pasid_max_id; - - return iommu_sva_alloc_pasid(mm, PASID_MIN, max_pasid - 1); -} - static struct iommu_sva *intel_svm_bind_mm(struct intel_iommu *iommu, struct device *dev, - struct mm_struct *mm, - unsigned int flags) + struct mm_struct *mm) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_svm_dev *sdev; @@ -327,22 +317,18 @@ static struct iommu_sva *intel_svm_bind_mm(struct intel_iommu *iommu, svm->pasid = mm->pasid; svm->mm = mm; - svm->flags = flags; INIT_LIST_HEAD_RCU(&svm->devs); - if (!(flags & SVM_FLAG_SUPERVISOR_MODE)) { - svm->notifier.ops = &intel_mmuops; - ret = mmu_notifier_register(&svm->notifier, mm); - if (ret) { - kfree(svm); - return ERR_PTR(ret); - } + svm->notifier.ops = &intel_mmuops; + ret = mmu_notifier_register(&svm->notifier, mm); + if (ret) { + kfree(svm); + return ERR_PTR(ret); } ret = pasid_private_add(svm->pasid, svm); if (ret) { - if (svm->notifier.ops) - mmu_notifier_unregister(&svm->notifier, mm); + mmu_notifier_unregister(&svm->notifier, mm); kfree(svm); return ERR_PTR(ret); } @@ -377,9 +363,7 @@ static struct iommu_sva *intel_svm_bind_mm(struct intel_iommu *iommu, } /* Setup the pasid table: */ - sflags = (flags & SVM_FLAG_SUPERVISOR_MODE) ? - PASID_FLAG_SUPERVISOR_MODE : 0; - sflags |= cpu_feature_enabled(X86_FEATURE_LA57) ? PASID_FLAG_FL5LP : 0; + sflags = cpu_feature_enabled(X86_FEATURE_LA57) ? PASID_FLAG_FL5LP : 0; ret = intel_pasid_setup_first_level(iommu, dev, mm->pgd, mm->pasid, FLPT_DEFAULT_DID, sflags); if (ret) @@ -393,8 +377,7 @@ free_sdev: kfree(sdev); free_svm: if (list_empty(&svm->devs)) { - if (svm->notifier.ops) - mmu_notifier_unregister(&svm->notifier, mm); + mmu_notifier_unregister(&svm->notifier, mm); pasid_private_remove(mm->pasid); kfree(svm); } @@ -787,67 +770,6 @@ prq_advance: return IRQ_RETVAL(handled); } -struct iommu_sva *intel_svm_bind(struct device *dev, struct mm_struct *mm, void *drvdata) -{ - struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL); - unsigned int flags = 0; - struct iommu_sva *sva; - int ret; - - if (drvdata) - flags = *(unsigned int *)drvdata; - - if (flags & SVM_FLAG_SUPERVISOR_MODE) { - if (!ecap_srs(iommu->ecap)) { - dev_err(dev, "%s: Supervisor PASID not supported\n", - iommu->name); - return ERR_PTR(-EOPNOTSUPP); - } - - if (mm) { - dev_err(dev, "%s: Supervisor PASID with user provided mm\n", - iommu->name); - return ERR_PTR(-EINVAL); - } - - mm = &init_mm; - } - - mutex_lock(&pasid_mutex); - ret = intel_svm_alloc_pasid(dev, mm, flags); - if (ret) { - mutex_unlock(&pasid_mutex); - return ERR_PTR(ret); - } - - sva = intel_svm_bind_mm(iommu, dev, mm, flags); - mutex_unlock(&pasid_mutex); - - return sva; -} - -void intel_svm_unbind(struct iommu_sva *sva) -{ - struct intel_svm_dev *sdev = to_intel_svm_dev(sva); - - mutex_lock(&pasid_mutex); - intel_svm_unbind_mm(sdev->dev, sdev->pasid); - mutex_unlock(&pasid_mutex); -} - -u32 intel_svm_get_pasid(struct iommu_sva *sva) -{ - struct intel_svm_dev *sdev; - u32 pasid; - - mutex_lock(&pasid_mutex); - sdev = to_intel_svm_dev(sva); - pasid = sdev->pasid; - mutex_unlock(&pasid_mutex); - - return pasid; -} - int intel_svm_page_response(struct device *dev, struct iommu_fault_event *evt, struct iommu_page_response *msg) @@ -918,3 +840,50 @@ int intel_svm_page_response(struct device *dev, out: return ret; } + +void intel_svm_remove_dev_pasid(struct device *dev, ioasid_t pasid) +{ + mutex_lock(&pasid_mutex); + intel_svm_unbind_mm(dev, pasid); + mutex_unlock(&pasid_mutex); +} + +static int intel_svm_set_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct intel_iommu *iommu = info->iommu; + struct mm_struct *mm = domain->mm; + struct iommu_sva *sva; + int ret = 0; + + mutex_lock(&pasid_mutex); + sva = intel_svm_bind_mm(iommu, dev, mm); + if (IS_ERR(sva)) + ret = PTR_ERR(sva); + mutex_unlock(&pasid_mutex); + + return ret; +} + +static void intel_svm_domain_free(struct iommu_domain *domain) +{ + kfree(to_dmar_domain(domain)); +} + +static const struct iommu_domain_ops intel_svm_domain_ops = { + .set_dev_pasid = intel_svm_set_dev_pasid, + .free = intel_svm_domain_free +}; + +struct iommu_domain *intel_svm_domain_alloc(void) +{ + struct dmar_domain *domain; + + domain = kzalloc(sizeof(*domain), GFP_KERNEL); + if (!domain) + return NULL; + domain->domain.ops = &intel_svm_domain_ops; + + return &domain->domain; +} diff --git a/drivers/iommu/io-pgfault.c b/drivers/iommu/io-pgfault.c index 1df8c1dcae77..e5b8b9110c13 100644 --- a/drivers/iommu/io-pgfault.c +++ b/drivers/iommu/io-pgfault.c @@ -11,7 +11,7 @@ #include <linux/slab.h> #include <linux/workqueue.h> -#include "iommu-sva-lib.h" +#include "iommu-sva.h" /** * struct iopf_queue - IO Page Fault queue @@ -69,69 +69,18 @@ static int iopf_complete_group(struct device *dev, struct iopf_fault *iopf, return iommu_page_response(dev, &resp); } -static enum iommu_page_response_code -iopf_handle_single(struct iopf_fault *iopf) -{ - vm_fault_t ret; - struct mm_struct *mm; - struct vm_area_struct *vma; - unsigned int access_flags = 0; - unsigned int fault_flags = FAULT_FLAG_REMOTE; - struct iommu_fault_page_request *prm = &iopf->fault.prm; - enum iommu_page_response_code status = IOMMU_PAGE_RESP_INVALID; - - if (!(prm->flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID)) - return status; - - mm = iommu_sva_find(prm->pasid); - if (IS_ERR_OR_NULL(mm)) - return status; - - mmap_read_lock(mm); - - vma = find_extend_vma(mm, prm->addr); - if (!vma) - /* Unmapped area */ - goto out_put_mm; - - if (prm->perm & IOMMU_FAULT_PERM_READ) - access_flags |= VM_READ; - - if (prm->perm & IOMMU_FAULT_PERM_WRITE) { - access_flags |= VM_WRITE; - fault_flags |= FAULT_FLAG_WRITE; - } - - if (prm->perm & IOMMU_FAULT_PERM_EXEC) { - access_flags |= VM_EXEC; - fault_flags |= FAULT_FLAG_INSTRUCTION; - } - - if (!(prm->perm & IOMMU_FAULT_PERM_PRIV)) - fault_flags |= FAULT_FLAG_USER; - - if (access_flags & ~vma->vm_flags) - /* Access fault */ - goto out_put_mm; - - ret = handle_mm_fault(vma, prm->addr, fault_flags, NULL); - status = ret & VM_FAULT_ERROR ? IOMMU_PAGE_RESP_INVALID : - IOMMU_PAGE_RESP_SUCCESS; - -out_put_mm: - mmap_read_unlock(mm); - mmput(mm); - - return status; -} - -static void iopf_handle_group(struct work_struct *work) +static void iopf_handler(struct work_struct *work) { struct iopf_group *group; + struct iommu_domain *domain; struct iopf_fault *iopf, *next; enum iommu_page_response_code status = IOMMU_PAGE_RESP_SUCCESS; group = container_of(work, struct iopf_group, work); + domain = iommu_get_domain_for_dev_pasid(group->dev, + group->last_fault.fault.prm.pasid, 0); + if (!domain || !domain->iopf_handler) + status = IOMMU_PAGE_RESP_INVALID; list_for_each_entry_safe(iopf, next, &group->faults, list) { /* @@ -139,7 +88,8 @@ static void iopf_handle_group(struct work_struct *work) * faults in the group if there is an error. */ if (status == IOMMU_PAGE_RESP_SUCCESS) - status = iopf_handle_single(iopf); + status = domain->iopf_handler(&iopf->fault, + domain->fault_data); if (!(iopf->fault.prm.flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE)) @@ -181,6 +131,13 @@ static void iopf_handle_group(struct work_struct *work) * request completes, outstanding faults will have been dealt with by the time * the PASID is freed. * + * Any valid page fault will be eventually routed to an iommu domain and the + * page fault handler installed there will get called. The users of this + * handling framework should guarantee that the iommu domain could only be + * freed after the device has stopped generating page faults (or the iommu + * hardware has been set to block the page faults) and the pending page faults + * have been flushed. + * * Return: 0 on success and <0 on error. */ int iommu_queue_iopf(struct iommu_fault *fault, void *cookie) @@ -235,7 +192,7 @@ int iommu_queue_iopf(struct iommu_fault *fault, void *cookie) group->last_fault.fault = *fault; INIT_LIST_HEAD(&group->faults); list_add(&group->last_fault.list, &group->faults); - INIT_WORK(&group->work, iopf_handle_group); + INIT_WORK(&group->work, iopf_handler); /* See if we have partial faults for this group */ list_for_each_entry_safe(iopf, next, &iopf_param->partial, list) { diff --git a/drivers/iommu/iommu-sva-lib.c b/drivers/iommu/iommu-sva-lib.c deleted file mode 100644 index 106506143896..000000000000 --- a/drivers/iommu/iommu-sva-lib.c +++ /dev/null @@ -1,71 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Helpers for IOMMU drivers implementing SVA - */ -#include <linux/mutex.h> -#include <linux/sched/mm.h> - -#include "iommu-sva-lib.h" - -static DEFINE_MUTEX(iommu_sva_lock); -static DECLARE_IOASID_SET(iommu_sva_pasid); - -/** - * iommu_sva_alloc_pasid - Allocate a PASID for the mm - * @mm: the mm - * @min: minimum PASID value (inclusive) - * @max: maximum PASID value (inclusive) - * - * Try to allocate a PASID for this mm, or take a reference to the existing one - * provided it fits within the [@min, @max] range. On success the PASID is - * available in mm->pasid and will be available for the lifetime of the mm. - * - * Returns 0 on success and < 0 on error. - */ -int iommu_sva_alloc_pasid(struct mm_struct *mm, ioasid_t min, ioasid_t max) -{ - int ret = 0; - ioasid_t pasid; - - if (min == INVALID_IOASID || max == INVALID_IOASID || - min == 0 || max < min) - return -EINVAL; - - mutex_lock(&iommu_sva_lock); - /* Is a PASID already associated with this mm? */ - if (pasid_valid(mm->pasid)) { - if (mm->pasid < min || mm->pasid >= max) - ret = -EOVERFLOW; - goto out; - } - - pasid = ioasid_alloc(&iommu_sva_pasid, min, max, mm); - if (!pasid_valid(pasid)) - ret = -ENOMEM; - else - mm_pasid_set(mm, pasid); -out: - mutex_unlock(&iommu_sva_lock); - return ret; -} -EXPORT_SYMBOL_GPL(iommu_sva_alloc_pasid); - -/* ioasid_find getter() requires a void * argument */ -static bool __mmget_not_zero(void *mm) -{ - return mmget_not_zero(mm); -} - -/** - * iommu_sva_find() - Find mm associated to the given PASID - * @pasid: Process Address Space ID assigned to the mm - * - * On success a reference to the mm is taken, and must be released with mmput(). - * - * Returns the mm corresponding to this PASID, or an error if not found. - */ -struct mm_struct *iommu_sva_find(ioasid_t pasid) -{ - return ioasid_find(&iommu_sva_pasid, pasid, __mmget_not_zero); -} -EXPORT_SYMBOL_GPL(iommu_sva_find); diff --git a/drivers/iommu/iommu-sva.c b/drivers/iommu/iommu-sva.c new file mode 100644 index 000000000000..24bf9b2b58aa --- /dev/null +++ b/drivers/iommu/iommu-sva.c @@ -0,0 +1,240 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Helpers for IOMMU drivers implementing SVA + */ +#include <linux/mutex.h> +#include <linux/sched/mm.h> +#include <linux/iommu.h> + +#include "iommu-sva.h" + +static DEFINE_MUTEX(iommu_sva_lock); +static DECLARE_IOASID_SET(iommu_sva_pasid); + +/** + * iommu_sva_alloc_pasid - Allocate a PASID for the mm + * @mm: the mm + * @min: minimum PASID value (inclusive) + * @max: maximum PASID value (inclusive) + * + * Try to allocate a PASID for this mm, or take a reference to the existing one + * provided it fits within the [@min, @max] range. On success the PASID is + * available in mm->pasid and will be available for the lifetime of the mm. + * + * Returns 0 on success and < 0 on error. + */ +int iommu_sva_alloc_pasid(struct mm_struct *mm, ioasid_t min, ioasid_t max) +{ + int ret = 0; + ioasid_t pasid; + + if (min == INVALID_IOASID || max == INVALID_IOASID || + min == 0 || max < min) + return -EINVAL; + + mutex_lock(&iommu_sva_lock); + /* Is a PASID already associated with this mm? */ + if (pasid_valid(mm->pasid)) { + if (mm->pasid < min || mm->pasid >= max) + ret = -EOVERFLOW; + goto out; + } + + pasid = ioasid_alloc(&iommu_sva_pasid, min, max, mm); + if (!pasid_valid(pasid)) + ret = -ENOMEM; + else + mm_pasid_set(mm, pasid); +out: + mutex_unlock(&iommu_sva_lock); + return ret; +} +EXPORT_SYMBOL_GPL(iommu_sva_alloc_pasid); + +/* ioasid_find getter() requires a void * argument */ +static bool __mmget_not_zero(void *mm) +{ + return mmget_not_zero(mm); +} + +/** + * iommu_sva_find() - Find mm associated to the given PASID + * @pasid: Process Address Space ID assigned to the mm + * + * On success a reference to the mm is taken, and must be released with mmput(). + * + * Returns the mm corresponding to this PASID, or an error if not found. + */ +struct mm_struct *iommu_sva_find(ioasid_t pasid) +{ + return ioasid_find(&iommu_sva_pasid, pasid, __mmget_not_zero); +} +EXPORT_SYMBOL_GPL(iommu_sva_find); + +/** + * iommu_sva_bind_device() - Bind a process address space to a device + * @dev: the device + * @mm: the mm to bind, caller must hold a reference to mm_users + * + * Create a bond between device and address space, allowing the device to + * access the mm using the PASID returned by iommu_sva_get_pasid(). If a + * bond already exists between @device and @mm, an additional internal + * reference is taken. Caller must call iommu_sva_unbind_device() + * to release each reference. + * + * iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_SVA) must be called first, to + * initialize the required SVA features. + * + * On error, returns an ERR_PTR value. + */ +struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm) +{ + struct iommu_domain *domain; + struct iommu_sva *handle; + ioasid_t max_pasids; + int ret; + + max_pasids = dev->iommu->max_pasids; + if (!max_pasids) + return ERR_PTR(-EOPNOTSUPP); + + /* Allocate mm->pasid if necessary. */ + ret = iommu_sva_alloc_pasid(mm, 1, max_pasids - 1); + if (ret) + return ERR_PTR(ret); + + handle = kzalloc(sizeof(*handle), GFP_KERNEL); + if (!handle) + return ERR_PTR(-ENOMEM); + + mutex_lock(&iommu_sva_lock); + /* Search for an existing domain. */ + domain = iommu_get_domain_for_dev_pasid(dev, mm->pasid, + IOMMU_DOMAIN_SVA); + if (IS_ERR(domain)) { + ret = PTR_ERR(domain); + goto out_unlock; + } + + if (domain) { + domain->users++; + goto out; + } + + /* Allocate a new domain and set it on device pasid. */ + domain = iommu_sva_domain_alloc(dev, mm); + if (!domain) { + ret = -ENOMEM; + goto out_unlock; + } + + ret = iommu_attach_device_pasid(domain, dev, mm->pasid); + if (ret) + goto out_free_domain; + domain->users = 1; +out: + mutex_unlock(&iommu_sva_lock); + handle->dev = dev; + handle->domain = domain; + + return handle; + +out_free_domain: + iommu_domain_free(domain); +out_unlock: + mutex_unlock(&iommu_sva_lock); + kfree(handle); + + return ERR_PTR(ret); +} +EXPORT_SYMBOL_GPL(iommu_sva_bind_device); + +/** + * iommu_sva_unbind_device() - Remove a bond created with iommu_sva_bind_device + * @handle: the handle returned by iommu_sva_bind_device() + * + * Put reference to a bond between device and address space. The device should + * not be issuing any more transaction for this PASID. All outstanding page + * requests for this PASID must have been flushed to the IOMMU. + */ +void iommu_sva_unbind_device(struct iommu_sva *handle) +{ + struct iommu_domain *domain = handle->domain; + ioasid_t pasid = domain->mm->pasid; + struct device *dev = handle->dev; + + mutex_lock(&iommu_sva_lock); + if (--domain->users == 0) { + iommu_detach_device_pasid(domain, dev, pasid); + iommu_domain_free(domain); + } + mutex_unlock(&iommu_sva_lock); + kfree(handle); +} +EXPORT_SYMBOL_GPL(iommu_sva_unbind_device); + +u32 iommu_sva_get_pasid(struct iommu_sva *handle) +{ + struct iommu_domain *domain = handle->domain; + + return domain->mm->pasid; +} +EXPORT_SYMBOL_GPL(iommu_sva_get_pasid); + +/* + * I/O page fault handler for SVA + */ +enum iommu_page_response_code +iommu_sva_handle_iopf(struct iommu_fault *fault, void *data) +{ + vm_fault_t ret; + struct vm_area_struct *vma; + struct mm_struct *mm = data; + unsigned int access_flags = 0; + unsigned int fault_flags = FAULT_FLAG_REMOTE; + struct iommu_fault_page_request *prm = &fault->prm; + enum iommu_page_response_code status = IOMMU_PAGE_RESP_INVALID; + + if (!(prm->flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID)) + return status; + + if (!mmget_not_zero(mm)) + return status; + + mmap_read_lock(mm); + + vma = find_extend_vma(mm, prm->addr); + if (!vma) + /* Unmapped area */ + goto out_put_mm; + + if (prm->perm & IOMMU_FAULT_PERM_READ) + access_flags |= VM_READ; + + if (prm->perm & IOMMU_FAULT_PERM_WRITE) { + access_flags |= VM_WRITE; + fault_flags |= FAULT_FLAG_WRITE; + } + + if (prm->perm & IOMMU_FAULT_PERM_EXEC) { + access_flags |= VM_EXEC; + fault_flags |= FAULT_FLAG_INSTRUCTION; + } + + if (!(prm->perm & IOMMU_FAULT_PERM_PRIV)) + fault_flags |= FAULT_FLAG_USER; + + if (access_flags & ~vma->vm_flags) + /* Access fault */ + goto out_put_mm; + + ret = handle_mm_fault(vma, prm->addr, fault_flags, NULL); + status = ret & VM_FAULT_ERROR ? IOMMU_PAGE_RESP_INVALID : + IOMMU_PAGE_RESP_SUCCESS; + +out_put_mm: + mmap_read_unlock(mm); + mmput(mm); + + return status; +} diff --git a/drivers/iommu/iommu-sva-lib.h b/drivers/iommu/iommu-sva.h index 8909ea1094e3..7215a761b962 100644 --- a/drivers/iommu/iommu-sva-lib.h +++ b/drivers/iommu/iommu-sva.h @@ -2,8 +2,8 @@ /* * SVA library for IOMMU drivers */ -#ifndef _IOMMU_SVA_LIB_H -#define _IOMMU_SVA_LIB_H +#ifndef _IOMMU_SVA_H +#define _IOMMU_SVA_H #include <linux/ioasid.h> #include <linux/mm_types.h> @@ -26,6 +26,8 @@ int iopf_queue_flush_dev(struct device *dev); struct iopf_queue *iopf_queue_alloc(const char *name); void iopf_queue_free(struct iopf_queue *queue); int iopf_queue_discard_partial(struct iopf_queue *queue); +enum iommu_page_response_code +iommu_sva_handle_iopf(struct iommu_fault *fault, void *data); #else /* CONFIG_IOMMU_SVA */ static inline int iommu_queue_iopf(struct iommu_fault *fault, void *cookie) @@ -63,5 +65,11 @@ static inline int iopf_queue_discard_partial(struct iopf_queue *queue) { return -ENODEV; } + +static inline enum iommu_page_response_code +iommu_sva_handle_iopf(struct iommu_fault *fault, void *data) +{ + return IOMMU_PAGE_RESP_INVALID; +} #endif /* CONFIG_IOMMU_SVA */ -#endif /* _IOMMU_SVA_LIB_H */ +#endif /* _IOMMU_SVA_H */ diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 65a3b3d886dc..d69ebba81beb 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -21,6 +21,7 @@ #include <linux/idr.h> #include <linux/err.h> #include <linux/pci.h> +#include <linux/pci-ats.h> #include <linux/bitops.h> #include <linux/platform_device.h> #include <linux/property.h> @@ -28,9 +29,12 @@ #include <linux/module.h> #include <linux/cc_platform.h> #include <trace/events/iommu.h> +#include <linux/sched/mm.h> #include "dma-iommu.h" +#include "iommu-sva.h" + static struct kset *iommu_group_kset; static DEFINE_IDA(iommu_group_ida); @@ -42,6 +46,7 @@ struct iommu_group { struct kobject kobj; struct kobject *devices_kobj; struct list_head devices; + struct xarray pasid_array; struct mutex mutex; void *iommu_data; void (*iommu_data_release)(void *iommu_data); @@ -278,6 +283,24 @@ static void dev_iommu_free(struct device *dev) kfree(param); } +static u32 dev_iommu_get_max_pasids(struct device *dev) +{ + u32 max_pasids = 0, bits = 0; + int ret; + + if (dev_is_pci(dev)) { + ret = pci_max_pasids(to_pci_dev(dev)); + if (ret > 0) + max_pasids = ret; + } else { + ret = device_property_read_u32(dev, "pasid-num-bits", &bits); + if (!ret) + max_pasids = 1UL << bits; + } + + return min_t(u32, max_pasids, dev->iommu->iommu_dev->max_pasids); +} + static int __iommu_probe_device(struct device *dev, struct list_head *group_list) { const struct iommu_ops *ops = dev->bus->iommu_ops; @@ -303,6 +326,7 @@ static int __iommu_probe_device(struct device *dev, struct list_head *group_list } dev->iommu->iommu_dev = iommu_dev; + dev->iommu->max_pasids = dev_iommu_get_max_pasids(dev); group = iommu_group_get_for_dev(dev); if (IS_ERR(group)) { @@ -703,6 +727,7 @@ struct iommu_group *iommu_group_alloc(void) mutex_init(&group->mutex); INIT_LIST_HEAD(&group->devices); INIT_LIST_HEAD(&group->entry); + xa_init(&group->pasid_array); ret = ida_alloc(&iommu_group_ida, GFP_KERNEL); if (ret < 0) { @@ -1912,6 +1937,8 @@ EXPORT_SYMBOL_GPL(iommu_domain_alloc); void iommu_domain_free(struct iommu_domain *domain) { + if (domain->type == IOMMU_DOMAIN_SVA) + mmdrop(domain->mm); iommu_put_dma_cookie(domain); domain->ops->free(domain); } @@ -1949,6 +1976,18 @@ static int __iommu_attach_device(struct iommu_domain *domain, return ret; } +/** + * iommu_attach_device - Attach an IOMMU domain to a device + * @domain: IOMMU domain to attach + * @dev: Device that will be attached + * + * Returns 0 on success and error code on failure + * + * Note that EINVAL can be treated as a soft failure, indicating + * that certain configuration of the domain is incompatible with + * the device. In this case attaching a different domain to the + * device may succeed. + */ int iommu_attach_device(struct iommu_domain *domain, struct device *dev) { struct iommu_group *group; @@ -2075,6 +2114,18 @@ static int __iommu_attach_group(struct iommu_domain *domain, return ret; } +/** + * iommu_attach_group - Attach an IOMMU domain to an IOMMU group + * @domain: IOMMU domain to attach + * @group: IOMMU group that will be attached + * + * Returns 0 on success and error code on failure + * + * Note that EINVAL can be treated as a soft failure, indicating + * that certain configuration of the domain is incompatible with + * the group. In this case attaching a different domain to the + * group may succeed. + */ int iommu_attach_group(struct iommu_domain *domain, struct iommu_group *group) { int ret; @@ -2726,98 +2777,6 @@ int iommu_dev_disable_feature(struct device *dev, enum iommu_dev_features feat) } EXPORT_SYMBOL_GPL(iommu_dev_disable_feature); -/** - * iommu_sva_bind_device() - Bind a process address space to a device - * @dev: the device - * @mm: the mm to bind, caller must hold a reference to it - * @drvdata: opaque data pointer to pass to bind callback - * - * Create a bond between device and address space, allowing the device to access - * the mm using the returned PASID. If a bond already exists between @device and - * @mm, it is returned and an additional reference is taken. Caller must call - * iommu_sva_unbind_device() to release each reference. - * - * iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_SVA) must be called first, to - * initialize the required SVA features. - * - * On error, returns an ERR_PTR value. - */ -struct iommu_sva * -iommu_sva_bind_device(struct device *dev, struct mm_struct *mm, void *drvdata) -{ - struct iommu_group *group; - struct iommu_sva *handle = ERR_PTR(-EINVAL); - const struct iommu_ops *ops = dev_iommu_ops(dev); - - if (!ops->sva_bind) - return ERR_PTR(-ENODEV); - - group = iommu_group_get(dev); - if (!group) - return ERR_PTR(-ENODEV); - - /* Ensure device count and domain don't change while we're binding */ - mutex_lock(&group->mutex); - - /* - * To keep things simple, SVA currently doesn't support IOMMU groups - * with more than one device. Existing SVA-capable systems are not - * affected by the problems that required IOMMU groups (lack of ACS - * isolation, device ID aliasing and other hardware issues). - */ - if (iommu_group_device_count(group) != 1) - goto out_unlock; - - handle = ops->sva_bind(dev, mm, drvdata); - -out_unlock: - mutex_unlock(&group->mutex); - iommu_group_put(group); - - return handle; -} -EXPORT_SYMBOL_GPL(iommu_sva_bind_device); - -/** - * iommu_sva_unbind_device() - Remove a bond created with iommu_sva_bind_device - * @handle: the handle returned by iommu_sva_bind_device() - * - * Put reference to a bond between device and address space. The device should - * not be issuing any more transaction for this PASID. All outstanding page - * requests for this PASID must have been flushed to the IOMMU. - */ -void iommu_sva_unbind_device(struct iommu_sva *handle) -{ - struct iommu_group *group; - struct device *dev = handle->dev; - const struct iommu_ops *ops = dev_iommu_ops(dev); - - if (!ops->sva_unbind) - return; - - group = iommu_group_get(dev); - if (!group) - return; - - mutex_lock(&group->mutex); - ops->sva_unbind(handle); - mutex_unlock(&group->mutex); - - iommu_group_put(group); -} -EXPORT_SYMBOL_GPL(iommu_sva_unbind_device); - -u32 iommu_sva_get_pasid(struct iommu_sva *handle) -{ - const struct iommu_ops *ops = dev_iommu_ops(handle->dev); - - if (!ops->sva_get_pasid) - return IOMMU_PASID_INVALID; - - return ops->sva_get_pasid(handle); -} -EXPORT_SYMBOL_GPL(iommu_sva_get_pasid); - /* * Changes the default domain of an iommu group that has *only* one device * @@ -3087,7 +3046,8 @@ int iommu_device_use_default_domain(struct device *dev) mutex_lock(&group->mutex); if (group->owner_cnt) { - if (group->owner || !iommu_is_default_domain(group)) { + if (group->owner || !iommu_is_default_domain(group) || + !xa_empty(&group->pasid_array)) { ret = -EBUSY; goto unlock_out; } @@ -3118,7 +3078,7 @@ void iommu_device_unuse_default_domain(struct device *dev) return; mutex_lock(&group->mutex); - if (!WARN_ON(!group->owner_cnt)) + if (!WARN_ON(!group->owner_cnt || !xa_empty(&group->pasid_array))) group->owner_cnt--; mutex_unlock(&group->mutex); @@ -3148,40 +3108,49 @@ static int __iommu_group_alloc_blocking_domain(struct iommu_group *group) return 0; } +static int __iommu_take_dma_ownership(struct iommu_group *group, void *owner) +{ + int ret; + + if ((group->domain && group->domain != group->default_domain) || + !xa_empty(&group->pasid_array)) + return -EBUSY; + + ret = __iommu_group_alloc_blocking_domain(group); + if (ret) + return ret; + ret = __iommu_group_set_domain(group, group->blocking_domain); + if (ret) + return ret; + + group->owner = owner; + group->owner_cnt++; + return 0; +} + /** * iommu_group_claim_dma_owner() - Set DMA ownership of a group * @group: The group. * @owner: Caller specified pointer. Used for exclusive ownership. * - * This is to support backward compatibility for vfio which manages - * the dma ownership in iommu_group level. New invocations on this - * interface should be prohibited. + * This is to support backward compatibility for vfio which manages the dma + * ownership in iommu_group level. New invocations on this interface should be + * prohibited. Only a single owner may exist for a group. */ int iommu_group_claim_dma_owner(struct iommu_group *group, void *owner) { int ret = 0; + if (WARN_ON(!owner)) + return -EINVAL; + mutex_lock(&group->mutex); if (group->owner_cnt) { ret = -EPERM; goto unlock_out; - } else { - if (group->domain && group->domain != group->default_domain) { - ret = -EBUSY; - goto unlock_out; - } - - ret = __iommu_group_alloc_blocking_domain(group); - if (ret) - goto unlock_out; - - ret = __iommu_group_set_domain(group, group->blocking_domain); - if (ret) - goto unlock_out; - group->owner = owner; } - group->owner_cnt++; + ret = __iommu_take_dma_ownership(group, owner); unlock_out: mutex_unlock(&group->mutex); @@ -3190,30 +3159,92 @@ unlock_out: EXPORT_SYMBOL_GPL(iommu_group_claim_dma_owner); /** - * iommu_group_release_dma_owner() - Release DMA ownership of a group - * @group: The group. + * iommu_device_claim_dma_owner() - Set DMA ownership of a device + * @dev: The device. + * @owner: Caller specified pointer. Used for exclusive ownership. * - * Release the DMA ownership claimed by iommu_group_claim_dma_owner(). + * Claim the DMA ownership of a device. Multiple devices in the same group may + * concurrently claim ownership if they present the same owner value. Returns 0 + * on success and error code on failure */ -void iommu_group_release_dma_owner(struct iommu_group *group) +int iommu_device_claim_dma_owner(struct device *dev, void *owner) { - int ret; + struct iommu_group *group = iommu_group_get(dev); + int ret = 0; + + if (!group) + return -ENODEV; + if (WARN_ON(!owner)) + return -EINVAL; mutex_lock(&group->mutex); - if (WARN_ON(!group->owner_cnt || !group->owner)) + if (group->owner_cnt) { + if (group->owner != owner) { + ret = -EPERM; + goto unlock_out; + } + group->owner_cnt++; goto unlock_out; + } + + ret = __iommu_take_dma_ownership(group, owner); +unlock_out: + mutex_unlock(&group->mutex); + iommu_group_put(group); + + return ret; +} +EXPORT_SYMBOL_GPL(iommu_device_claim_dma_owner); + +static void __iommu_release_dma_ownership(struct iommu_group *group) +{ + int ret; + + if (WARN_ON(!group->owner_cnt || !group->owner || + !xa_empty(&group->pasid_array))) + return; group->owner_cnt = 0; group->owner = NULL; ret = __iommu_group_set_domain(group, group->default_domain); WARN(ret, "iommu driver failed to attach the default domain"); +} -unlock_out: +/** + * iommu_group_release_dma_owner() - Release DMA ownership of a group + * @dev: The device + * + * Release the DMA ownership claimed by iommu_group_claim_dma_owner(). + */ +void iommu_group_release_dma_owner(struct iommu_group *group) +{ + mutex_lock(&group->mutex); + __iommu_release_dma_ownership(group); mutex_unlock(&group->mutex); } EXPORT_SYMBOL_GPL(iommu_group_release_dma_owner); /** + * iommu_device_release_dma_owner() - Release DMA ownership of a device + * @group: The device. + * + * Release the DMA ownership claimed by iommu_device_claim_dma_owner(). + */ +void iommu_device_release_dma_owner(struct device *dev) +{ + struct iommu_group *group = iommu_group_get(dev); + + mutex_lock(&group->mutex); + if (group->owner_cnt > 1) + group->owner_cnt--; + else + __iommu_release_dma_ownership(group); + mutex_unlock(&group->mutex); + iommu_group_put(group); +} +EXPORT_SYMBOL_GPL(iommu_device_release_dma_owner); + +/** * iommu_group_dma_owner_claimed() - Query group dma ownership status * @group: The group. * @@ -3231,3 +3262,150 @@ bool iommu_group_dma_owner_claimed(struct iommu_group *group) return user; } EXPORT_SYMBOL_GPL(iommu_group_dma_owner_claimed); + +static int __iommu_set_group_pasid(struct iommu_domain *domain, + struct iommu_group *group, ioasid_t pasid) +{ + struct group_device *device; + int ret = 0; + + list_for_each_entry(device, &group->devices, list) { + ret = domain->ops->set_dev_pasid(domain, device->dev, pasid); + if (ret) + break; + } + + return ret; +} + +static void __iommu_remove_group_pasid(struct iommu_group *group, + ioasid_t pasid) +{ + struct group_device *device; + const struct iommu_ops *ops; + + list_for_each_entry(device, &group->devices, list) { + ops = dev_iommu_ops(device->dev); + ops->remove_dev_pasid(device->dev, pasid); + } +} + +/* + * iommu_attach_device_pasid() - Attach a domain to pasid of device + * @domain: the iommu domain. + * @dev: the attached device. + * @pasid: the pasid of the device. + * + * Return: 0 on success, or an error. + */ +int iommu_attach_device_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t pasid) +{ + struct iommu_group *group; + void *curr; + int ret; + + if (!domain->ops->set_dev_pasid) + return -EOPNOTSUPP; + + group = iommu_group_get(dev); + if (!group) + return -ENODEV; + + mutex_lock(&group->mutex); + curr = xa_cmpxchg(&group->pasid_array, pasid, NULL, domain, GFP_KERNEL); + if (curr) { + ret = xa_err(curr) ? : -EBUSY; + goto out_unlock; + } + + ret = __iommu_set_group_pasid(domain, group, pasid); + if (ret) { + __iommu_remove_group_pasid(group, pasid); + xa_erase(&group->pasid_array, pasid); + } +out_unlock: + mutex_unlock(&group->mutex); + iommu_group_put(group); + + return ret; +} +EXPORT_SYMBOL_GPL(iommu_attach_device_pasid); + +/* + * iommu_detach_device_pasid() - Detach the domain from pasid of device + * @domain: the iommu domain. + * @dev: the attached device. + * @pasid: the pasid of the device. + * + * The @domain must have been attached to @pasid of the @dev with + * iommu_attach_device_pasid(). + */ +void iommu_detach_device_pasid(struct iommu_domain *domain, struct device *dev, + ioasid_t pasid) +{ + struct iommu_group *group = iommu_group_get(dev); + + mutex_lock(&group->mutex); + __iommu_remove_group_pasid(group, pasid); + WARN_ON(xa_erase(&group->pasid_array, pasid) != domain); + mutex_unlock(&group->mutex); + + iommu_group_put(group); +} +EXPORT_SYMBOL_GPL(iommu_detach_device_pasid); + +/* + * iommu_get_domain_for_dev_pasid() - Retrieve domain for @pasid of @dev + * @dev: the queried device + * @pasid: the pasid of the device + * @type: matched domain type, 0 for any match + * + * This is a variant of iommu_get_domain_for_dev(). It returns the existing + * domain attached to pasid of a device. Callers must hold a lock around this + * function, and both iommu_attach/detach_dev_pasid() whenever a domain of + * type is being manipulated. This API does not internally resolve races with + * attach/detach. + * + * Return: attached domain on success, NULL otherwise. + */ +struct iommu_domain *iommu_get_domain_for_dev_pasid(struct device *dev, + ioasid_t pasid, + unsigned int type) +{ + struct iommu_domain *domain; + struct iommu_group *group; + + group = iommu_group_get(dev); + if (!group) + return NULL; + + xa_lock(&group->pasid_array); + domain = xa_load(&group->pasid_array, pasid); + if (type && domain && domain->type != type) + domain = ERR_PTR(-EBUSY); + xa_unlock(&group->pasid_array); + iommu_group_put(group); + + return domain; +} +EXPORT_SYMBOL_GPL(iommu_get_domain_for_dev_pasid); + +struct iommu_domain *iommu_sva_domain_alloc(struct device *dev, + struct mm_struct *mm) +{ + const struct iommu_ops *ops = dev_iommu_ops(dev); + struct iommu_domain *domain; + + domain = ops->domain_alloc(IOMMU_DOMAIN_SVA); + if (!domain) + return NULL; + + domain->type = IOMMU_DOMAIN_SVA; + mmgrab(mm); + domain->mm = mm; + domain->iopf_handler = iommu_sva_handle_iopf; + domain->fault_data = mm; + + return domain; +} diff --git a/drivers/iommu/iommufd/Kconfig b/drivers/iommu/iommufd/Kconfig new file mode 100644 index 000000000000..8306616b6d81 --- /dev/null +++ b/drivers/iommu/iommufd/Kconfig @@ -0,0 +1,44 @@ +# SPDX-License-Identifier: GPL-2.0-only +config IOMMUFD + tristate "IOMMU Userspace API" + select INTERVAL_TREE + select INTERVAL_TREE_SPAN_ITER + select IOMMU_API + default n + help + Provides /dev/iommu, the user API to control the IOMMU subsystem as + it relates to managing IO page tables that point at user space memory. + + If you don't know what to do here, say N. + +if IOMMUFD +config IOMMUFD_VFIO_CONTAINER + bool "IOMMUFD provides the VFIO container /dev/vfio/vfio" + depends on VFIO && !VFIO_CONTAINER + default VFIO && !VFIO_CONTAINER + help + IOMMUFD will provide /dev/vfio/vfio instead of VFIO. This relies on + IOMMUFD providing compatibility emulation to give the same ioctls. + It provides an option to build a kernel with legacy VFIO components + removed. + + IOMMUFD VFIO container emulation is known to lack certain features + of the native VFIO container, such as no-IOMMU support, peer-to-peer + DMA mapping, PPC IOMMU support, as well as other potentially + undiscovered gaps. This option is currently intended for the + purpose of testing IOMMUFD with unmodified userspace supporting VFIO + and making use of the Type1 VFIO IOMMU backend. General purpose + enabling of this option is currently discouraged. + + Unless testing IOMMUFD, say N here. + +config IOMMUFD_TEST + bool "IOMMU Userspace API Test support" + depends on DEBUG_KERNEL + depends on FAULT_INJECTION + depends on RUNTIME_TESTING_MENU + default n + help + This is dangerous, do not enable unless running + tools/testing/selftests/iommu +endif diff --git a/drivers/iommu/iommufd/Makefile b/drivers/iommu/iommufd/Makefile new file mode 100644 index 000000000000..8aeba81800c5 --- /dev/null +++ b/drivers/iommu/iommufd/Makefile @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: GPL-2.0-only +iommufd-y := \ + device.o \ + hw_pagetable.o \ + io_pagetable.o \ + ioas.o \ + main.o \ + pages.o \ + vfio_compat.o + +iommufd-$(CONFIG_IOMMUFD_TEST) += selftest.o + +obj-$(CONFIG_IOMMUFD) += iommufd.o diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c new file mode 100644 index 000000000000..d81f93a321af --- /dev/null +++ b/drivers/iommu/iommufd/device.c @@ -0,0 +1,772 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES + */ +#include <linux/iommufd.h> +#include <linux/slab.h> +#include <linux/iommu.h> +#include <linux/irqdomain.h> + +#include "io_pagetable.h" +#include "iommufd_private.h" + +static bool allow_unsafe_interrupts; +module_param(allow_unsafe_interrupts, bool, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC( + allow_unsafe_interrupts, + "Allow IOMMUFD to bind to devices even if the platform cannot isolate " + "the MSI interrupt window. Enabling this is a security weakness."); + +/* + * A iommufd_device object represents the binding relationship between a + * consuming driver and the iommufd. These objects are created/destroyed by + * external drivers, not by userspace. + */ +struct iommufd_device { + struct iommufd_object obj; + struct iommufd_ctx *ictx; + struct iommufd_hw_pagetable *hwpt; + /* Head at iommufd_hw_pagetable::devices */ + struct list_head devices_item; + /* always the physical device */ + struct device *dev; + struct iommu_group *group; + bool enforce_cache_coherency; +}; + +void iommufd_device_destroy(struct iommufd_object *obj) +{ + struct iommufd_device *idev = + container_of(obj, struct iommufd_device, obj); + + iommu_device_release_dma_owner(idev->dev); + iommu_group_put(idev->group); + iommufd_ctx_put(idev->ictx); +} + +/** + * iommufd_device_bind - Bind a physical device to an iommu fd + * @ictx: iommufd file descriptor + * @dev: Pointer to a physical device struct + * @id: Output ID number to return to userspace for this device + * + * A successful bind establishes an ownership over the device and returns + * struct iommufd_device pointer, otherwise returns error pointer. + * + * A driver using this API must set driver_managed_dma and must not touch + * the device until this routine succeeds and establishes ownership. + * + * Binding a PCI device places the entire RID under iommufd control. + * + * The caller must undo this with iommufd_device_unbind() + */ +struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx, + struct device *dev, u32 *id) +{ + struct iommufd_device *idev; + struct iommu_group *group; + int rc; + + /* + * iommufd always sets IOMMU_CACHE because we offer no way for userspace + * to restore cache coherency. + */ + if (!device_iommu_capable(dev, IOMMU_CAP_CACHE_COHERENCY)) + return ERR_PTR(-EINVAL); + + group = iommu_group_get(dev); + if (!group) + return ERR_PTR(-ENODEV); + + rc = iommu_device_claim_dma_owner(dev, ictx); + if (rc) + goto out_group_put; + + idev = iommufd_object_alloc(ictx, idev, IOMMUFD_OBJ_DEVICE); + if (IS_ERR(idev)) { + rc = PTR_ERR(idev); + goto out_release_owner; + } + idev->ictx = ictx; + iommufd_ctx_get(ictx); + idev->dev = dev; + idev->enforce_cache_coherency = + device_iommu_capable(dev, IOMMU_CAP_ENFORCE_CACHE_COHERENCY); + /* The calling driver is a user until iommufd_device_unbind() */ + refcount_inc(&idev->obj.users); + /* group refcount moves into iommufd_device */ + idev->group = group; + + /* + * If the caller fails after this success it must call + * iommufd_unbind_device() which is safe since we hold this refcount. + * This also means the device is a leaf in the graph and no other object + * can take a reference on it. + */ + iommufd_object_finalize(ictx, &idev->obj); + *id = idev->obj.id; + return idev; + +out_release_owner: + iommu_device_release_dma_owner(dev); +out_group_put: + iommu_group_put(group); + return ERR_PTR(rc); +} +EXPORT_SYMBOL_NS_GPL(iommufd_device_bind, IOMMUFD); + +/** + * iommufd_device_unbind - Undo iommufd_device_bind() + * @idev: Device returned by iommufd_device_bind() + * + * Release the device from iommufd control. The DMA ownership will return back + * to unowned with DMA controlled by the DMA API. This invalidates the + * iommufd_device pointer, other APIs that consume it must not be called + * concurrently. + */ +void iommufd_device_unbind(struct iommufd_device *idev) +{ + bool was_destroyed; + + was_destroyed = iommufd_object_destroy_user(idev->ictx, &idev->obj); + WARN_ON(!was_destroyed); +} +EXPORT_SYMBOL_NS_GPL(iommufd_device_unbind, IOMMUFD); + +static int iommufd_device_setup_msi(struct iommufd_device *idev, + struct iommufd_hw_pagetable *hwpt, + phys_addr_t sw_msi_start) +{ + int rc; + + /* + * If the IOMMU driver gives a IOMMU_RESV_SW_MSI then it is asking us to + * call iommu_get_msi_cookie() on its behalf. This is necessary to setup + * the MSI window so iommu_dma_prepare_msi() can install pages into our + * domain after request_irq(). If it is not done interrupts will not + * work on this domain. + * + * FIXME: This is conceptually broken for iommufd since we want to allow + * userspace to change the domains, eg switch from an identity IOAS to a + * DMA IOAS. There is currently no way to create a MSI window that + * matches what the IRQ layer actually expects in a newly created + * domain. + */ + if (sw_msi_start != PHYS_ADDR_MAX && !hwpt->msi_cookie) { + rc = iommu_get_msi_cookie(hwpt->domain, sw_msi_start); + if (rc) + return rc; + + /* + * iommu_get_msi_cookie() can only be called once per domain, + * it returns -EBUSY on later calls. + */ + hwpt->msi_cookie = true; + } + + /* + * For historical compat with VFIO the insecure interrupt path is + * allowed if the module parameter is set. Insecure means that a MemWr + * operation from the device (eg a simple DMA) cannot trigger an + * interrupt outside this iommufd context. + */ + if (!device_iommu_capable(idev->dev, IOMMU_CAP_INTR_REMAP) && + !irq_domain_check_msi_remap()) { + if (!allow_unsafe_interrupts) + return -EPERM; + + dev_warn( + idev->dev, + "MSI interrupts are not secure, they cannot be isolated by the platform. " + "Check that platform features like interrupt remapping are enabled. " + "Use the \"allow_unsafe_interrupts\" module parameter to override\n"); + } + return 0; +} + +static bool iommufd_hw_pagetable_has_group(struct iommufd_hw_pagetable *hwpt, + struct iommu_group *group) +{ + struct iommufd_device *cur_dev; + + list_for_each_entry(cur_dev, &hwpt->devices, devices_item) + if (cur_dev->group == group) + return true; + return false; +} + +static int iommufd_device_do_attach(struct iommufd_device *idev, + struct iommufd_hw_pagetable *hwpt) +{ + phys_addr_t sw_msi_start = PHYS_ADDR_MAX; + int rc; + + mutex_lock(&hwpt->devices_lock); + + /* + * Try to upgrade the domain we have, it is an iommu driver bug to + * report IOMMU_CAP_ENFORCE_CACHE_COHERENCY but fail + * enforce_cache_coherency when there are no devices attached to the + * domain. + */ + if (idev->enforce_cache_coherency && !hwpt->enforce_cache_coherency) { + if (hwpt->domain->ops->enforce_cache_coherency) + hwpt->enforce_cache_coherency = + hwpt->domain->ops->enforce_cache_coherency( + hwpt->domain); + if (!hwpt->enforce_cache_coherency) { + WARN_ON(list_empty(&hwpt->devices)); + rc = -EINVAL; + goto out_unlock; + } + } + + rc = iopt_table_enforce_group_resv_regions(&hwpt->ioas->iopt, idev->dev, + idev->group, &sw_msi_start); + if (rc) + goto out_unlock; + + rc = iommufd_device_setup_msi(idev, hwpt, sw_msi_start); + if (rc) + goto out_iova; + + /* + * FIXME: Hack around missing a device-centric iommu api, only attach to + * the group once for the first device that is in the group. + */ + if (!iommufd_hw_pagetable_has_group(hwpt, idev->group)) { + rc = iommu_attach_group(hwpt->domain, idev->group); + if (rc) + goto out_iova; + + if (list_empty(&hwpt->devices)) { + rc = iopt_table_add_domain(&hwpt->ioas->iopt, + hwpt->domain); + if (rc) + goto out_detach; + } + } + + idev->hwpt = hwpt; + refcount_inc(&hwpt->obj.users); + list_add(&idev->devices_item, &hwpt->devices); + mutex_unlock(&hwpt->devices_lock); + return 0; + +out_detach: + iommu_detach_group(hwpt->domain, idev->group); +out_iova: + iopt_remove_reserved_iova(&hwpt->ioas->iopt, idev->dev); +out_unlock: + mutex_unlock(&hwpt->devices_lock); + return rc; +} + +/* + * When automatically managing the domains we search for a compatible domain in + * the iopt and if one is found use it, otherwise create a new domain. + * Automatic domain selection will never pick a manually created domain. + */ +static int iommufd_device_auto_get_domain(struct iommufd_device *idev, + struct iommufd_ioas *ioas) +{ + struct iommufd_hw_pagetable *hwpt; + int rc; + + /* + * There is no differentiation when domains are allocated, so any domain + * that is willing to attach to the device is interchangeable with any + * other. + */ + mutex_lock(&ioas->mutex); + list_for_each_entry(hwpt, &ioas->hwpt_list, hwpt_item) { + if (!hwpt->auto_domain) + continue; + + rc = iommufd_device_do_attach(idev, hwpt); + + /* + * -EINVAL means the domain is incompatible with the device. + * Other error codes should propagate to userspace as failure. + * Success means the domain is attached. + */ + if (rc == -EINVAL) + continue; + goto out_unlock; + } + + hwpt = iommufd_hw_pagetable_alloc(idev->ictx, ioas, idev->dev); + if (IS_ERR(hwpt)) { + rc = PTR_ERR(hwpt); + goto out_unlock; + } + hwpt->auto_domain = true; + + rc = iommufd_device_do_attach(idev, hwpt); + if (rc) + goto out_abort; + list_add_tail(&hwpt->hwpt_item, &ioas->hwpt_list); + + mutex_unlock(&ioas->mutex); + iommufd_object_finalize(idev->ictx, &hwpt->obj); + return 0; + +out_abort: + iommufd_object_abort_and_destroy(idev->ictx, &hwpt->obj); +out_unlock: + mutex_unlock(&ioas->mutex); + return rc; +} + +/** + * iommufd_device_attach - Connect a device from an iommu_domain + * @idev: device to attach + * @pt_id: Input a IOMMUFD_OBJ_IOAS, or IOMMUFD_OBJ_HW_PAGETABLE + * Output the IOMMUFD_OBJ_HW_PAGETABLE ID + * + * This connects the device to an iommu_domain, either automatically or manually + * selected. Once this completes the device could do DMA. + * + * The caller should return the resulting pt_id back to userspace. + * This function is undone by calling iommufd_device_detach(). + */ +int iommufd_device_attach(struct iommufd_device *idev, u32 *pt_id) +{ + struct iommufd_object *pt_obj; + int rc; + + pt_obj = iommufd_get_object(idev->ictx, *pt_id, IOMMUFD_OBJ_ANY); + if (IS_ERR(pt_obj)) + return PTR_ERR(pt_obj); + + switch (pt_obj->type) { + case IOMMUFD_OBJ_HW_PAGETABLE: { + struct iommufd_hw_pagetable *hwpt = + container_of(pt_obj, struct iommufd_hw_pagetable, obj); + + rc = iommufd_device_do_attach(idev, hwpt); + if (rc) + goto out_put_pt_obj; + + mutex_lock(&hwpt->ioas->mutex); + list_add_tail(&hwpt->hwpt_item, &hwpt->ioas->hwpt_list); + mutex_unlock(&hwpt->ioas->mutex); + break; + } + case IOMMUFD_OBJ_IOAS: { + struct iommufd_ioas *ioas = + container_of(pt_obj, struct iommufd_ioas, obj); + + rc = iommufd_device_auto_get_domain(idev, ioas); + if (rc) + goto out_put_pt_obj; + break; + } + default: + rc = -EINVAL; + goto out_put_pt_obj; + } + + refcount_inc(&idev->obj.users); + *pt_id = idev->hwpt->obj.id; + rc = 0; + +out_put_pt_obj: + iommufd_put_object(pt_obj); + return rc; +} +EXPORT_SYMBOL_NS_GPL(iommufd_device_attach, IOMMUFD); + +/** + * iommufd_device_detach - Disconnect a device to an iommu_domain + * @idev: device to detach + * + * Undo iommufd_device_attach(). This disconnects the idev from the previously + * attached pt_id. The device returns back to a blocked DMA translation. + */ +void iommufd_device_detach(struct iommufd_device *idev) +{ + struct iommufd_hw_pagetable *hwpt = idev->hwpt; + + mutex_lock(&hwpt->ioas->mutex); + mutex_lock(&hwpt->devices_lock); + list_del(&idev->devices_item); + if (!iommufd_hw_pagetable_has_group(hwpt, idev->group)) { + if (list_empty(&hwpt->devices)) { + iopt_table_remove_domain(&hwpt->ioas->iopt, + hwpt->domain); + list_del(&hwpt->hwpt_item); + } + iommu_detach_group(hwpt->domain, idev->group); + } + iopt_remove_reserved_iova(&hwpt->ioas->iopt, idev->dev); + mutex_unlock(&hwpt->devices_lock); + mutex_unlock(&hwpt->ioas->mutex); + + if (hwpt->auto_domain) + iommufd_object_destroy_user(idev->ictx, &hwpt->obj); + else + refcount_dec(&hwpt->obj.users); + + idev->hwpt = NULL; + + refcount_dec(&idev->obj.users); +} +EXPORT_SYMBOL_NS_GPL(iommufd_device_detach, IOMMUFD); + +void iommufd_access_destroy_object(struct iommufd_object *obj) +{ + struct iommufd_access *access = + container_of(obj, struct iommufd_access, obj); + + iopt_remove_access(&access->ioas->iopt, access); + iommufd_ctx_put(access->ictx); + refcount_dec(&access->ioas->obj.users); +} + +/** + * iommufd_access_create - Create an iommufd_access + * @ictx: iommufd file descriptor + * @ioas_id: ID for a IOMMUFD_OBJ_IOAS + * @ops: Driver's ops to associate with the access + * @data: Opaque data to pass into ops functions + * + * An iommufd_access allows a driver to read/write to the IOAS without using + * DMA. The underlying CPU memory can be accessed using the + * iommufd_access_pin_pages() or iommufd_access_rw() functions. + * + * The provided ops are required to use iommufd_access_pin_pages(). + */ +struct iommufd_access * +iommufd_access_create(struct iommufd_ctx *ictx, u32 ioas_id, + const struct iommufd_access_ops *ops, void *data) +{ + struct iommufd_access *access; + struct iommufd_object *obj; + int rc; + + /* + * There is no uAPI for the access object, but to keep things symmetric + * use the object infrastructure anyhow. + */ + access = iommufd_object_alloc(ictx, access, IOMMUFD_OBJ_ACCESS); + if (IS_ERR(access)) + return access; + + access->data = data; + access->ops = ops; + + obj = iommufd_get_object(ictx, ioas_id, IOMMUFD_OBJ_IOAS); + if (IS_ERR(obj)) { + rc = PTR_ERR(obj); + goto out_abort; + } + access->ioas = container_of(obj, struct iommufd_ioas, obj); + iommufd_ref_to_users(obj); + + if (ops->needs_pin_pages) + access->iova_alignment = PAGE_SIZE; + else + access->iova_alignment = 1; + rc = iopt_add_access(&access->ioas->iopt, access); + if (rc) + goto out_put_ioas; + + /* The calling driver is a user until iommufd_access_destroy() */ + refcount_inc(&access->obj.users); + access->ictx = ictx; + iommufd_ctx_get(ictx); + iommufd_object_finalize(ictx, &access->obj); + return access; +out_put_ioas: + refcount_dec(&access->ioas->obj.users); +out_abort: + iommufd_object_abort(ictx, &access->obj); + return ERR_PTR(rc); +} +EXPORT_SYMBOL_NS_GPL(iommufd_access_create, IOMMUFD); + +/** + * iommufd_access_destroy - Destroy an iommufd_access + * @access: The access to destroy + * + * The caller must stop using the access before destroying it. + */ +void iommufd_access_destroy(struct iommufd_access *access) +{ + bool was_destroyed; + + was_destroyed = iommufd_object_destroy_user(access->ictx, &access->obj); + WARN_ON(!was_destroyed); +} +EXPORT_SYMBOL_NS_GPL(iommufd_access_destroy, IOMMUFD); + +/** + * iommufd_access_notify_unmap - Notify users of an iopt to stop using it + * @iopt: iopt to work on + * @iova: Starting iova in the iopt + * @length: Number of bytes + * + * After this function returns there should be no users attached to the pages + * linked to this iopt that intersect with iova,length. Anyone that has attached + * a user through iopt_access_pages() needs to detach it through + * iommufd_access_unpin_pages() before this function returns. + * + * iommufd_access_destroy() will wait for any outstanding unmap callback to + * complete. Once iommufd_access_destroy() no unmap ops are running or will + * run in the future. Due to this a driver must not create locking that prevents + * unmap to complete while iommufd_access_destroy() is running. + */ +void iommufd_access_notify_unmap(struct io_pagetable *iopt, unsigned long iova, + unsigned long length) +{ + struct iommufd_ioas *ioas = + container_of(iopt, struct iommufd_ioas, iopt); + struct iommufd_access *access; + unsigned long index; + + xa_lock(&ioas->iopt.access_list); + xa_for_each(&ioas->iopt.access_list, index, access) { + if (!iommufd_lock_obj(&access->obj)) + continue; + xa_unlock(&ioas->iopt.access_list); + + access->ops->unmap(access->data, iova, length); + + iommufd_put_object(&access->obj); + xa_lock(&ioas->iopt.access_list); + } + xa_unlock(&ioas->iopt.access_list); +} + +/** + * iommufd_access_unpin_pages() - Undo iommufd_access_pin_pages + * @access: IOAS access to act on + * @iova: Starting IOVA + * @length: Number of bytes to access + * + * Return the struct page's. The caller must stop accessing them before calling + * this. The iova/length must exactly match the one provided to access_pages. + */ +void iommufd_access_unpin_pages(struct iommufd_access *access, + unsigned long iova, unsigned long length) +{ + struct io_pagetable *iopt = &access->ioas->iopt; + struct iopt_area_contig_iter iter; + unsigned long last_iova; + struct iopt_area *area; + + if (WARN_ON(!length) || + WARN_ON(check_add_overflow(iova, length - 1, &last_iova))) + return; + + down_read(&iopt->iova_rwsem); + iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) + iopt_area_remove_access( + area, iopt_area_iova_to_index(area, iter.cur_iova), + iopt_area_iova_to_index( + area, + min(last_iova, iopt_area_last_iova(area)))); + up_read(&iopt->iova_rwsem); + WARN_ON(!iopt_area_contig_done(&iter)); +} +EXPORT_SYMBOL_NS_GPL(iommufd_access_unpin_pages, IOMMUFD); + +static bool iopt_area_contig_is_aligned(struct iopt_area_contig_iter *iter) +{ + if (iopt_area_start_byte(iter->area, iter->cur_iova) % PAGE_SIZE) + return false; + + if (!iopt_area_contig_done(iter) && + (iopt_area_start_byte(iter->area, iopt_area_last_iova(iter->area)) % + PAGE_SIZE) != (PAGE_SIZE - 1)) + return false; + return true; +} + +static bool check_area_prot(struct iopt_area *area, unsigned int flags) +{ + if (flags & IOMMUFD_ACCESS_RW_WRITE) + return area->iommu_prot & IOMMU_WRITE; + return area->iommu_prot & IOMMU_READ; +} + +/** + * iommufd_access_pin_pages() - Return a list of pages under the iova + * @access: IOAS access to act on + * @iova: Starting IOVA + * @length: Number of bytes to access + * @out_pages: Output page list + * @flags: IOPMMUFD_ACCESS_RW_* flags + * + * Reads @length bytes starting at iova and returns the struct page * pointers. + * These can be kmap'd by the caller for CPU access. + * + * The caller must perform iommufd_access_unpin_pages() when done to balance + * this. + * + * This API always requires a page aligned iova. This happens naturally if the + * ioas alignment is >= PAGE_SIZE and the iova is PAGE_SIZE aligned. However + * smaller alignments have corner cases where this API can fail on otherwise + * aligned iova. + */ +int iommufd_access_pin_pages(struct iommufd_access *access, unsigned long iova, + unsigned long length, struct page **out_pages, + unsigned int flags) +{ + struct io_pagetable *iopt = &access->ioas->iopt; + struct iopt_area_contig_iter iter; + unsigned long last_iova; + struct iopt_area *area; + int rc; + + /* Driver's ops don't support pin_pages */ + if (IS_ENABLED(CONFIG_IOMMUFD_TEST) && + WARN_ON(access->iova_alignment != PAGE_SIZE || !access->ops->unmap)) + return -EINVAL; + + if (!length) + return -EINVAL; + if (check_add_overflow(iova, length - 1, &last_iova)) + return -EOVERFLOW; + + down_read(&iopt->iova_rwsem); + iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) { + unsigned long last = min(last_iova, iopt_area_last_iova(area)); + unsigned long last_index = iopt_area_iova_to_index(area, last); + unsigned long index = + iopt_area_iova_to_index(area, iter.cur_iova); + + if (area->prevent_access || + !iopt_area_contig_is_aligned(&iter)) { + rc = -EINVAL; + goto err_remove; + } + + if (!check_area_prot(area, flags)) { + rc = -EPERM; + goto err_remove; + } + + rc = iopt_area_add_access(area, index, last_index, out_pages, + flags); + if (rc) + goto err_remove; + out_pages += last_index - index + 1; + } + if (!iopt_area_contig_done(&iter)) { + rc = -ENOENT; + goto err_remove; + } + + up_read(&iopt->iova_rwsem); + return 0; + +err_remove: + if (iova < iter.cur_iova) { + last_iova = iter.cur_iova - 1; + iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) + iopt_area_remove_access( + area, + iopt_area_iova_to_index(area, iter.cur_iova), + iopt_area_iova_to_index( + area, min(last_iova, + iopt_area_last_iova(area)))); + } + up_read(&iopt->iova_rwsem); + return rc; +} +EXPORT_SYMBOL_NS_GPL(iommufd_access_pin_pages, IOMMUFD); + +/** + * iommufd_access_rw - Read or write data under the iova + * @access: IOAS access to act on + * @iova: Starting IOVA + * @data: Kernel buffer to copy to/from + * @length: Number of bytes to access + * @flags: IOMMUFD_ACCESS_RW_* flags + * + * Copy kernel to/from data into the range given by IOVA/length. If flags + * indicates IOMMUFD_ACCESS_RW_KTHREAD then a large copy can be optimized + * by changing it into copy_to/from_user(). + */ +int iommufd_access_rw(struct iommufd_access *access, unsigned long iova, + void *data, size_t length, unsigned int flags) +{ + struct io_pagetable *iopt = &access->ioas->iopt; + struct iopt_area_contig_iter iter; + struct iopt_area *area; + unsigned long last_iova; + int rc; + + if (!length) + return -EINVAL; + if (check_add_overflow(iova, length - 1, &last_iova)) + return -EOVERFLOW; + + down_read(&iopt->iova_rwsem); + iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) { + unsigned long last = min(last_iova, iopt_area_last_iova(area)); + unsigned long bytes = (last - iter.cur_iova) + 1; + + if (area->prevent_access) { + rc = -EINVAL; + goto err_out; + } + + if (!check_area_prot(area, flags)) { + rc = -EPERM; + goto err_out; + } + + rc = iopt_pages_rw_access( + area->pages, iopt_area_start_byte(area, iter.cur_iova), + data, bytes, flags); + if (rc) + goto err_out; + data += bytes; + } + if (!iopt_area_contig_done(&iter)) + rc = -ENOENT; +err_out: + up_read(&iopt->iova_rwsem); + return rc; +} +EXPORT_SYMBOL_NS_GPL(iommufd_access_rw, IOMMUFD); + +#ifdef CONFIG_IOMMUFD_TEST +/* + * Creating a real iommufd_device is too hard, bypass creating a iommufd_device + * and go directly to attaching a domain. + */ +struct iommufd_hw_pagetable * +iommufd_device_selftest_attach(struct iommufd_ctx *ictx, + struct iommufd_ioas *ioas, + struct device *mock_dev) +{ + struct iommufd_hw_pagetable *hwpt; + int rc; + + hwpt = iommufd_hw_pagetable_alloc(ictx, ioas, mock_dev); + if (IS_ERR(hwpt)) + return hwpt; + + rc = iopt_table_add_domain(&hwpt->ioas->iopt, hwpt->domain); + if (rc) + goto out_hwpt; + + refcount_inc(&hwpt->obj.users); + iommufd_object_finalize(ictx, &hwpt->obj); + return hwpt; + +out_hwpt: + iommufd_object_abort_and_destroy(ictx, &hwpt->obj); + return ERR_PTR(rc); +} + +void iommufd_device_selftest_detach(struct iommufd_ctx *ictx, + struct iommufd_hw_pagetable *hwpt) +{ + iopt_table_remove_domain(&hwpt->ioas->iopt, hwpt->domain); + refcount_dec(&hwpt->obj.users); +} +#endif diff --git a/drivers/iommu/iommufd/double_span.h b/drivers/iommu/iommufd/double_span.h new file mode 100644 index 000000000000..b37aab7488c0 --- /dev/null +++ b/drivers/iommu/iommufd/double_span.h @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. + */ +#ifndef __IOMMUFD_DOUBLE_SPAN_H +#define __IOMMUFD_DOUBLE_SPAN_H + +#include <linux/interval_tree.h> + +/* + * This is a variation of the general interval_tree_span_iter that computes the + * spans over the union of two different interval trees. Used ranges are broken + * up and reported based on the tree that provides the interval. The first span + * always takes priority. Like interval_tree_span_iter it is greedy and the same + * value of is_used will not repeat on two iteration cycles. + */ +struct interval_tree_double_span_iter { + struct rb_root_cached *itrees[2]; + struct interval_tree_span_iter spans[2]; + union { + unsigned long start_hole; + unsigned long start_used; + }; + union { + unsigned long last_hole; + unsigned long last_used; + }; + /* 0 = hole, 1 = used span[0], 2 = used span[1], -1 done iteration */ + int is_used; +}; + +void interval_tree_double_span_iter_update( + struct interval_tree_double_span_iter *iter); +void interval_tree_double_span_iter_first( + struct interval_tree_double_span_iter *iter, + struct rb_root_cached *itree1, struct rb_root_cached *itree2, + unsigned long first_index, unsigned long last_index); +void interval_tree_double_span_iter_next( + struct interval_tree_double_span_iter *iter); + +static inline bool +interval_tree_double_span_iter_done(struct interval_tree_double_span_iter *state) +{ + return state->is_used == -1; +} + +#define interval_tree_for_each_double_span(span, itree1, itree2, first_index, \ + last_index) \ + for (interval_tree_double_span_iter_first(span, itree1, itree2, \ + first_index, last_index); \ + !interval_tree_double_span_iter_done(span); \ + interval_tree_double_span_iter_next(span)) + +#endif diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c new file mode 100644 index 000000000000..43d473989a06 --- /dev/null +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -0,0 +1,57 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES + */ +#include <linux/iommu.h> + +#include "iommufd_private.h" + +void iommufd_hw_pagetable_destroy(struct iommufd_object *obj) +{ + struct iommufd_hw_pagetable *hwpt = + container_of(obj, struct iommufd_hw_pagetable, obj); + + WARN_ON(!list_empty(&hwpt->devices)); + + iommu_domain_free(hwpt->domain); + refcount_dec(&hwpt->ioas->obj.users); + mutex_destroy(&hwpt->devices_lock); +} + +/** + * iommufd_hw_pagetable_alloc() - Get an iommu_domain for a device + * @ictx: iommufd context + * @ioas: IOAS to associate the domain with + * @dev: Device to get an iommu_domain for + * + * Allocate a new iommu_domain and return it as a hw_pagetable. + */ +struct iommufd_hw_pagetable * +iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, + struct device *dev) +{ + struct iommufd_hw_pagetable *hwpt; + int rc; + + hwpt = iommufd_object_alloc(ictx, hwpt, IOMMUFD_OBJ_HW_PAGETABLE); + if (IS_ERR(hwpt)) + return hwpt; + + hwpt->domain = iommu_domain_alloc(dev->bus); + if (!hwpt->domain) { + rc = -ENOMEM; + goto out_abort; + } + + INIT_LIST_HEAD(&hwpt->devices); + INIT_LIST_HEAD(&hwpt->hwpt_item); + mutex_init(&hwpt->devices_lock); + /* Pairs with iommufd_hw_pagetable_destroy() */ + refcount_inc(&ioas->obj.users); + hwpt->ioas = ioas; + return hwpt; + +out_abort: + iommufd_object_abort(ictx, &hwpt->obj); + return ERR_PTR(rc); +} diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c new file mode 100644 index 000000000000..e0ae72b9e67f --- /dev/null +++ b/drivers/iommu/iommufd/io_pagetable.c @@ -0,0 +1,1216 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. + * + * The io_pagetable is the top of datastructure that maps IOVA's to PFNs. The + * PFNs can be placed into an iommu_domain, or returned to the caller as a page + * list for access by an in-kernel user. + * + * The datastructure uses the iopt_pages to optimize the storage of the PFNs + * between the domains and xarray. + */ +#include <linux/iommufd.h> +#include <linux/lockdep.h> +#include <linux/iommu.h> +#include <linux/sched/mm.h> +#include <linux/err.h> +#include <linux/slab.h> +#include <linux/errno.h> + +#include "io_pagetable.h" +#include "double_span.h" + +struct iopt_pages_list { + struct iopt_pages *pages; + struct iopt_area *area; + struct list_head next; + unsigned long start_byte; + unsigned long length; +}; + +struct iopt_area *iopt_area_contig_init(struct iopt_area_contig_iter *iter, + struct io_pagetable *iopt, + unsigned long iova, + unsigned long last_iova) +{ + lockdep_assert_held(&iopt->iova_rwsem); + + iter->cur_iova = iova; + iter->last_iova = last_iova; + iter->area = iopt_area_iter_first(iopt, iova, iova); + if (!iter->area) + return NULL; + if (!iter->area->pages) { + iter->area = NULL; + return NULL; + } + return iter->area; +} + +struct iopt_area *iopt_area_contig_next(struct iopt_area_contig_iter *iter) +{ + unsigned long last_iova; + + if (!iter->area) + return NULL; + last_iova = iopt_area_last_iova(iter->area); + if (iter->last_iova <= last_iova) + return NULL; + + iter->cur_iova = last_iova + 1; + iter->area = iopt_area_iter_next(iter->area, iter->cur_iova, + iter->last_iova); + if (!iter->area) + return NULL; + if (iter->cur_iova != iopt_area_iova(iter->area) || + !iter->area->pages) { + iter->area = NULL; + return NULL; + } + return iter->area; +} + +static bool __alloc_iova_check_hole(struct interval_tree_double_span_iter *span, + unsigned long length, + unsigned long iova_alignment, + unsigned long page_offset) +{ + if (span->is_used || span->last_hole - span->start_hole < length - 1) + return false; + + span->start_hole = ALIGN(span->start_hole, iova_alignment) | + page_offset; + if (span->start_hole > span->last_hole || + span->last_hole - span->start_hole < length - 1) + return false; + return true; +} + +static bool __alloc_iova_check_used(struct interval_tree_span_iter *span, + unsigned long length, + unsigned long iova_alignment, + unsigned long page_offset) +{ + if (span->is_hole || span->last_used - span->start_used < length - 1) + return false; + + span->start_used = ALIGN(span->start_used, iova_alignment) | + page_offset; + if (span->start_used > span->last_used || + span->last_used - span->start_used < length - 1) + return false; + return true; +} + +/* + * Automatically find a block of IOVA that is not being used and not reserved. + * Does not return a 0 IOVA even if it is valid. + */ +static int iopt_alloc_iova(struct io_pagetable *iopt, unsigned long *iova, + unsigned long uptr, unsigned long length) +{ + unsigned long page_offset = uptr % PAGE_SIZE; + struct interval_tree_double_span_iter used_span; + struct interval_tree_span_iter allowed_span; + unsigned long iova_alignment; + + lockdep_assert_held(&iopt->iova_rwsem); + + /* Protect roundup_pow-of_two() from overflow */ + if (length == 0 || length >= ULONG_MAX / 2) + return -EOVERFLOW; + + /* + * Keep alignment present in the uptr when building the IOVA, this + * increases the chance we can map a THP. + */ + if (!uptr) + iova_alignment = roundup_pow_of_two(length); + else + iova_alignment = min_t(unsigned long, + roundup_pow_of_two(length), + 1UL << __ffs64(uptr)); + + if (iova_alignment < iopt->iova_alignment) + return -EINVAL; + + interval_tree_for_each_span(&allowed_span, &iopt->allowed_itree, + PAGE_SIZE, ULONG_MAX - PAGE_SIZE) { + if (RB_EMPTY_ROOT(&iopt->allowed_itree.rb_root)) { + allowed_span.start_used = PAGE_SIZE; + allowed_span.last_used = ULONG_MAX - PAGE_SIZE; + allowed_span.is_hole = false; + } + + if (!__alloc_iova_check_used(&allowed_span, length, + iova_alignment, page_offset)) + continue; + + interval_tree_for_each_double_span( + &used_span, &iopt->reserved_itree, &iopt->area_itree, + allowed_span.start_used, allowed_span.last_used) { + if (!__alloc_iova_check_hole(&used_span, length, + iova_alignment, + page_offset)) + continue; + + *iova = used_span.start_hole; + return 0; + } + } + return -ENOSPC; +} + +static int iopt_check_iova(struct io_pagetable *iopt, unsigned long iova, + unsigned long length) +{ + unsigned long last; + + lockdep_assert_held(&iopt->iova_rwsem); + + if ((iova & (iopt->iova_alignment - 1))) + return -EINVAL; + + if (check_add_overflow(iova, length - 1, &last)) + return -EOVERFLOW; + + /* No reserved IOVA intersects the range */ + if (iopt_reserved_iter_first(iopt, iova, last)) + return -EINVAL; + + /* Check that there is not already a mapping in the range */ + if (iopt_area_iter_first(iopt, iova, last)) + return -EEXIST; + return 0; +} + +/* + * The area takes a slice of the pages from start_bytes to start_byte + length + */ +static int iopt_insert_area(struct io_pagetable *iopt, struct iopt_area *area, + struct iopt_pages *pages, unsigned long iova, + unsigned long start_byte, unsigned long length, + int iommu_prot) +{ + lockdep_assert_held_write(&iopt->iova_rwsem); + + if ((iommu_prot & IOMMU_WRITE) && !pages->writable) + return -EPERM; + + area->iommu_prot = iommu_prot; + area->page_offset = start_byte % PAGE_SIZE; + if (area->page_offset & (iopt->iova_alignment - 1)) + return -EINVAL; + + area->node.start = iova; + if (check_add_overflow(iova, length - 1, &area->node.last)) + return -EOVERFLOW; + + area->pages_node.start = start_byte / PAGE_SIZE; + if (check_add_overflow(start_byte, length - 1, &area->pages_node.last)) + return -EOVERFLOW; + area->pages_node.last = area->pages_node.last / PAGE_SIZE; + if (WARN_ON(area->pages_node.last >= pages->npages)) + return -EOVERFLOW; + + /* + * The area is inserted with a NULL pages indicating it is not fully + * initialized yet. + */ + area->iopt = iopt; + interval_tree_insert(&area->node, &iopt->area_itree); + return 0; +} + +static int iopt_alloc_area_pages(struct io_pagetable *iopt, + struct list_head *pages_list, + unsigned long length, unsigned long *dst_iova, + int iommu_prot, unsigned int flags) +{ + struct iopt_pages_list *elm; + unsigned long iova; + int rc = 0; + + list_for_each_entry(elm, pages_list, next) { + elm->area = kzalloc(sizeof(*elm->area), GFP_KERNEL_ACCOUNT); + if (!elm->area) + return -ENOMEM; + } + + down_write(&iopt->iova_rwsem); + if ((length & (iopt->iova_alignment - 1)) || !length) { + rc = -EINVAL; + goto out_unlock; + } + + if (flags & IOPT_ALLOC_IOVA) { + /* Use the first entry to guess the ideal IOVA alignment */ + elm = list_first_entry(pages_list, struct iopt_pages_list, + next); + rc = iopt_alloc_iova( + iopt, dst_iova, + (uintptr_t)elm->pages->uptr + elm->start_byte, length); + if (rc) + goto out_unlock; + if (IS_ENABLED(CONFIG_IOMMUFD_TEST) && + WARN_ON(iopt_check_iova(iopt, *dst_iova, length))) { + rc = -EINVAL; + goto out_unlock; + } + } else { + rc = iopt_check_iova(iopt, *dst_iova, length); + if (rc) + goto out_unlock; + } + + /* + * Areas are created with a NULL pages so that the IOVA space is + * reserved and we can unlock the iova_rwsem. + */ + iova = *dst_iova; + list_for_each_entry(elm, pages_list, next) { + rc = iopt_insert_area(iopt, elm->area, elm->pages, iova, + elm->start_byte, elm->length, iommu_prot); + if (rc) + goto out_unlock; + iova += elm->length; + } + +out_unlock: + up_write(&iopt->iova_rwsem); + return rc; +} + +static void iopt_abort_area(struct iopt_area *area) +{ + if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) + WARN_ON(area->pages); + if (area->iopt) { + down_write(&area->iopt->iova_rwsem); + interval_tree_remove(&area->node, &area->iopt->area_itree); + up_write(&area->iopt->iova_rwsem); + } + kfree(area); +} + +void iopt_free_pages_list(struct list_head *pages_list) +{ + struct iopt_pages_list *elm; + + while ((elm = list_first_entry_or_null(pages_list, + struct iopt_pages_list, next))) { + if (elm->area) + iopt_abort_area(elm->area); + if (elm->pages) + iopt_put_pages(elm->pages); + list_del(&elm->next); + kfree(elm); + } +} + +static int iopt_fill_domains_pages(struct list_head *pages_list) +{ + struct iopt_pages_list *undo_elm; + struct iopt_pages_list *elm; + int rc; + + list_for_each_entry(elm, pages_list, next) { + rc = iopt_area_fill_domains(elm->area, elm->pages); + if (rc) + goto err_undo; + } + return 0; + +err_undo: + list_for_each_entry(undo_elm, pages_list, next) { + if (undo_elm == elm) + break; + iopt_area_unfill_domains(undo_elm->area, undo_elm->pages); + } + return rc; +} + +int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list, + unsigned long length, unsigned long *dst_iova, + int iommu_prot, unsigned int flags) +{ + struct iopt_pages_list *elm; + int rc; + + rc = iopt_alloc_area_pages(iopt, pages_list, length, dst_iova, + iommu_prot, flags); + if (rc) + return rc; + + down_read(&iopt->domains_rwsem); + rc = iopt_fill_domains_pages(pages_list); + if (rc) + goto out_unlock_domains; + + down_write(&iopt->iova_rwsem); + list_for_each_entry(elm, pages_list, next) { + /* + * area->pages must be set inside the domains_rwsem to ensure + * any newly added domains will get filled. Moves the reference + * in from the list. + */ + elm->area->pages = elm->pages; + elm->pages = NULL; + elm->area = NULL; + } + up_write(&iopt->iova_rwsem); +out_unlock_domains: + up_read(&iopt->domains_rwsem); + return rc; +} + +/** + * iopt_map_user_pages() - Map a user VA to an iova in the io page table + * @ictx: iommufd_ctx the iopt is part of + * @iopt: io_pagetable to act on + * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains + * the chosen iova on output. Otherwise is the iova to map to on input + * @uptr: User VA to map + * @length: Number of bytes to map + * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping + * @flags: IOPT_ALLOC_IOVA or zero + * + * iova, uptr, and length must be aligned to iova_alignment. For domain backed + * page tables this will pin the pages and load them into the domain at iova. + * For non-domain page tables this will only setup a lazy reference and the + * caller must use iopt_access_pages() to touch them. + * + * iopt_unmap_iova() must be called to undo this before the io_pagetable can be + * destroyed. + */ +int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt, + unsigned long *iova, void __user *uptr, + unsigned long length, int iommu_prot, + unsigned int flags) +{ + struct iopt_pages_list elm = {}; + LIST_HEAD(pages_list); + int rc; + + elm.pages = iopt_alloc_pages(uptr, length, iommu_prot & IOMMU_WRITE); + if (IS_ERR(elm.pages)) + return PTR_ERR(elm.pages); + if (ictx->account_mode == IOPT_PAGES_ACCOUNT_MM && + elm.pages->account_mode == IOPT_PAGES_ACCOUNT_USER) + elm.pages->account_mode = IOPT_PAGES_ACCOUNT_MM; + elm.start_byte = uptr - elm.pages->uptr; + elm.length = length; + list_add(&elm.next, &pages_list); + + rc = iopt_map_pages(iopt, &pages_list, length, iova, iommu_prot, flags); + if (rc) { + if (elm.area) + iopt_abort_area(elm.area); + if (elm.pages) + iopt_put_pages(elm.pages); + return rc; + } + return 0; +} + +int iopt_get_pages(struct io_pagetable *iopt, unsigned long iova, + unsigned long length, struct list_head *pages_list) +{ + struct iopt_area_contig_iter iter; + unsigned long last_iova; + struct iopt_area *area; + int rc; + + if (!length) + return -EINVAL; + if (check_add_overflow(iova, length - 1, &last_iova)) + return -EOVERFLOW; + + down_read(&iopt->iova_rwsem); + iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) { + struct iopt_pages_list *elm; + unsigned long last = min(last_iova, iopt_area_last_iova(area)); + + elm = kzalloc(sizeof(*elm), GFP_KERNEL_ACCOUNT); + if (!elm) { + rc = -ENOMEM; + goto err_free; + } + elm->start_byte = iopt_area_start_byte(area, iter.cur_iova); + elm->pages = area->pages; + elm->length = (last - iter.cur_iova) + 1; + kref_get(&elm->pages->kref); + list_add_tail(&elm->next, pages_list); + } + if (!iopt_area_contig_done(&iter)) { + rc = -ENOENT; + goto err_free; + } + up_read(&iopt->iova_rwsem); + return 0; +err_free: + up_read(&iopt->iova_rwsem); + iopt_free_pages_list(pages_list); + return rc; +} + +static int iopt_unmap_iova_range(struct io_pagetable *iopt, unsigned long start, + unsigned long last, unsigned long *unmapped) +{ + struct iopt_area *area; + unsigned long unmapped_bytes = 0; + int rc = -ENOENT; + + /* + * The domains_rwsem must be held in read mode any time any area->pages + * is NULL. This prevents domain attach/detatch from running + * concurrently with cleaning up the area. + */ +again: + down_read(&iopt->domains_rwsem); + down_write(&iopt->iova_rwsem); + while ((area = iopt_area_iter_first(iopt, start, last))) { + unsigned long area_last = iopt_area_last_iova(area); + unsigned long area_first = iopt_area_iova(area); + struct iopt_pages *pages; + + /* Userspace should not race map/unmap's of the same area */ + if (!area->pages) { + rc = -EBUSY; + goto out_unlock_iova; + } + + if (area_first < start || area_last > last) { + rc = -ENOENT; + goto out_unlock_iova; + } + + /* + * num_accesses writers must hold the iova_rwsem too, so we can + * safely read it under the write side of the iovam_rwsem + * without the pages->mutex. + */ + if (area->num_accesses) { + start = area_first; + area->prevent_access = true; + up_write(&iopt->iova_rwsem); + up_read(&iopt->domains_rwsem); + iommufd_access_notify_unmap(iopt, area_first, + iopt_area_length(area)); + if (WARN_ON(READ_ONCE(area->num_accesses))) + return -EDEADLOCK; + goto again; + } + + pages = area->pages; + area->pages = NULL; + up_write(&iopt->iova_rwsem); + + iopt_area_unfill_domains(area, pages); + iopt_abort_area(area); + iopt_put_pages(pages); + + unmapped_bytes += area_last - area_first + 1; + + down_write(&iopt->iova_rwsem); + } + if (unmapped_bytes) + rc = 0; + +out_unlock_iova: + up_write(&iopt->iova_rwsem); + up_read(&iopt->domains_rwsem); + if (unmapped) + *unmapped = unmapped_bytes; + return rc; +} + +/** + * iopt_unmap_iova() - Remove a range of iova + * @iopt: io_pagetable to act on + * @iova: Starting iova to unmap + * @length: Number of bytes to unmap + * @unmapped: Return number of bytes unmapped + * + * The requested range must be a superset of existing ranges. + * Splitting/truncating IOVA mappings is not allowed. + */ +int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova, + unsigned long length, unsigned long *unmapped) +{ + unsigned long iova_last; + + if (!length) + return -EINVAL; + + if (check_add_overflow(iova, length - 1, &iova_last)) + return -EOVERFLOW; + + return iopt_unmap_iova_range(iopt, iova, iova_last, unmapped); +} + +int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped) +{ + int rc; + + rc = iopt_unmap_iova_range(iopt, 0, ULONG_MAX, unmapped); + /* If the IOVAs are empty then unmap all succeeds */ + if (rc == -ENOENT) + return 0; + return rc; +} + +/* The caller must always free all the nodes in the allowed_iova rb_root. */ +int iopt_set_allow_iova(struct io_pagetable *iopt, + struct rb_root_cached *allowed_iova) +{ + struct iopt_allowed *allowed; + + down_write(&iopt->iova_rwsem); + swap(*allowed_iova, iopt->allowed_itree); + + for (allowed = iopt_allowed_iter_first(iopt, 0, ULONG_MAX); allowed; + allowed = iopt_allowed_iter_next(allowed, 0, ULONG_MAX)) { + if (iopt_reserved_iter_first(iopt, allowed->node.start, + allowed->node.last)) { + swap(*allowed_iova, iopt->allowed_itree); + up_write(&iopt->iova_rwsem); + return -EADDRINUSE; + } + } + up_write(&iopt->iova_rwsem); + return 0; +} + +int iopt_reserve_iova(struct io_pagetable *iopt, unsigned long start, + unsigned long last, void *owner) +{ + struct iopt_reserved *reserved; + + lockdep_assert_held_write(&iopt->iova_rwsem); + + if (iopt_area_iter_first(iopt, start, last) || + iopt_allowed_iter_first(iopt, start, last)) + return -EADDRINUSE; + + reserved = kzalloc(sizeof(*reserved), GFP_KERNEL_ACCOUNT); + if (!reserved) + return -ENOMEM; + reserved->node.start = start; + reserved->node.last = last; + reserved->owner = owner; + interval_tree_insert(&reserved->node, &iopt->reserved_itree); + return 0; +} + +static void __iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner) +{ + struct iopt_reserved *reserved, *next; + + lockdep_assert_held_write(&iopt->iova_rwsem); + + for (reserved = iopt_reserved_iter_first(iopt, 0, ULONG_MAX); reserved; + reserved = next) { + next = iopt_reserved_iter_next(reserved, 0, ULONG_MAX); + + if (reserved->owner == owner) { + interval_tree_remove(&reserved->node, + &iopt->reserved_itree); + kfree(reserved); + } + } +} + +void iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner) +{ + down_write(&iopt->iova_rwsem); + __iopt_remove_reserved_iova(iopt, owner); + up_write(&iopt->iova_rwsem); +} + +void iopt_init_table(struct io_pagetable *iopt) +{ + init_rwsem(&iopt->iova_rwsem); + init_rwsem(&iopt->domains_rwsem); + iopt->area_itree = RB_ROOT_CACHED; + iopt->allowed_itree = RB_ROOT_CACHED; + iopt->reserved_itree = RB_ROOT_CACHED; + xa_init_flags(&iopt->domains, XA_FLAGS_ACCOUNT); + xa_init_flags(&iopt->access_list, XA_FLAGS_ALLOC); + + /* + * iopt's start as SW tables that can use the entire size_t IOVA space + * due to the use of size_t in the APIs. They have no alignment + * restriction. + */ + iopt->iova_alignment = 1; +} + +void iopt_destroy_table(struct io_pagetable *iopt) +{ + struct interval_tree_node *node; + + if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) + iopt_remove_reserved_iova(iopt, NULL); + + while ((node = interval_tree_iter_first(&iopt->allowed_itree, 0, + ULONG_MAX))) { + interval_tree_remove(node, &iopt->allowed_itree); + kfree(container_of(node, struct iopt_allowed, node)); + } + + WARN_ON(!RB_EMPTY_ROOT(&iopt->reserved_itree.rb_root)); + WARN_ON(!xa_empty(&iopt->domains)); + WARN_ON(!xa_empty(&iopt->access_list)); + WARN_ON(!RB_EMPTY_ROOT(&iopt->area_itree.rb_root)); +} + +/** + * iopt_unfill_domain() - Unfill a domain with PFNs + * @iopt: io_pagetable to act on + * @domain: domain to unfill + * + * This is used when removing a domain from the iopt. Every area in the iopt + * will be unmapped from the domain. The domain must already be removed from the + * domains xarray. + */ +static void iopt_unfill_domain(struct io_pagetable *iopt, + struct iommu_domain *domain) +{ + struct iopt_area *area; + + lockdep_assert_held(&iopt->iova_rwsem); + lockdep_assert_held_write(&iopt->domains_rwsem); + + /* + * Some other domain is holding all the pfns still, rapidly unmap this + * domain. + */ + if (iopt->next_domain_id != 0) { + /* Pick an arbitrary remaining domain to act as storage */ + struct iommu_domain *storage_domain = + xa_load(&iopt->domains, 0); + + for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area; + area = iopt_area_iter_next(area, 0, ULONG_MAX)) { + struct iopt_pages *pages = area->pages; + + if (!pages) + continue; + + mutex_lock(&pages->mutex); + if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) + WARN_ON(!area->storage_domain); + if (area->storage_domain == domain) + area->storage_domain = storage_domain; + mutex_unlock(&pages->mutex); + + iopt_area_unmap_domain(area, domain); + } + return; + } + + for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area; + area = iopt_area_iter_next(area, 0, ULONG_MAX)) { + struct iopt_pages *pages = area->pages; + + if (!pages) + continue; + + mutex_lock(&pages->mutex); + interval_tree_remove(&area->pages_node, &pages->domains_itree); + WARN_ON(area->storage_domain != domain); + area->storage_domain = NULL; + iopt_area_unfill_domain(area, pages, domain); + mutex_unlock(&pages->mutex); + } +} + +/** + * iopt_fill_domain() - Fill a domain with PFNs + * @iopt: io_pagetable to act on + * @domain: domain to fill + * + * Fill the domain with PFNs from every area in the iopt. On failure the domain + * is left unchanged. + */ +static int iopt_fill_domain(struct io_pagetable *iopt, + struct iommu_domain *domain) +{ + struct iopt_area *end_area; + struct iopt_area *area; + int rc; + + lockdep_assert_held(&iopt->iova_rwsem); + lockdep_assert_held_write(&iopt->domains_rwsem); + + for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area; + area = iopt_area_iter_next(area, 0, ULONG_MAX)) { + struct iopt_pages *pages = area->pages; + + if (!pages) + continue; + + mutex_lock(&pages->mutex); + rc = iopt_area_fill_domain(area, domain); + if (rc) { + mutex_unlock(&pages->mutex); + goto out_unfill; + } + if (!area->storage_domain) { + WARN_ON(iopt->next_domain_id != 0); + area->storage_domain = domain; + interval_tree_insert(&area->pages_node, + &pages->domains_itree); + } + mutex_unlock(&pages->mutex); + } + return 0; + +out_unfill: + end_area = area; + for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area; + area = iopt_area_iter_next(area, 0, ULONG_MAX)) { + struct iopt_pages *pages = area->pages; + + if (area == end_area) + break; + if (!pages) + continue; + mutex_lock(&pages->mutex); + if (iopt->next_domain_id == 0) { + interval_tree_remove(&area->pages_node, + &pages->domains_itree); + area->storage_domain = NULL; + } + iopt_area_unfill_domain(area, pages, domain); + mutex_unlock(&pages->mutex); + } + return rc; +} + +/* All existing area's conform to an increased page size */ +static int iopt_check_iova_alignment(struct io_pagetable *iopt, + unsigned long new_iova_alignment) +{ + unsigned long align_mask = new_iova_alignment - 1; + struct iopt_area *area; + + lockdep_assert_held(&iopt->iova_rwsem); + lockdep_assert_held(&iopt->domains_rwsem); + + for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area; + area = iopt_area_iter_next(area, 0, ULONG_MAX)) + if ((iopt_area_iova(area) & align_mask) || + (iopt_area_length(area) & align_mask) || + (area->page_offset & align_mask)) + return -EADDRINUSE; + + if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) { + struct iommufd_access *access; + unsigned long index; + + xa_for_each(&iopt->access_list, index, access) + if (WARN_ON(access->iova_alignment > + new_iova_alignment)) + return -EADDRINUSE; + } + return 0; +} + +int iopt_table_add_domain(struct io_pagetable *iopt, + struct iommu_domain *domain) +{ + const struct iommu_domain_geometry *geometry = &domain->geometry; + struct iommu_domain *iter_domain; + unsigned int new_iova_alignment; + unsigned long index; + int rc; + + down_write(&iopt->domains_rwsem); + down_write(&iopt->iova_rwsem); + + xa_for_each(&iopt->domains, index, iter_domain) { + if (WARN_ON(iter_domain == domain)) { + rc = -EEXIST; + goto out_unlock; + } + } + + /* + * The io page size drives the iova_alignment. Internally the iopt_pages + * works in PAGE_SIZE units and we adjust when mapping sub-PAGE_SIZE + * objects into the iommu_domain. + * + * A iommu_domain must always be able to accept PAGE_SIZE to be + * compatible as we can't guarantee higher contiguity. + */ + new_iova_alignment = max_t(unsigned long, + 1UL << __ffs(domain->pgsize_bitmap), + iopt->iova_alignment); + if (new_iova_alignment > PAGE_SIZE) { + rc = -EINVAL; + goto out_unlock; + } + if (new_iova_alignment != iopt->iova_alignment) { + rc = iopt_check_iova_alignment(iopt, new_iova_alignment); + if (rc) + goto out_unlock; + } + + /* No area exists that is outside the allowed domain aperture */ + if (geometry->aperture_start != 0) { + rc = iopt_reserve_iova(iopt, 0, geometry->aperture_start - 1, + domain); + if (rc) + goto out_reserved; + } + if (geometry->aperture_end != ULONG_MAX) { + rc = iopt_reserve_iova(iopt, geometry->aperture_end + 1, + ULONG_MAX, domain); + if (rc) + goto out_reserved; + } + + rc = xa_reserve(&iopt->domains, iopt->next_domain_id, GFP_KERNEL); + if (rc) + goto out_reserved; + + rc = iopt_fill_domain(iopt, domain); + if (rc) + goto out_release; + + iopt->iova_alignment = new_iova_alignment; + xa_store(&iopt->domains, iopt->next_domain_id, domain, GFP_KERNEL); + iopt->next_domain_id++; + up_write(&iopt->iova_rwsem); + up_write(&iopt->domains_rwsem); + return 0; +out_release: + xa_release(&iopt->domains, iopt->next_domain_id); +out_reserved: + __iopt_remove_reserved_iova(iopt, domain); +out_unlock: + up_write(&iopt->iova_rwsem); + up_write(&iopt->domains_rwsem); + return rc; +} + +static int iopt_calculate_iova_alignment(struct io_pagetable *iopt) +{ + unsigned long new_iova_alignment; + struct iommufd_access *access; + struct iommu_domain *domain; + unsigned long index; + + lockdep_assert_held_write(&iopt->iova_rwsem); + lockdep_assert_held(&iopt->domains_rwsem); + + /* See batch_iommu_map_small() */ + if (iopt->disable_large_pages) + new_iova_alignment = PAGE_SIZE; + else + new_iova_alignment = 1; + + xa_for_each(&iopt->domains, index, domain) + new_iova_alignment = max_t(unsigned long, + 1UL << __ffs(domain->pgsize_bitmap), + new_iova_alignment); + xa_for_each(&iopt->access_list, index, access) + new_iova_alignment = max_t(unsigned long, + access->iova_alignment, + new_iova_alignment); + + if (new_iova_alignment > iopt->iova_alignment) { + int rc; + + rc = iopt_check_iova_alignment(iopt, new_iova_alignment); + if (rc) + return rc; + } + iopt->iova_alignment = new_iova_alignment; + return 0; +} + +void iopt_table_remove_domain(struct io_pagetable *iopt, + struct iommu_domain *domain) +{ + struct iommu_domain *iter_domain = NULL; + unsigned long index; + + down_write(&iopt->domains_rwsem); + down_write(&iopt->iova_rwsem); + + xa_for_each(&iopt->domains, index, iter_domain) + if (iter_domain == domain) + break; + if (WARN_ON(iter_domain != domain) || index >= iopt->next_domain_id) + goto out_unlock; + + /* + * Compress the xarray to keep it linear by swapping the entry to erase + * with the tail entry and shrinking the tail. + */ + iopt->next_domain_id--; + iter_domain = xa_erase(&iopt->domains, iopt->next_domain_id); + if (index != iopt->next_domain_id) + xa_store(&iopt->domains, index, iter_domain, GFP_KERNEL); + + iopt_unfill_domain(iopt, domain); + __iopt_remove_reserved_iova(iopt, domain); + + WARN_ON(iopt_calculate_iova_alignment(iopt)); +out_unlock: + up_write(&iopt->iova_rwsem); + up_write(&iopt->domains_rwsem); +} + +/** + * iopt_area_split - Split an area into two parts at iova + * @area: The area to split + * @iova: Becomes the last of a new area + * + * This splits an area into two. It is part of the VFIO compatibility to allow + * poking a hole in the mapping. The two areas continue to point at the same + * iopt_pages, just with different starting bytes. + */ +static int iopt_area_split(struct iopt_area *area, unsigned long iova) +{ + unsigned long alignment = area->iopt->iova_alignment; + unsigned long last_iova = iopt_area_last_iova(area); + unsigned long start_iova = iopt_area_iova(area); + unsigned long new_start = iova + 1; + struct io_pagetable *iopt = area->iopt; + struct iopt_pages *pages = area->pages; + struct iopt_area *lhs; + struct iopt_area *rhs; + int rc; + + lockdep_assert_held_write(&iopt->iova_rwsem); + + if (iova == start_iova || iova == last_iova) + return 0; + + if (!pages || area->prevent_access) + return -EBUSY; + + if (new_start & (alignment - 1) || + iopt_area_start_byte(area, new_start) & (alignment - 1)) + return -EINVAL; + + lhs = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT); + if (!lhs) + return -ENOMEM; + + rhs = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT); + if (!rhs) { + rc = -ENOMEM; + goto err_free_lhs; + } + + mutex_lock(&pages->mutex); + /* + * Splitting is not permitted if an access exists, we don't track enough + * information to split existing accesses. + */ + if (area->num_accesses) { + rc = -EINVAL; + goto err_unlock; + } + + /* + * Splitting is not permitted if a domain could have been mapped with + * huge pages. + */ + if (area->storage_domain && !iopt->disable_large_pages) { + rc = -EINVAL; + goto err_unlock; + } + + interval_tree_remove(&area->node, &iopt->area_itree); + rc = iopt_insert_area(iopt, lhs, area->pages, start_iova, + iopt_area_start_byte(area, start_iova), + (new_start - 1) - start_iova + 1, + area->iommu_prot); + if (WARN_ON(rc)) + goto err_insert; + + rc = iopt_insert_area(iopt, rhs, area->pages, new_start, + iopt_area_start_byte(area, new_start), + last_iova - new_start + 1, area->iommu_prot); + if (WARN_ON(rc)) + goto err_remove_lhs; + + lhs->storage_domain = area->storage_domain; + lhs->pages = area->pages; + rhs->storage_domain = area->storage_domain; + rhs->pages = area->pages; + kref_get(&rhs->pages->kref); + kfree(area); + mutex_unlock(&pages->mutex); + + /* + * No change to domains or accesses because the pages hasn't been + * changed + */ + return 0; + +err_remove_lhs: + interval_tree_remove(&lhs->node, &iopt->area_itree); +err_insert: + interval_tree_insert(&area->node, &iopt->area_itree); +err_unlock: + mutex_unlock(&pages->mutex); + kfree(rhs); +err_free_lhs: + kfree(lhs); + return rc; +} + +int iopt_cut_iova(struct io_pagetable *iopt, unsigned long *iovas, + size_t num_iovas) +{ + int rc = 0; + int i; + + down_write(&iopt->iova_rwsem); + for (i = 0; i < num_iovas; i++) { + struct iopt_area *area; + + area = iopt_area_iter_first(iopt, iovas[i], iovas[i]); + if (!area) + continue; + rc = iopt_area_split(area, iovas[i]); + if (rc) + break; + } + up_write(&iopt->iova_rwsem); + return rc; +} + +void iopt_enable_large_pages(struct io_pagetable *iopt) +{ + int rc; + + down_write(&iopt->domains_rwsem); + down_write(&iopt->iova_rwsem); + WRITE_ONCE(iopt->disable_large_pages, false); + rc = iopt_calculate_iova_alignment(iopt); + WARN_ON(rc); + up_write(&iopt->iova_rwsem); + up_write(&iopt->domains_rwsem); +} + +int iopt_disable_large_pages(struct io_pagetable *iopt) +{ + int rc = 0; + + down_write(&iopt->domains_rwsem); + down_write(&iopt->iova_rwsem); + if (iopt->disable_large_pages) + goto out_unlock; + + /* Won't do it if domains already have pages mapped in them */ + if (!xa_empty(&iopt->domains) && + !RB_EMPTY_ROOT(&iopt->area_itree.rb_root)) { + rc = -EINVAL; + goto out_unlock; + } + + WRITE_ONCE(iopt->disable_large_pages, true); + rc = iopt_calculate_iova_alignment(iopt); + if (rc) + WRITE_ONCE(iopt->disable_large_pages, false); +out_unlock: + up_write(&iopt->iova_rwsem); + up_write(&iopt->domains_rwsem); + return rc; +} + +int iopt_add_access(struct io_pagetable *iopt, struct iommufd_access *access) +{ + int rc; + + down_write(&iopt->domains_rwsem); + down_write(&iopt->iova_rwsem); + rc = xa_alloc(&iopt->access_list, &access->iopt_access_list_id, access, + xa_limit_16b, GFP_KERNEL_ACCOUNT); + if (rc) + goto out_unlock; + + rc = iopt_calculate_iova_alignment(iopt); + if (rc) { + xa_erase(&iopt->access_list, access->iopt_access_list_id); + goto out_unlock; + } + +out_unlock: + up_write(&iopt->iova_rwsem); + up_write(&iopt->domains_rwsem); + return rc; +} + +void iopt_remove_access(struct io_pagetable *iopt, + struct iommufd_access *access) +{ + down_write(&iopt->domains_rwsem); + down_write(&iopt->iova_rwsem); + WARN_ON(xa_erase(&iopt->access_list, access->iopt_access_list_id) != + access); + WARN_ON(iopt_calculate_iova_alignment(iopt)); + up_write(&iopt->iova_rwsem); + up_write(&iopt->domains_rwsem); +} + +/* Narrow the valid_iova_itree to include reserved ranges from a group. */ +int iopt_table_enforce_group_resv_regions(struct io_pagetable *iopt, + struct device *device, + struct iommu_group *group, + phys_addr_t *sw_msi_start) +{ + struct iommu_resv_region *resv; + struct iommu_resv_region *tmp; + LIST_HEAD(group_resv_regions); + unsigned int num_hw_msi = 0; + unsigned int num_sw_msi = 0; + int rc; + + down_write(&iopt->iova_rwsem); + rc = iommu_get_group_resv_regions(group, &group_resv_regions); + if (rc) + goto out_unlock; + + list_for_each_entry(resv, &group_resv_regions, list) { + if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE) + continue; + + if (sw_msi_start && resv->type == IOMMU_RESV_MSI) + num_hw_msi++; + if (sw_msi_start && resv->type == IOMMU_RESV_SW_MSI) { + *sw_msi_start = resv->start; + num_sw_msi++; + } + + rc = iopt_reserve_iova(iopt, resv->start, + resv->length - 1 + resv->start, device); + if (rc) + goto out_reserved; + } + + /* Drivers must offer sane combinations of regions */ + if (WARN_ON(num_sw_msi && num_hw_msi) || WARN_ON(num_sw_msi > 1)) { + rc = -EINVAL; + goto out_reserved; + } + + rc = 0; + goto out_free_resv; + +out_reserved: + __iopt_remove_reserved_iova(iopt, device); +out_free_resv: + list_for_each_entry_safe(resv, tmp, &group_resv_regions, list) + kfree(resv); +out_unlock: + up_write(&iopt->iova_rwsem); + return rc; +} diff --git a/drivers/iommu/iommufd/io_pagetable.h b/drivers/iommu/iommufd/io_pagetable.h new file mode 100644 index 000000000000..0ec3509b7e33 --- /dev/null +++ b/drivers/iommu/iommufd/io_pagetable.h @@ -0,0 +1,241 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. + * + */ +#ifndef __IO_PAGETABLE_H +#define __IO_PAGETABLE_H + +#include <linux/interval_tree.h> +#include <linux/mutex.h> +#include <linux/kref.h> +#include <linux/xarray.h> + +#include "iommufd_private.h" + +struct iommu_domain; + +/* + * Each io_pagetable is composed of intervals of areas which cover regions of + * the iova that are backed by something. iova not covered by areas is not + * populated in the page table. Each area is fully populated with pages. + * + * iovas are in byte units, but must be iopt->iova_alignment aligned. + * + * pages can be NULL, this means some other thread is still working on setting + * up or tearing down the area. When observed under the write side of the + * domain_rwsem a NULL pages must mean the area is still being setup and no + * domains are filled. + * + * storage_domain points at an arbitrary iommu_domain that is holding the PFNs + * for this area. It is locked by the pages->mutex. This simplifies the locking + * as the pages code can rely on the storage_domain without having to get the + * iopt->domains_rwsem. + * + * The io_pagetable::iova_rwsem protects node + * The iopt_pages::mutex protects pages_node + * iopt and iommu_prot are immutable + * The pages::mutex protects num_accesses + */ +struct iopt_area { + struct interval_tree_node node; + struct interval_tree_node pages_node; + struct io_pagetable *iopt; + struct iopt_pages *pages; + struct iommu_domain *storage_domain; + /* How many bytes into the first page the area starts */ + unsigned int page_offset; + /* IOMMU_READ, IOMMU_WRITE, etc */ + int iommu_prot; + bool prevent_access : 1; + unsigned int num_accesses; +}; + +struct iopt_allowed { + struct interval_tree_node node; +}; + +struct iopt_reserved { + struct interval_tree_node node; + void *owner; +}; + +int iopt_area_fill_domains(struct iopt_area *area, struct iopt_pages *pages); +void iopt_area_unfill_domains(struct iopt_area *area, struct iopt_pages *pages); + +int iopt_area_fill_domain(struct iopt_area *area, struct iommu_domain *domain); +void iopt_area_unfill_domain(struct iopt_area *area, struct iopt_pages *pages, + struct iommu_domain *domain); +void iopt_area_unmap_domain(struct iopt_area *area, + struct iommu_domain *domain); + +static inline unsigned long iopt_area_index(struct iopt_area *area) +{ + return area->pages_node.start; +} + +static inline unsigned long iopt_area_last_index(struct iopt_area *area) +{ + return area->pages_node.last; +} + +static inline unsigned long iopt_area_iova(struct iopt_area *area) +{ + return area->node.start; +} + +static inline unsigned long iopt_area_last_iova(struct iopt_area *area) +{ + return area->node.last; +} + +static inline size_t iopt_area_length(struct iopt_area *area) +{ + return (area->node.last - area->node.start) + 1; +} + +/* + * Number of bytes from the start of the iopt_pages that the iova begins. + * iopt_area_start_byte() / PAGE_SIZE encodes the starting page index + * iopt_area_start_byte() % PAGE_SIZE encodes the offset within that page + */ +static inline unsigned long iopt_area_start_byte(struct iopt_area *area, + unsigned long iova) +{ + if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) + WARN_ON(iova < iopt_area_iova(area) || + iova > iopt_area_last_iova(area)); + return (iova - iopt_area_iova(area)) + area->page_offset + + iopt_area_index(area) * PAGE_SIZE; +} + +static inline unsigned long iopt_area_iova_to_index(struct iopt_area *area, + unsigned long iova) +{ + return iopt_area_start_byte(area, iova) / PAGE_SIZE; +} + +#define __make_iopt_iter(name) \ + static inline struct iopt_##name *iopt_##name##_iter_first( \ + struct io_pagetable *iopt, unsigned long start, \ + unsigned long last) \ + { \ + struct interval_tree_node *node; \ + \ + lockdep_assert_held(&iopt->iova_rwsem); \ + node = interval_tree_iter_first(&iopt->name##_itree, start, \ + last); \ + if (!node) \ + return NULL; \ + return container_of(node, struct iopt_##name, node); \ + } \ + static inline struct iopt_##name *iopt_##name##_iter_next( \ + struct iopt_##name *last_node, unsigned long start, \ + unsigned long last) \ + { \ + struct interval_tree_node *node; \ + \ + node = interval_tree_iter_next(&last_node->node, start, last); \ + if (!node) \ + return NULL; \ + return container_of(node, struct iopt_##name, node); \ + } + +__make_iopt_iter(area) +__make_iopt_iter(allowed) +__make_iopt_iter(reserved) + +struct iopt_area_contig_iter { + unsigned long cur_iova; + unsigned long last_iova; + struct iopt_area *area; +}; +struct iopt_area *iopt_area_contig_init(struct iopt_area_contig_iter *iter, + struct io_pagetable *iopt, + unsigned long iova, + unsigned long last_iova); +struct iopt_area *iopt_area_contig_next(struct iopt_area_contig_iter *iter); + +static inline bool iopt_area_contig_done(struct iopt_area_contig_iter *iter) +{ + return iter->area && iter->last_iova <= iopt_area_last_iova(iter->area); +} + +/* + * Iterate over a contiguous list of areas that span the iova,last_iova range. + * The caller must check iopt_area_contig_done() after the loop to see if + * contiguous areas existed. + */ +#define iopt_for_each_contig_area(iter, area, iopt, iova, last_iova) \ + for (area = iopt_area_contig_init(iter, iopt, iova, last_iova); area; \ + area = iopt_area_contig_next(iter)) + +enum { + IOPT_PAGES_ACCOUNT_NONE = 0, + IOPT_PAGES_ACCOUNT_USER = 1, + IOPT_PAGES_ACCOUNT_MM = 2, +}; + +/* + * This holds a pinned page list for multiple areas of IO address space. The + * pages always originate from a linear chunk of userspace VA. Multiple + * io_pagetable's, through their iopt_area's, can share a single iopt_pages + * which avoids multi-pinning and double accounting of page consumption. + * + * indexes in this structure are measured in PAGE_SIZE units, are 0 based from + * the start of the uptr and extend to npages. pages are pinned dynamically + * according to the intervals in the access_itree and domains_itree, npinned + * records the current number of pages pinned. + */ +struct iopt_pages { + struct kref kref; + struct mutex mutex; + size_t npages; + size_t npinned; + size_t last_npinned; + struct task_struct *source_task; + struct mm_struct *source_mm; + struct user_struct *source_user; + void __user *uptr; + bool writable:1; + u8 account_mode; + + struct xarray pinned_pfns; + /* Of iopt_pages_access::node */ + struct rb_root_cached access_itree; + /* Of iopt_area::pages_node */ + struct rb_root_cached domains_itree; +}; + +struct iopt_pages *iopt_alloc_pages(void __user *uptr, unsigned long length, + bool writable); +void iopt_release_pages(struct kref *kref); +static inline void iopt_put_pages(struct iopt_pages *pages) +{ + kref_put(&pages->kref, iopt_release_pages); +} + +void iopt_pages_fill_from_xarray(struct iopt_pages *pages, unsigned long start, + unsigned long last, struct page **out_pages); +int iopt_pages_fill_xarray(struct iopt_pages *pages, unsigned long start, + unsigned long last, struct page **out_pages); +void iopt_pages_unfill_xarray(struct iopt_pages *pages, unsigned long start, + unsigned long last); + +int iopt_area_add_access(struct iopt_area *area, unsigned long start, + unsigned long last, struct page **out_pages, + unsigned int flags); +void iopt_area_remove_access(struct iopt_area *area, unsigned long start, + unsigned long last); +int iopt_pages_rw_access(struct iopt_pages *pages, unsigned long start_byte, + void *data, unsigned long length, unsigned int flags); + +/* + * Each interval represents an active iopt_access_pages(), it acts as an + * interval lock that keeps the PFNs pinned and stored in the xarray. + */ +struct iopt_pages_access { + struct interval_tree_node node; + unsigned int users; +}; + +#endif diff --git a/drivers/iommu/iommufd/ioas.c b/drivers/iommu/iommufd/ioas.c new file mode 100644 index 000000000000..31577e9d434f --- /dev/null +++ b/drivers/iommu/iommufd/ioas.c @@ -0,0 +1,398 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES + */ +#include <linux/interval_tree.h> +#include <linux/iommufd.h> +#include <linux/iommu.h> +#include <uapi/linux/iommufd.h> + +#include "io_pagetable.h" + +void iommufd_ioas_destroy(struct iommufd_object *obj) +{ + struct iommufd_ioas *ioas = container_of(obj, struct iommufd_ioas, obj); + int rc; + + rc = iopt_unmap_all(&ioas->iopt, NULL); + WARN_ON(rc && rc != -ENOENT); + iopt_destroy_table(&ioas->iopt); + mutex_destroy(&ioas->mutex); +} + +struct iommufd_ioas *iommufd_ioas_alloc(struct iommufd_ctx *ictx) +{ + struct iommufd_ioas *ioas; + + ioas = iommufd_object_alloc(ictx, ioas, IOMMUFD_OBJ_IOAS); + if (IS_ERR(ioas)) + return ioas; + + iopt_init_table(&ioas->iopt); + INIT_LIST_HEAD(&ioas->hwpt_list); + mutex_init(&ioas->mutex); + return ioas; +} + +int iommufd_ioas_alloc_ioctl(struct iommufd_ucmd *ucmd) +{ + struct iommu_ioas_alloc *cmd = ucmd->cmd; + struct iommufd_ioas *ioas; + int rc; + + if (cmd->flags) + return -EOPNOTSUPP; + + ioas = iommufd_ioas_alloc(ucmd->ictx); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + + cmd->out_ioas_id = ioas->obj.id; + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); + if (rc) + goto out_table; + iommufd_object_finalize(ucmd->ictx, &ioas->obj); + return 0; + +out_table: + iommufd_object_abort_and_destroy(ucmd->ictx, &ioas->obj); + return rc; +} + +int iommufd_ioas_iova_ranges(struct iommufd_ucmd *ucmd) +{ + struct iommu_iova_range __user *ranges; + struct iommu_ioas_iova_ranges *cmd = ucmd->cmd; + struct iommufd_ioas *ioas; + struct interval_tree_span_iter span; + u32 max_iovas; + int rc; + + if (cmd->__reserved) + return -EOPNOTSUPP; + + ioas = iommufd_get_ioas(ucmd, cmd->ioas_id); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + + down_read(&ioas->iopt.iova_rwsem); + max_iovas = cmd->num_iovas; + ranges = u64_to_user_ptr(cmd->allowed_iovas); + cmd->num_iovas = 0; + cmd->out_iova_alignment = ioas->iopt.iova_alignment; + interval_tree_for_each_span(&span, &ioas->iopt.reserved_itree, 0, + ULONG_MAX) { + if (!span.is_hole) + continue; + if (cmd->num_iovas < max_iovas) { + struct iommu_iova_range elm = { + .start = span.start_hole, + .last = span.last_hole, + }; + + if (copy_to_user(&ranges[cmd->num_iovas], &elm, + sizeof(elm))) { + rc = -EFAULT; + goto out_put; + } + } + cmd->num_iovas++; + } + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); + if (rc) + goto out_put; + if (cmd->num_iovas > max_iovas) + rc = -EMSGSIZE; +out_put: + up_read(&ioas->iopt.iova_rwsem); + iommufd_put_object(&ioas->obj); + return rc; +} + +static int iommufd_ioas_load_iovas(struct rb_root_cached *itree, + struct iommu_iova_range __user *ranges, + u32 num) +{ + u32 i; + + for (i = 0; i != num; i++) { + struct iommu_iova_range range; + struct iopt_allowed *allowed; + + if (copy_from_user(&range, ranges + i, sizeof(range))) + return -EFAULT; + + if (range.start >= range.last) + return -EINVAL; + + if (interval_tree_iter_first(itree, range.start, range.last)) + return -EINVAL; + + allowed = kzalloc(sizeof(*allowed), GFP_KERNEL_ACCOUNT); + if (!allowed) + return -ENOMEM; + allowed->node.start = range.start; + allowed->node.last = range.last; + + interval_tree_insert(&allowed->node, itree); + } + return 0; +} + +int iommufd_ioas_allow_iovas(struct iommufd_ucmd *ucmd) +{ + struct iommu_ioas_allow_iovas *cmd = ucmd->cmd; + struct rb_root_cached allowed_iova = RB_ROOT_CACHED; + struct interval_tree_node *node; + struct iommufd_ioas *ioas; + struct io_pagetable *iopt; + int rc = 0; + + if (cmd->__reserved) + return -EOPNOTSUPP; + + ioas = iommufd_get_ioas(ucmd, cmd->ioas_id); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + iopt = &ioas->iopt; + + rc = iommufd_ioas_load_iovas(&allowed_iova, + u64_to_user_ptr(cmd->allowed_iovas), + cmd->num_iovas); + if (rc) + goto out_free; + + /* + * We want the allowed tree update to be atomic, so we have to keep the + * original nodes around, and keep track of the new nodes as we allocate + * memory for them. The simplest solution is to have a new/old tree and + * then swap new for old. On success we free the old tree, on failure we + * free the new tree. + */ + rc = iopt_set_allow_iova(iopt, &allowed_iova); +out_free: + while ((node = interval_tree_iter_first(&allowed_iova, 0, ULONG_MAX))) { + interval_tree_remove(node, &allowed_iova); + kfree(container_of(node, struct iopt_allowed, node)); + } + iommufd_put_object(&ioas->obj); + return rc; +} + +static int conv_iommu_prot(u32 map_flags) +{ + /* + * We provide no manual cache coherency ioctls to userspace and most + * architectures make the CPU ops for cache flushing privileged. + * Therefore we require the underlying IOMMU to support CPU coherent + * operation. Support for IOMMU_CACHE is enforced by the + * IOMMU_CAP_CACHE_COHERENCY test during bind. + */ + int iommu_prot = IOMMU_CACHE; + + if (map_flags & IOMMU_IOAS_MAP_WRITEABLE) + iommu_prot |= IOMMU_WRITE; + if (map_flags & IOMMU_IOAS_MAP_READABLE) + iommu_prot |= IOMMU_READ; + return iommu_prot; +} + +int iommufd_ioas_map(struct iommufd_ucmd *ucmd) +{ + struct iommu_ioas_map *cmd = ucmd->cmd; + unsigned long iova = cmd->iova; + struct iommufd_ioas *ioas; + unsigned int flags = 0; + int rc; + + if ((cmd->flags & + ~(IOMMU_IOAS_MAP_FIXED_IOVA | IOMMU_IOAS_MAP_WRITEABLE | + IOMMU_IOAS_MAP_READABLE)) || + cmd->__reserved) + return -EOPNOTSUPP; + if (cmd->iova >= ULONG_MAX || cmd->length >= ULONG_MAX) + return -EOVERFLOW; + + ioas = iommufd_get_ioas(ucmd, cmd->ioas_id); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + + if (!(cmd->flags & IOMMU_IOAS_MAP_FIXED_IOVA)) + flags = IOPT_ALLOC_IOVA; + rc = iopt_map_user_pages(ucmd->ictx, &ioas->iopt, &iova, + u64_to_user_ptr(cmd->user_va), cmd->length, + conv_iommu_prot(cmd->flags), flags); + if (rc) + goto out_put; + + cmd->iova = iova; + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); +out_put: + iommufd_put_object(&ioas->obj); + return rc; +} + +int iommufd_ioas_copy(struct iommufd_ucmd *ucmd) +{ + struct iommu_ioas_copy *cmd = ucmd->cmd; + struct iommufd_ioas *src_ioas; + struct iommufd_ioas *dst_ioas; + unsigned int flags = 0; + LIST_HEAD(pages_list); + unsigned long iova; + int rc; + + iommufd_test_syz_conv_iova_id(ucmd, cmd->src_ioas_id, &cmd->src_iova, + &cmd->flags); + + if ((cmd->flags & + ~(IOMMU_IOAS_MAP_FIXED_IOVA | IOMMU_IOAS_MAP_WRITEABLE | + IOMMU_IOAS_MAP_READABLE))) + return -EOPNOTSUPP; + if (cmd->length >= ULONG_MAX || cmd->src_iova >= ULONG_MAX || + cmd->dst_iova >= ULONG_MAX) + return -EOVERFLOW; + + src_ioas = iommufd_get_ioas(ucmd, cmd->src_ioas_id); + if (IS_ERR(src_ioas)) + return PTR_ERR(src_ioas); + rc = iopt_get_pages(&src_ioas->iopt, cmd->src_iova, cmd->length, + &pages_list); + iommufd_put_object(&src_ioas->obj); + if (rc) + return rc; + + dst_ioas = iommufd_get_ioas(ucmd, cmd->dst_ioas_id); + if (IS_ERR(dst_ioas)) { + rc = PTR_ERR(dst_ioas); + goto out_pages; + } + + if (!(cmd->flags & IOMMU_IOAS_MAP_FIXED_IOVA)) + flags = IOPT_ALLOC_IOVA; + iova = cmd->dst_iova; + rc = iopt_map_pages(&dst_ioas->iopt, &pages_list, cmd->length, &iova, + conv_iommu_prot(cmd->flags), flags); + if (rc) + goto out_put_dst; + + cmd->dst_iova = iova; + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); +out_put_dst: + iommufd_put_object(&dst_ioas->obj); +out_pages: + iopt_free_pages_list(&pages_list); + return rc; +} + +int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd) +{ + struct iommu_ioas_unmap *cmd = ucmd->cmd; + struct iommufd_ioas *ioas; + unsigned long unmapped = 0; + int rc; + + ioas = iommufd_get_ioas(ucmd, cmd->ioas_id); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + + if (cmd->iova == 0 && cmd->length == U64_MAX) { + rc = iopt_unmap_all(&ioas->iopt, &unmapped); + if (rc) + goto out_put; + } else { + if (cmd->iova >= ULONG_MAX || cmd->length >= ULONG_MAX) { + rc = -EOVERFLOW; + goto out_put; + } + rc = iopt_unmap_iova(&ioas->iopt, cmd->iova, cmd->length, + &unmapped); + if (rc) + goto out_put; + } + + cmd->length = unmapped; + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); + +out_put: + iommufd_put_object(&ioas->obj); + return rc; +} + +int iommufd_option_rlimit_mode(struct iommu_option *cmd, + struct iommufd_ctx *ictx) +{ + if (cmd->object_id) + return -EOPNOTSUPP; + + if (cmd->op == IOMMU_OPTION_OP_GET) { + cmd->val64 = ictx->account_mode == IOPT_PAGES_ACCOUNT_MM; + return 0; + } + if (cmd->op == IOMMU_OPTION_OP_SET) { + int rc = 0; + + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; + + xa_lock(&ictx->objects); + if (!xa_empty(&ictx->objects)) { + rc = -EBUSY; + } else { + if (cmd->val64 == 0) + ictx->account_mode = IOPT_PAGES_ACCOUNT_USER; + else if (cmd->val64 == 1) + ictx->account_mode = IOPT_PAGES_ACCOUNT_MM; + else + rc = -EINVAL; + } + xa_unlock(&ictx->objects); + + return rc; + } + return -EOPNOTSUPP; +} + +static int iommufd_ioas_option_huge_pages(struct iommu_option *cmd, + struct iommufd_ioas *ioas) +{ + if (cmd->op == IOMMU_OPTION_OP_GET) { + cmd->val64 = !ioas->iopt.disable_large_pages; + return 0; + } + if (cmd->op == IOMMU_OPTION_OP_SET) { + if (cmd->val64 == 0) + return iopt_disable_large_pages(&ioas->iopt); + if (cmd->val64 == 1) { + iopt_enable_large_pages(&ioas->iopt); + return 0; + } + return -EINVAL; + } + return -EOPNOTSUPP; +} + +int iommufd_ioas_option(struct iommufd_ucmd *ucmd) +{ + struct iommu_option *cmd = ucmd->cmd; + struct iommufd_ioas *ioas; + int rc = 0; + + if (cmd->__reserved) + return -EOPNOTSUPP; + + ioas = iommufd_get_ioas(ucmd, cmd->object_id); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + + switch (cmd->option_id) { + case IOMMU_OPTION_HUGE_PAGES: + rc = iommufd_ioas_option_huge_pages(cmd, ioas); + break; + default: + rc = -EOPNOTSUPP; + } + + iommufd_put_object(&ioas->obj); + return rc; +} diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h new file mode 100644 index 000000000000..222e86591f8a --- /dev/null +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -0,0 +1,307 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES + */ +#ifndef __IOMMUFD_PRIVATE_H +#define __IOMMUFD_PRIVATE_H + +#include <linux/rwsem.h> +#include <linux/xarray.h> +#include <linux/refcount.h> +#include <linux/uaccess.h> + +struct iommu_domain; +struct iommu_group; +struct iommu_option; + +struct iommufd_ctx { + struct file *file; + struct xarray objects; + + u8 account_mode; + struct iommufd_ioas *vfio_ioas; +}; + +/* + * The IOVA to PFN map. The map automatically copies the PFNs into multiple + * domains and permits sharing of PFNs between io_pagetable instances. This + * supports both a design where IOAS's are 1:1 with a domain (eg because the + * domain is HW customized), or where the IOAS is 1:N with multiple generic + * domains. The io_pagetable holds an interval tree of iopt_areas which point + * to shared iopt_pages which hold the pfns mapped to the page table. + * + * The locking order is domains_rwsem -> iova_rwsem -> pages::mutex + */ +struct io_pagetable { + struct rw_semaphore domains_rwsem; + struct xarray domains; + struct xarray access_list; + unsigned int next_domain_id; + + struct rw_semaphore iova_rwsem; + struct rb_root_cached area_itree; + /* IOVA that cannot become reserved, struct iopt_allowed */ + struct rb_root_cached allowed_itree; + /* IOVA that cannot be allocated, struct iopt_reserved */ + struct rb_root_cached reserved_itree; + u8 disable_large_pages; + unsigned long iova_alignment; +}; + +void iopt_init_table(struct io_pagetable *iopt); +void iopt_destroy_table(struct io_pagetable *iopt); +int iopt_get_pages(struct io_pagetable *iopt, unsigned long iova, + unsigned long length, struct list_head *pages_list); +void iopt_free_pages_list(struct list_head *pages_list); +enum { + IOPT_ALLOC_IOVA = 1 << 0, +}; +int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt, + unsigned long *iova, void __user *uptr, + unsigned long length, int iommu_prot, + unsigned int flags); +int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list, + unsigned long length, unsigned long *dst_iova, + int iommu_prot, unsigned int flags); +int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova, + unsigned long length, unsigned long *unmapped); +int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped); + +void iommufd_access_notify_unmap(struct io_pagetable *iopt, unsigned long iova, + unsigned long length); +int iopt_table_add_domain(struct io_pagetable *iopt, + struct iommu_domain *domain); +void iopt_table_remove_domain(struct io_pagetable *iopt, + struct iommu_domain *domain); +int iopt_table_enforce_group_resv_regions(struct io_pagetable *iopt, + struct device *device, + struct iommu_group *group, + phys_addr_t *sw_msi_start); +int iopt_set_allow_iova(struct io_pagetable *iopt, + struct rb_root_cached *allowed_iova); +int iopt_reserve_iova(struct io_pagetable *iopt, unsigned long start, + unsigned long last, void *owner); +void iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner); +int iopt_cut_iova(struct io_pagetable *iopt, unsigned long *iovas, + size_t num_iovas); +void iopt_enable_large_pages(struct io_pagetable *iopt); +int iopt_disable_large_pages(struct io_pagetable *iopt); + +struct iommufd_ucmd { + struct iommufd_ctx *ictx; + void __user *ubuffer; + u32 user_size; + void *cmd; +}; + +int iommufd_vfio_ioctl(struct iommufd_ctx *ictx, unsigned int cmd, + unsigned long arg); + +/* Copy the response in ucmd->cmd back to userspace. */ +static inline int iommufd_ucmd_respond(struct iommufd_ucmd *ucmd, + size_t cmd_len) +{ + if (copy_to_user(ucmd->ubuffer, ucmd->cmd, + min_t(size_t, ucmd->user_size, cmd_len))) + return -EFAULT; + return 0; +} + +enum iommufd_object_type { + IOMMUFD_OBJ_NONE, + IOMMUFD_OBJ_ANY = IOMMUFD_OBJ_NONE, + IOMMUFD_OBJ_DEVICE, + IOMMUFD_OBJ_HW_PAGETABLE, + IOMMUFD_OBJ_IOAS, + IOMMUFD_OBJ_ACCESS, +#ifdef CONFIG_IOMMUFD_TEST + IOMMUFD_OBJ_SELFTEST, +#endif +}; + +/* Base struct for all objects with a userspace ID handle. */ +struct iommufd_object { + struct rw_semaphore destroy_rwsem; + refcount_t users; + enum iommufd_object_type type; + unsigned int id; +}; + +static inline bool iommufd_lock_obj(struct iommufd_object *obj) +{ + if (!down_read_trylock(&obj->destroy_rwsem)) + return false; + if (!refcount_inc_not_zero(&obj->users)) { + up_read(&obj->destroy_rwsem); + return false; + } + return true; +} + +struct iommufd_object *iommufd_get_object(struct iommufd_ctx *ictx, u32 id, + enum iommufd_object_type type); +static inline void iommufd_put_object(struct iommufd_object *obj) +{ + refcount_dec(&obj->users); + up_read(&obj->destroy_rwsem); +} + +/** + * iommufd_ref_to_users() - Switch from destroy_rwsem to users refcount + * protection + * @obj - Object to release + * + * Objects have two refcount protections (destroy_rwsem and the refcount_t + * users). Holding either of these will prevent the object from being destroyed. + * + * Depending on the use case, one protection or the other is appropriate. In + * most cases references are being protected by the destroy_rwsem. This allows + * orderly destruction of the object because iommufd_object_destroy_user() will + * wait for it to become unlocked. However, as a rwsem, it cannot be held across + * a system call return. So cases that have longer term needs must switch + * to the weaker users refcount_t. + * + * With users protection iommufd_object_destroy_user() will return false, + * refusing to destroy the object, causing -EBUSY to userspace. + */ +static inline void iommufd_ref_to_users(struct iommufd_object *obj) +{ + up_read(&obj->destroy_rwsem); + /* iommufd_lock_obj() obtains users as well */ +} +void iommufd_object_abort(struct iommufd_ctx *ictx, struct iommufd_object *obj); +void iommufd_object_abort_and_destroy(struct iommufd_ctx *ictx, + struct iommufd_object *obj); +void iommufd_object_finalize(struct iommufd_ctx *ictx, + struct iommufd_object *obj); +bool iommufd_object_destroy_user(struct iommufd_ctx *ictx, + struct iommufd_object *obj); +struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx, + size_t size, + enum iommufd_object_type type); + +#define iommufd_object_alloc(ictx, ptr, type) \ + container_of(_iommufd_object_alloc( \ + ictx, \ + sizeof(*(ptr)) + BUILD_BUG_ON_ZERO( \ + offsetof(typeof(*(ptr)), \ + obj) != 0), \ + type), \ + typeof(*(ptr)), obj) + +/* + * The IO Address Space (IOAS) pagetable is a virtual page table backed by the + * io_pagetable object. It is a user controlled mapping of IOVA -> PFNs. The + * mapping is copied into all of the associated domains and made available to + * in-kernel users. + * + * Every iommu_domain that is created is wrapped in a iommufd_hw_pagetable + * object. When we go to attach a device to an IOAS we need to get an + * iommu_domain and wrapping iommufd_hw_pagetable for it. + * + * An iommu_domain & iommfd_hw_pagetable will be automatically selected + * for a device based on the hwpt_list. If no suitable iommu_domain + * is found a new iommu_domain will be created. + */ +struct iommufd_ioas { + struct iommufd_object obj; + struct io_pagetable iopt; + struct mutex mutex; + struct list_head hwpt_list; +}; + +static inline struct iommufd_ioas *iommufd_get_ioas(struct iommufd_ucmd *ucmd, + u32 id) +{ + return container_of(iommufd_get_object(ucmd->ictx, id, + IOMMUFD_OBJ_IOAS), + struct iommufd_ioas, obj); +} + +struct iommufd_ioas *iommufd_ioas_alloc(struct iommufd_ctx *ictx); +int iommufd_ioas_alloc_ioctl(struct iommufd_ucmd *ucmd); +void iommufd_ioas_destroy(struct iommufd_object *obj); +int iommufd_ioas_iova_ranges(struct iommufd_ucmd *ucmd); +int iommufd_ioas_allow_iovas(struct iommufd_ucmd *ucmd); +int iommufd_ioas_map(struct iommufd_ucmd *ucmd); +int iommufd_ioas_copy(struct iommufd_ucmd *ucmd); +int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd); +int iommufd_ioas_option(struct iommufd_ucmd *ucmd); +int iommufd_option_rlimit_mode(struct iommu_option *cmd, + struct iommufd_ctx *ictx); + +int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd); + +/* + * A HW pagetable is called an iommu_domain inside the kernel. This user object + * allows directly creating and inspecting the domains. Domains that have kernel + * owned page tables will be associated with an iommufd_ioas that provides the + * IOVA to PFN map. + */ +struct iommufd_hw_pagetable { + struct iommufd_object obj; + struct iommufd_ioas *ioas; + struct iommu_domain *domain; + bool auto_domain : 1; + bool enforce_cache_coherency : 1; + bool msi_cookie : 1; + /* Head at iommufd_ioas::hwpt_list */ + struct list_head hwpt_item; + struct mutex devices_lock; + struct list_head devices; +}; + +struct iommufd_hw_pagetable * +iommufd_hw_pagetable_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, + struct device *dev); +void iommufd_hw_pagetable_destroy(struct iommufd_object *obj); + +void iommufd_device_destroy(struct iommufd_object *obj); + +struct iommufd_access { + struct iommufd_object obj; + struct iommufd_ctx *ictx; + struct iommufd_ioas *ioas; + const struct iommufd_access_ops *ops; + void *data; + unsigned long iova_alignment; + u32 iopt_access_list_id; +}; + +int iopt_add_access(struct io_pagetable *iopt, struct iommufd_access *access); +void iopt_remove_access(struct io_pagetable *iopt, + struct iommufd_access *access); +void iommufd_access_destroy_object(struct iommufd_object *obj); + +#ifdef CONFIG_IOMMUFD_TEST +struct iommufd_hw_pagetable * +iommufd_device_selftest_attach(struct iommufd_ctx *ictx, + struct iommufd_ioas *ioas, + struct device *mock_dev); +void iommufd_device_selftest_detach(struct iommufd_ctx *ictx, + struct iommufd_hw_pagetable *hwpt); +int iommufd_test(struct iommufd_ucmd *ucmd); +void iommufd_selftest_destroy(struct iommufd_object *obj); +extern size_t iommufd_test_memory_limit; +void iommufd_test_syz_conv_iova_id(struct iommufd_ucmd *ucmd, + unsigned int ioas_id, u64 *iova, u32 *flags); +bool iommufd_should_fail(void); +void __init iommufd_test_init(void); +void iommufd_test_exit(void); +#else +static inline void iommufd_test_syz_conv_iova_id(struct iommufd_ucmd *ucmd, + unsigned int ioas_id, + u64 *iova, u32 *flags) +{ +} +static inline bool iommufd_should_fail(void) +{ + return false; +} +static inline void __init iommufd_test_init(void) +{ +} +static inline void iommufd_test_exit(void) +{ +} +#endif +#endif diff --git a/drivers/iommu/iommufd/iommufd_test.h b/drivers/iommu/iommufd/iommufd_test.h new file mode 100644 index 000000000000..1d96a8f466fd --- /dev/null +++ b/drivers/iommu/iommufd/iommufd_test.h @@ -0,0 +1,93 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. + */ +#ifndef _UAPI_IOMMUFD_TEST_H +#define _UAPI_IOMMUFD_TEST_H + +#include <linux/types.h> +#include <linux/iommufd.h> + +enum { + IOMMU_TEST_OP_ADD_RESERVED = 1, + IOMMU_TEST_OP_MOCK_DOMAIN, + IOMMU_TEST_OP_MD_CHECK_MAP, + IOMMU_TEST_OP_MD_CHECK_REFS, + IOMMU_TEST_OP_CREATE_ACCESS, + IOMMU_TEST_OP_DESTROY_ACCESS_PAGES, + IOMMU_TEST_OP_ACCESS_PAGES, + IOMMU_TEST_OP_ACCESS_RW, + IOMMU_TEST_OP_SET_TEMP_MEMORY_LIMIT, +}; + +enum { + MOCK_APERTURE_START = 1UL << 24, + MOCK_APERTURE_LAST = (1UL << 31) - 1, +}; + +enum { + MOCK_FLAGS_ACCESS_WRITE = 1 << 0, + MOCK_FLAGS_ACCESS_SYZ = 1 << 16, +}; + +enum { + MOCK_ACCESS_RW_WRITE = 1 << 0, + MOCK_ACCESS_RW_SLOW_PATH = 1 << 2, +}; + +enum { + MOCK_FLAGS_ACCESS_CREATE_NEEDS_PIN_PAGES = 1 << 0, +}; + +struct iommu_test_cmd { + __u32 size; + __u32 op; + __u32 id; + __u32 __reserved; + union { + struct { + __aligned_u64 start; + __aligned_u64 length; + } add_reserved; + struct { + __u32 out_device_id; + __u32 out_hwpt_id; + } mock_domain; + struct { + __aligned_u64 iova; + __aligned_u64 length; + __aligned_u64 uptr; + } check_map; + struct { + __aligned_u64 length; + __aligned_u64 uptr; + __u32 refs; + } check_refs; + struct { + __u32 out_access_fd; + __u32 flags; + } create_access; + struct { + __u32 access_pages_id; + } destroy_access_pages; + struct { + __u32 flags; + __u32 out_access_pages_id; + __aligned_u64 iova; + __aligned_u64 length; + __aligned_u64 uptr; + } access_pages; + struct { + __aligned_u64 iova; + __aligned_u64 length; + __aligned_u64 uptr; + __u32 flags; + } access_rw; + struct { + __u32 limit; + } memory_limit; + }; + __u32 last; +}; +#define IOMMU_TEST_CMD _IO(IOMMUFD_TYPE, IOMMUFD_CMD_BASE + 32) + +#endif diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c new file mode 100644 index 000000000000..083e6fcbe10a --- /dev/null +++ b/drivers/iommu/iommufd/main.c @@ -0,0 +1,460 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (C) 2021 Intel Corporation + * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES + * + * iommufd provides control over the IOMMU HW objects created by IOMMU kernel + * drivers. IOMMU HW objects revolve around IO page tables that map incoming DMA + * addresses (IOVA) to CPU addresses. + */ +#define pr_fmt(fmt) "iommufd: " fmt + +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/miscdevice.h> +#include <linux/mutex.h> +#include <linux/bug.h> +#include <uapi/linux/iommufd.h> +#include <linux/iommufd.h> + +#include "io_pagetable.h" +#include "iommufd_private.h" +#include "iommufd_test.h" + +struct iommufd_object_ops { + void (*destroy)(struct iommufd_object *obj); +}; +static const struct iommufd_object_ops iommufd_object_ops[]; +static struct miscdevice vfio_misc_dev; + +struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx, + size_t size, + enum iommufd_object_type type) +{ + struct iommufd_object *obj; + int rc; + + obj = kzalloc(size, GFP_KERNEL_ACCOUNT); + if (!obj) + return ERR_PTR(-ENOMEM); + obj->type = type; + init_rwsem(&obj->destroy_rwsem); + refcount_set(&obj->users, 1); + + /* + * Reserve an ID in the xarray but do not publish the pointer yet since + * the caller hasn't initialized it yet. Once the pointer is published + * in the xarray and visible to other threads we can't reliably destroy + * it anymore, so the caller must complete all errorable operations + * before calling iommufd_object_finalize(). + */ + rc = xa_alloc(&ictx->objects, &obj->id, XA_ZERO_ENTRY, + xa_limit_32b, GFP_KERNEL_ACCOUNT); + if (rc) + goto out_free; + return obj; +out_free: + kfree(obj); + return ERR_PTR(rc); +} + +/* + * Allow concurrent access to the object. + * + * Once another thread can see the object pointer it can prevent object + * destruction. Expect for special kernel-only objects there is no in-kernel way + * to reliably destroy a single object. Thus all APIs that are creating objects + * must use iommufd_object_abort() to handle their errors and only call + * iommufd_object_finalize() once object creation cannot fail. + */ +void iommufd_object_finalize(struct iommufd_ctx *ictx, + struct iommufd_object *obj) +{ + void *old; + + old = xa_store(&ictx->objects, obj->id, obj, GFP_KERNEL); + /* obj->id was returned from xa_alloc() so the xa_store() cannot fail */ + WARN_ON(old); +} + +/* Undo _iommufd_object_alloc() if iommufd_object_finalize() was not called */ +void iommufd_object_abort(struct iommufd_ctx *ictx, struct iommufd_object *obj) +{ + void *old; + + old = xa_erase(&ictx->objects, obj->id); + WARN_ON(old); + kfree(obj); +} + +/* + * Abort an object that has been fully initialized and needs destroy, but has + * not been finalized. + */ +void iommufd_object_abort_and_destroy(struct iommufd_ctx *ictx, + struct iommufd_object *obj) +{ + iommufd_object_ops[obj->type].destroy(obj); + iommufd_object_abort(ictx, obj); +} + +struct iommufd_object *iommufd_get_object(struct iommufd_ctx *ictx, u32 id, + enum iommufd_object_type type) +{ + struct iommufd_object *obj; + + if (iommufd_should_fail()) + return ERR_PTR(-ENOENT); + + xa_lock(&ictx->objects); + obj = xa_load(&ictx->objects, id); + if (!obj || (type != IOMMUFD_OBJ_ANY && obj->type != type) || + !iommufd_lock_obj(obj)) + obj = ERR_PTR(-ENOENT); + xa_unlock(&ictx->objects); + return obj; +} + +/* + * The caller holds a users refcount and wants to destroy the object. Returns + * true if the object was destroyed. In all cases the caller no longer has a + * reference on obj. + */ +bool iommufd_object_destroy_user(struct iommufd_ctx *ictx, + struct iommufd_object *obj) +{ + /* + * The purpose of the destroy_rwsem is to ensure deterministic + * destruction of objects used by external drivers and destroyed by this + * function. Any temporary increment of the refcount must hold the read + * side of this, such as during ioctl execution. + */ + down_write(&obj->destroy_rwsem); + xa_lock(&ictx->objects); + refcount_dec(&obj->users); + if (!refcount_dec_if_one(&obj->users)) { + xa_unlock(&ictx->objects); + up_write(&obj->destroy_rwsem); + return false; + } + __xa_erase(&ictx->objects, obj->id); + if (ictx->vfio_ioas && &ictx->vfio_ioas->obj == obj) + ictx->vfio_ioas = NULL; + xa_unlock(&ictx->objects); + up_write(&obj->destroy_rwsem); + + iommufd_object_ops[obj->type].destroy(obj); + kfree(obj); + return true; +} + +static int iommufd_destroy(struct iommufd_ucmd *ucmd) +{ + struct iommu_destroy *cmd = ucmd->cmd; + struct iommufd_object *obj; + + obj = iommufd_get_object(ucmd->ictx, cmd->id, IOMMUFD_OBJ_ANY); + if (IS_ERR(obj)) + return PTR_ERR(obj); + iommufd_ref_to_users(obj); + /* See iommufd_ref_to_users() */ + if (!iommufd_object_destroy_user(ucmd->ictx, obj)) + return -EBUSY; + return 0; +} + +static int iommufd_fops_open(struct inode *inode, struct file *filp) +{ + struct iommufd_ctx *ictx; + + ictx = kzalloc(sizeof(*ictx), GFP_KERNEL_ACCOUNT); + if (!ictx) + return -ENOMEM; + + /* + * For compatibility with VFIO when /dev/vfio/vfio is opened we default + * to the same rlimit accounting as vfio uses. + */ + if (IS_ENABLED(CONFIG_IOMMUFD_VFIO_CONTAINER) && + filp->private_data == &vfio_misc_dev) { + ictx->account_mode = IOPT_PAGES_ACCOUNT_MM; + pr_info_once("IOMMUFD is providing /dev/vfio/vfio, not VFIO.\n"); + } + + xa_init_flags(&ictx->objects, XA_FLAGS_ALLOC1 | XA_FLAGS_ACCOUNT); + ictx->file = filp; + filp->private_data = ictx; + return 0; +} + +static int iommufd_fops_release(struct inode *inode, struct file *filp) +{ + struct iommufd_ctx *ictx = filp->private_data; + struct iommufd_object *obj; + + /* + * The objects in the xarray form a graph of "users" counts, and we have + * to destroy them in a depth first manner. Leaf objects will reduce the + * users count of interior objects when they are destroyed. + * + * Repeatedly destroying all the "1 users" leaf objects will progress + * until the entire list is destroyed. If this can't progress then there + * is some bug related to object refcounting. + */ + while (!xa_empty(&ictx->objects)) { + unsigned int destroyed = 0; + unsigned long index; + + xa_for_each(&ictx->objects, index, obj) { + if (!refcount_dec_if_one(&obj->users)) + continue; + destroyed++; + xa_erase(&ictx->objects, index); + iommufd_object_ops[obj->type].destroy(obj); + kfree(obj); + } + /* Bug related to users refcount */ + if (WARN_ON(!destroyed)) + break; + } + kfree(ictx); + return 0; +} + +static int iommufd_option(struct iommufd_ucmd *ucmd) +{ + struct iommu_option *cmd = ucmd->cmd; + int rc; + + if (cmd->__reserved) + return -EOPNOTSUPP; + + switch (cmd->option_id) { + case IOMMU_OPTION_RLIMIT_MODE: + rc = iommufd_option_rlimit_mode(cmd, ucmd->ictx); + break; + case IOMMU_OPTION_HUGE_PAGES: + rc = iommufd_ioas_option(ucmd); + break; + default: + return -EOPNOTSUPP; + } + if (rc) + return rc; + if (copy_to_user(&((struct iommu_option __user *)ucmd->ubuffer)->val64, + &cmd->val64, sizeof(cmd->val64))) + return -EFAULT; + return 0; +} + +union ucmd_buffer { + struct iommu_destroy destroy; + struct iommu_ioas_alloc alloc; + struct iommu_ioas_allow_iovas allow_iovas; + struct iommu_ioas_iova_ranges iova_ranges; + struct iommu_ioas_map map; + struct iommu_ioas_unmap unmap; +#ifdef CONFIG_IOMMUFD_TEST + struct iommu_test_cmd test; +#endif +}; + +struct iommufd_ioctl_op { + unsigned int size; + unsigned int min_size; + unsigned int ioctl_num; + int (*execute)(struct iommufd_ucmd *ucmd); +}; + +#define IOCTL_OP(_ioctl, _fn, _struct, _last) \ + [_IOC_NR(_ioctl) - IOMMUFD_CMD_BASE] = { \ + .size = sizeof(_struct) + \ + BUILD_BUG_ON_ZERO(sizeof(union ucmd_buffer) < \ + sizeof(_struct)), \ + .min_size = offsetofend(_struct, _last), \ + .ioctl_num = _ioctl, \ + .execute = _fn, \ + } +static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = { + IOCTL_OP(IOMMU_DESTROY, iommufd_destroy, struct iommu_destroy, id), + IOCTL_OP(IOMMU_IOAS_ALLOC, iommufd_ioas_alloc_ioctl, + struct iommu_ioas_alloc, out_ioas_id), + IOCTL_OP(IOMMU_IOAS_ALLOW_IOVAS, iommufd_ioas_allow_iovas, + struct iommu_ioas_allow_iovas, allowed_iovas), + IOCTL_OP(IOMMU_IOAS_COPY, iommufd_ioas_copy, struct iommu_ioas_copy, + src_iova), + IOCTL_OP(IOMMU_IOAS_IOVA_RANGES, iommufd_ioas_iova_ranges, + struct iommu_ioas_iova_ranges, out_iova_alignment), + IOCTL_OP(IOMMU_IOAS_MAP, iommufd_ioas_map, struct iommu_ioas_map, + iova), + IOCTL_OP(IOMMU_IOAS_UNMAP, iommufd_ioas_unmap, struct iommu_ioas_unmap, + length), + IOCTL_OP(IOMMU_OPTION, iommufd_option, struct iommu_option, + val64), + IOCTL_OP(IOMMU_VFIO_IOAS, iommufd_vfio_ioas, struct iommu_vfio_ioas, + __reserved), +#ifdef CONFIG_IOMMUFD_TEST + IOCTL_OP(IOMMU_TEST_CMD, iommufd_test, struct iommu_test_cmd, last), +#endif +}; + +static long iommufd_fops_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) +{ + struct iommufd_ctx *ictx = filp->private_data; + const struct iommufd_ioctl_op *op; + struct iommufd_ucmd ucmd = {}; + union ucmd_buffer buf; + unsigned int nr; + int ret; + + nr = _IOC_NR(cmd); + if (nr < IOMMUFD_CMD_BASE || + (nr - IOMMUFD_CMD_BASE) >= ARRAY_SIZE(iommufd_ioctl_ops)) + return iommufd_vfio_ioctl(ictx, cmd, arg); + + ucmd.ictx = ictx; + ucmd.ubuffer = (void __user *)arg; + ret = get_user(ucmd.user_size, (u32 __user *)ucmd.ubuffer); + if (ret) + return ret; + + op = &iommufd_ioctl_ops[nr - IOMMUFD_CMD_BASE]; + if (op->ioctl_num != cmd) + return -ENOIOCTLCMD; + if (ucmd.user_size < op->min_size) + return -EINVAL; + + ucmd.cmd = &buf; + ret = copy_struct_from_user(ucmd.cmd, op->size, ucmd.ubuffer, + ucmd.user_size); + if (ret) + return ret; + ret = op->execute(&ucmd); + return ret; +} + +static const struct file_operations iommufd_fops = { + .owner = THIS_MODULE, + .open = iommufd_fops_open, + .release = iommufd_fops_release, + .unlocked_ioctl = iommufd_fops_ioctl, +}; + +/** + * iommufd_ctx_get - Get a context reference + * @ictx: Context to get + * + * The caller must already hold a valid reference to ictx. + */ +void iommufd_ctx_get(struct iommufd_ctx *ictx) +{ + get_file(ictx->file); +} +EXPORT_SYMBOL_NS_GPL(iommufd_ctx_get, IOMMUFD); + +/** + * iommufd_ctx_from_file - Acquires a reference to the iommufd context + * @file: File to obtain the reference from + * + * Returns a pointer to the iommufd_ctx, otherwise ERR_PTR. The struct file + * remains owned by the caller and the caller must still do fput. On success + * the caller is responsible to call iommufd_ctx_put(). + */ +struct iommufd_ctx *iommufd_ctx_from_file(struct file *file) +{ + struct iommufd_ctx *ictx; + + if (file->f_op != &iommufd_fops) + return ERR_PTR(-EBADFD); + ictx = file->private_data; + iommufd_ctx_get(ictx); + return ictx; +} +EXPORT_SYMBOL_NS_GPL(iommufd_ctx_from_file, IOMMUFD); + +/** + * iommufd_ctx_put - Put back a reference + * @ictx: Context to put back + */ +void iommufd_ctx_put(struct iommufd_ctx *ictx) +{ + fput(ictx->file); +} +EXPORT_SYMBOL_NS_GPL(iommufd_ctx_put, IOMMUFD); + +static const struct iommufd_object_ops iommufd_object_ops[] = { + [IOMMUFD_OBJ_ACCESS] = { + .destroy = iommufd_access_destroy_object, + }, + [IOMMUFD_OBJ_DEVICE] = { + .destroy = iommufd_device_destroy, + }, + [IOMMUFD_OBJ_IOAS] = { + .destroy = iommufd_ioas_destroy, + }, + [IOMMUFD_OBJ_HW_PAGETABLE] = { + .destroy = iommufd_hw_pagetable_destroy, + }, +#ifdef CONFIG_IOMMUFD_TEST + [IOMMUFD_OBJ_SELFTEST] = { + .destroy = iommufd_selftest_destroy, + }, +#endif +}; + +static struct miscdevice iommu_misc_dev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "iommu", + .fops = &iommufd_fops, + .nodename = "iommu", + .mode = 0660, +}; + + +static struct miscdevice vfio_misc_dev = { + .minor = VFIO_MINOR, + .name = "vfio", + .fops = &iommufd_fops, + .nodename = "vfio/vfio", + .mode = 0666, +}; + +static int __init iommufd_init(void) +{ + int ret; + + ret = misc_register(&iommu_misc_dev); + if (ret) + return ret; + + if (IS_ENABLED(CONFIG_IOMMUFD_VFIO_CONTAINER)) { + ret = misc_register(&vfio_misc_dev); + if (ret) + goto err_misc; + } + iommufd_test_init(); + return 0; +err_misc: + misc_deregister(&iommu_misc_dev); + return ret; +} + +static void __exit iommufd_exit(void) +{ + iommufd_test_exit(); + if (IS_ENABLED(CONFIG_IOMMUFD_VFIO_CONTAINER)) + misc_deregister(&vfio_misc_dev); + misc_deregister(&iommu_misc_dev); +} + +module_init(iommufd_init); +module_exit(iommufd_exit); + +#if IS_ENABLED(CONFIG_IOMMUFD_VFIO_CONTAINER) +MODULE_ALIAS_MISCDEV(VFIO_MINOR); +MODULE_ALIAS("devname:vfio/vfio"); +#endif +MODULE_DESCRIPTION("I/O Address Space Management for passthrough devices"); +MODULE_LICENSE("GPL"); diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c new file mode 100644 index 000000000000..1e1d3509efae --- /dev/null +++ b/drivers/iommu/iommufd/pages.c @@ -0,0 +1,1977 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. + * + * The iopt_pages is the center of the storage and motion of PFNs. Each + * iopt_pages represents a logical linear array of full PFNs. The array is 0 + * based and has npages in it. Accessors use 'index' to refer to the entry in + * this logical array, regardless of its storage location. + * + * PFNs are stored in a tiered scheme: + * 1) iopt_pages::pinned_pfns xarray + * 2) An iommu_domain + * 3) The origin of the PFNs, i.e. the userspace pointer + * + * PFN have to be copied between all combinations of tiers, depending on the + * configuration. + * + * When a PFN is taken out of the userspace pointer it is pinned exactly once. + * The storage locations of the PFN's index are tracked in the two interval + * trees. If no interval includes the index then it is not pinned. + * + * If access_itree includes the PFN's index then an in-kernel access has + * requested the page. The PFN is stored in the xarray so other requestors can + * continue to find it. + * + * If the domains_itree includes the PFN's index then an iommu_domain is storing + * the PFN and it can be read back using iommu_iova_to_phys(). To avoid + * duplicating storage the xarray is not used if only iommu_domains are using + * the PFN's index. + * + * As a general principle this is designed so that destroy never fails. This + * means removing an iommu_domain or releasing a in-kernel access will not fail + * due to insufficient memory. In practice this means some cases have to hold + * PFNs in the xarray even though they are also being stored in an iommu_domain. + * + * While the iopt_pages can use an iommu_domain as storage, it does not have an + * IOVA itself. Instead the iopt_area represents a range of IOVA and uses the + * iopt_pages as the PFN provider. Multiple iopt_areas can share the iopt_pages + * and reference their own slice of the PFN array, with sub page granularity. + * + * In this file the term 'last' indicates an inclusive and closed interval, eg + * [0,0] refers to a single PFN. 'end' means an open range, eg [0,0) refers to + * no PFNs. + * + * Be cautious of overflow. An IOVA can go all the way up to U64_MAX, so + * last_iova + 1 can overflow. An iopt_pages index will always be much less than + * ULONG_MAX so last_index + 1 cannot overflow. + */ +#include <linux/overflow.h> +#include <linux/slab.h> +#include <linux/iommu.h> +#include <linux/sched/mm.h> +#include <linux/highmem.h> +#include <linux/kthread.h> +#include <linux/iommufd.h> + +#include "io_pagetable.h" +#include "double_span.h" + +#ifndef CONFIG_IOMMUFD_TEST +#define TEMP_MEMORY_LIMIT 65536 +#else +#define TEMP_MEMORY_LIMIT iommufd_test_memory_limit +#endif +#define BATCH_BACKUP_SIZE 32 + +/* + * More memory makes pin_user_pages() and the batching more efficient, but as + * this is only a performance optimization don't try too hard to get it. A 64k + * allocation can hold about 26M of 4k pages and 13G of 2M pages in an + * pfn_batch. Various destroy paths cannot fail and provide a small amount of + * stack memory as a backup contingency. If backup_len is given this cannot + * fail. + */ +static void *temp_kmalloc(size_t *size, void *backup, size_t backup_len) +{ + void *res; + + if (WARN_ON(*size == 0)) + return NULL; + + if (*size < backup_len) + return backup; + + if (!backup && iommufd_should_fail()) + return NULL; + + *size = min_t(size_t, *size, TEMP_MEMORY_LIMIT); + res = kmalloc(*size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY); + if (res) + return res; + *size = PAGE_SIZE; + if (backup_len) { + res = kmalloc(*size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY); + if (res) + return res; + *size = backup_len; + return backup; + } + return kmalloc(*size, GFP_KERNEL); +} + +void interval_tree_double_span_iter_update( + struct interval_tree_double_span_iter *iter) +{ + unsigned long last_hole = ULONG_MAX; + unsigned int i; + + for (i = 0; i != ARRAY_SIZE(iter->spans); i++) { + if (interval_tree_span_iter_done(&iter->spans[i])) { + iter->is_used = -1; + return; + } + + if (iter->spans[i].is_hole) { + last_hole = min(last_hole, iter->spans[i].last_hole); + continue; + } + + iter->is_used = i + 1; + iter->start_used = iter->spans[i].start_used; + iter->last_used = min(iter->spans[i].last_used, last_hole); + return; + } + + iter->is_used = 0; + iter->start_hole = iter->spans[0].start_hole; + iter->last_hole = + min(iter->spans[0].last_hole, iter->spans[1].last_hole); +} + +void interval_tree_double_span_iter_first( + struct interval_tree_double_span_iter *iter, + struct rb_root_cached *itree1, struct rb_root_cached *itree2, + unsigned long first_index, unsigned long last_index) +{ + unsigned int i; + + iter->itrees[0] = itree1; + iter->itrees[1] = itree2; + for (i = 0; i != ARRAY_SIZE(iter->spans); i++) + interval_tree_span_iter_first(&iter->spans[i], iter->itrees[i], + first_index, last_index); + interval_tree_double_span_iter_update(iter); +} + +void interval_tree_double_span_iter_next( + struct interval_tree_double_span_iter *iter) +{ + unsigned int i; + + if (iter->is_used == -1 || + iter->last_hole == iter->spans[0].last_index) { + iter->is_used = -1; + return; + } + + for (i = 0; i != ARRAY_SIZE(iter->spans); i++) + interval_tree_span_iter_advance( + &iter->spans[i], iter->itrees[i], iter->last_hole + 1); + interval_tree_double_span_iter_update(iter); +} + +static void iopt_pages_add_npinned(struct iopt_pages *pages, size_t npages) +{ + int rc; + + rc = check_add_overflow(pages->npinned, npages, &pages->npinned); + if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) + WARN_ON(rc || pages->npinned > pages->npages); +} + +static void iopt_pages_sub_npinned(struct iopt_pages *pages, size_t npages) +{ + int rc; + + rc = check_sub_overflow(pages->npinned, npages, &pages->npinned); + if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) + WARN_ON(rc || pages->npinned > pages->npages); +} + +static void iopt_pages_err_unpin(struct iopt_pages *pages, + unsigned long start_index, + unsigned long last_index, + struct page **page_list) +{ + unsigned long npages = last_index - start_index + 1; + + unpin_user_pages(page_list, npages); + iopt_pages_sub_npinned(pages, npages); +} + +/* + * index is the number of PAGE_SIZE units from the start of the area's + * iopt_pages. If the iova is sub page-size then the area has an iova that + * covers a portion of the first and last pages in the range. + */ +static unsigned long iopt_area_index_to_iova(struct iopt_area *area, + unsigned long index) +{ + if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) + WARN_ON(index < iopt_area_index(area) || + index > iopt_area_last_index(area)); + index -= iopt_area_index(area); + if (index == 0) + return iopt_area_iova(area); + return iopt_area_iova(area) - area->page_offset + index * PAGE_SIZE; +} + +static unsigned long iopt_area_index_to_iova_last(struct iopt_area *area, + unsigned long index) +{ + if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) + WARN_ON(index < iopt_area_index(area) || + index > iopt_area_last_index(area)); + if (index == iopt_area_last_index(area)) + return iopt_area_last_iova(area); + return iopt_area_iova(area) - area->page_offset + + (index - iopt_area_index(area) + 1) * PAGE_SIZE - 1; +} + +static void iommu_unmap_nofail(struct iommu_domain *domain, unsigned long iova, + size_t size) +{ + size_t ret; + + ret = iommu_unmap(domain, iova, size); + /* + * It is a logic error in this code or a driver bug if the IOMMU unmaps + * something other than exactly as requested. This implies that the + * iommu driver may not fail unmap for reasons beyond bad agruments. + * Particularly, the iommu driver may not do a memory allocation on the + * unmap path. + */ + WARN_ON(ret != size); +} + +static void iopt_area_unmap_domain_range(struct iopt_area *area, + struct iommu_domain *domain, + unsigned long start_index, + unsigned long last_index) +{ + unsigned long start_iova = iopt_area_index_to_iova(area, start_index); + + iommu_unmap_nofail(domain, start_iova, + iopt_area_index_to_iova_last(area, last_index) - + start_iova + 1); +} + +static struct iopt_area *iopt_pages_find_domain_area(struct iopt_pages *pages, + unsigned long index) +{ + struct interval_tree_node *node; + + node = interval_tree_iter_first(&pages->domains_itree, index, index); + if (!node) + return NULL; + return container_of(node, struct iopt_area, pages_node); +} + +/* + * A simple datastructure to hold a vector of PFNs, optimized for contiguous + * PFNs. This is used as a temporary holding memory for shuttling pfns from one + * place to another. Generally everything is made more efficient if operations + * work on the largest possible grouping of pfns. eg fewer lock/unlock cycles, + * better cache locality, etc + */ +struct pfn_batch { + unsigned long *pfns; + u32 *npfns; + unsigned int array_size; + unsigned int end; + unsigned int total_pfns; +}; + +static void batch_clear(struct pfn_batch *batch) +{ + batch->total_pfns = 0; + batch->end = 0; + batch->pfns[0] = 0; + batch->npfns[0] = 0; +} + +/* + * Carry means we carry a portion of the final hugepage over to the front of the + * batch + */ +static void batch_clear_carry(struct pfn_batch *batch, unsigned int keep_pfns) +{ + if (!keep_pfns) + return batch_clear(batch); + + if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) + WARN_ON(!batch->end || + batch->npfns[batch->end - 1] < keep_pfns); + + batch->total_pfns = keep_pfns; + batch->npfns[0] = keep_pfns; + batch->pfns[0] = batch->pfns[batch->end - 1] + + (batch->npfns[batch->end - 1] - keep_pfns); + batch->end = 0; +} + +static void batch_skip_carry(struct pfn_batch *batch, unsigned int skip_pfns) +{ + if (!batch->total_pfns) + return; + if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) + WARN_ON(batch->total_pfns != batch->npfns[0]); + skip_pfns = min(batch->total_pfns, skip_pfns); + batch->pfns[0] += skip_pfns; + batch->npfns[0] -= skip_pfns; + batch->total_pfns -= skip_pfns; +} + +static int __batch_init(struct pfn_batch *batch, size_t max_pages, void *backup, + size_t backup_len) +{ + const size_t elmsz = sizeof(*batch->pfns) + sizeof(*batch->npfns); + size_t size = max_pages * elmsz; + + batch->pfns = temp_kmalloc(&size, backup, backup_len); + if (!batch->pfns) + return -ENOMEM; + if (IS_ENABLED(CONFIG_IOMMUFD_TEST) && WARN_ON(size < elmsz)) + return -EINVAL; + batch->array_size = size / elmsz; + batch->npfns = (u32 *)(batch->pfns + batch->array_size); + batch_clear(batch); + return 0; +} + +static int batch_init(struct pfn_batch *batch, size_t max_pages) +{ + return __batch_init(batch, max_pages, NULL, 0); +} + +static void batch_init_backup(struct pfn_batch *batch, size_t max_pages, + void *backup, size_t backup_len) +{ + __batch_init(batch, max_pages, backup, backup_len); +} + +static void batch_destroy(struct pfn_batch *batch, void *backup) +{ + if (batch->pfns != backup) + kfree(batch->pfns); +} + +/* true if the pfn was added, false otherwise */ +static bool batch_add_pfn(struct pfn_batch *batch, unsigned long pfn) +{ + const unsigned int MAX_NPFNS = type_max(typeof(*batch->npfns)); + + if (batch->end && + pfn == batch->pfns[batch->end - 1] + batch->npfns[batch->end - 1] && + batch->npfns[batch->end - 1] != MAX_NPFNS) { + batch->npfns[batch->end - 1]++; + batch->total_pfns++; + return true; + } + if (batch->end == batch->array_size) + return false; + batch->total_pfns++; + batch->pfns[batch->end] = pfn; + batch->npfns[batch->end] = 1; + batch->end++; + return true; +} + +/* + * Fill the batch with pfns from the domain. When the batch is full, or it + * reaches last_index, the function will return. The caller should use + * batch->total_pfns to determine the starting point for the next iteration. + */ +static void batch_from_domain(struct pfn_batch *batch, + struct iommu_domain *domain, + struct iopt_area *area, unsigned long start_index, + unsigned long last_index) +{ + unsigned int page_offset = 0; + unsigned long iova; + phys_addr_t phys; + + iova = iopt_area_index_to_iova(area, start_index); + if (start_index == iopt_area_index(area)) + page_offset = area->page_offset; + while (start_index <= last_index) { + /* + * This is pretty slow, it would be nice to get the page size + * back from the driver, or have the driver directly fill the + * batch. + */ + phys = iommu_iova_to_phys(domain, iova) - page_offset; + if (!batch_add_pfn(batch, PHYS_PFN(phys))) + return; + iova += PAGE_SIZE - page_offset; + page_offset = 0; + start_index++; + } +} + +static struct page **raw_pages_from_domain(struct iommu_domain *domain, + struct iopt_area *area, + unsigned long start_index, + unsigned long last_index, + struct page **out_pages) +{ + unsigned int page_offset = 0; + unsigned long iova; + phys_addr_t phys; + + iova = iopt_area_index_to_iova(area, start_index); + if (start_index == iopt_area_index(area)) + page_offset = area->page_offset; + while (start_index <= last_index) { + phys = iommu_iova_to_phys(domain, iova) - page_offset; + *(out_pages++) = pfn_to_page(PHYS_PFN(phys)); + iova += PAGE_SIZE - page_offset; + page_offset = 0; + start_index++; + } + return out_pages; +} + +/* Continues reading a domain until we reach a discontinuity in the pfns. */ +static void batch_from_domain_continue(struct pfn_batch *batch, + struct iommu_domain *domain, + struct iopt_area *area, + unsigned long start_index, + unsigned long last_index) +{ + unsigned int array_size = batch->array_size; + + batch->array_size = batch->end; + batch_from_domain(batch, domain, area, start_index, last_index); + batch->array_size = array_size; +} + +/* + * This is part of the VFIO compatibility support for VFIO_TYPE1_IOMMU. That + * mode permits splitting a mapped area up, and then one of the splits is + * unmapped. Doing this normally would cause us to violate our invariant of + * pairing map/unmap. Thus, to support old VFIO compatibility disable support + * for batching consecutive PFNs. All PFNs mapped into the iommu are done in + * PAGE_SIZE units, not larger or smaller. + */ +static int batch_iommu_map_small(struct iommu_domain *domain, + unsigned long iova, phys_addr_t paddr, + size_t size, int prot) +{ + unsigned long start_iova = iova; + int rc; + + if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) + WARN_ON(paddr % PAGE_SIZE || iova % PAGE_SIZE || + size % PAGE_SIZE); + + while (size) { + rc = iommu_map(domain, iova, paddr, PAGE_SIZE, prot); + if (rc) + goto err_unmap; + iova += PAGE_SIZE; + paddr += PAGE_SIZE; + size -= PAGE_SIZE; + } + return 0; + +err_unmap: + if (start_iova != iova) + iommu_unmap_nofail(domain, start_iova, iova - start_iova); + return rc; +} + +static int batch_to_domain(struct pfn_batch *batch, struct iommu_domain *domain, + struct iopt_area *area, unsigned long start_index) +{ + bool disable_large_pages = area->iopt->disable_large_pages; + unsigned long last_iova = iopt_area_last_iova(area); + unsigned int page_offset = 0; + unsigned long start_iova; + unsigned long next_iova; + unsigned int cur = 0; + unsigned long iova; + int rc; + + /* The first index might be a partial page */ + if (start_index == iopt_area_index(area)) + page_offset = area->page_offset; + next_iova = iova = start_iova = + iopt_area_index_to_iova(area, start_index); + while (cur < batch->end) { + next_iova = min(last_iova + 1, + next_iova + batch->npfns[cur] * PAGE_SIZE - + page_offset); + if (disable_large_pages) + rc = batch_iommu_map_small( + domain, iova, + PFN_PHYS(batch->pfns[cur]) + page_offset, + next_iova - iova, area->iommu_prot); + else + rc = iommu_map(domain, iova, + PFN_PHYS(batch->pfns[cur]) + page_offset, + next_iova - iova, area->iommu_prot); + if (rc) + goto err_unmap; + iova = next_iova; + page_offset = 0; + cur++; + } + return 0; +err_unmap: + if (start_iova != iova) + iommu_unmap_nofail(domain, start_iova, iova - start_iova); + return rc; +} + +static void batch_from_xarray(struct pfn_batch *batch, struct xarray *xa, + unsigned long start_index, + unsigned long last_index) +{ + XA_STATE(xas, xa, start_index); + void *entry; + + rcu_read_lock(); + while (true) { + entry = xas_next(&xas); + if (xas_retry(&xas, entry)) + continue; + WARN_ON(!xa_is_value(entry)); + if (!batch_add_pfn(batch, xa_to_value(entry)) || + start_index == last_index) + break; + start_index++; + } + rcu_read_unlock(); +} + +static void batch_from_xarray_clear(struct pfn_batch *batch, struct xarray *xa, + unsigned long start_index, + unsigned long last_index) +{ + XA_STATE(xas, xa, start_index); + void *entry; + + xas_lock(&xas); + while (true) { + entry = xas_next(&xas); + if (xas_retry(&xas, entry)) + continue; + WARN_ON(!xa_is_value(entry)); + if (!batch_add_pfn(batch, xa_to_value(entry))) + break; + xas_store(&xas, NULL); + if (start_index == last_index) + break; + start_index++; + } + xas_unlock(&xas); +} + +static void clear_xarray(struct xarray *xa, unsigned long start_index, + unsigned long last_index) +{ + XA_STATE(xas, xa, start_index); + void *entry; + + xas_lock(&xas); + xas_for_each(&xas, entry, last_index) + xas_store(&xas, NULL); + xas_unlock(&xas); +} + +static int pages_to_xarray(struct xarray *xa, unsigned long start_index, + unsigned long last_index, struct page **pages) +{ + struct page **end_pages = pages + (last_index - start_index) + 1; + struct page **half_pages = pages + (end_pages - pages) / 2; + XA_STATE(xas, xa, start_index); + + do { + void *old; + + xas_lock(&xas); + while (pages != end_pages) { + /* xarray does not participate in fault injection */ + if (pages == half_pages && iommufd_should_fail()) { + xas_set_err(&xas, -EINVAL); + xas_unlock(&xas); + /* aka xas_destroy() */ + xas_nomem(&xas, GFP_KERNEL); + goto err_clear; + } + + old = xas_store(&xas, xa_mk_value(page_to_pfn(*pages))); + if (xas_error(&xas)) + break; + WARN_ON(old); + pages++; + xas_next(&xas); + } + xas_unlock(&xas); + } while (xas_nomem(&xas, GFP_KERNEL)); + +err_clear: + if (xas_error(&xas)) { + if (xas.xa_index != start_index) + clear_xarray(xa, start_index, xas.xa_index - 1); + return xas_error(&xas); + } + return 0; +} + +static void batch_from_pages(struct pfn_batch *batch, struct page **pages, + size_t npages) +{ + struct page **end = pages + npages; + + for (; pages != end; pages++) + if (!batch_add_pfn(batch, page_to_pfn(*pages))) + break; +} + +static void batch_unpin(struct pfn_batch *batch, struct iopt_pages *pages, + unsigned int first_page_off, size_t npages) +{ + unsigned int cur = 0; + + while (first_page_off) { + if (batch->npfns[cur] > first_page_off) + break; + first_page_off -= batch->npfns[cur]; + cur++; + } + + while (npages) { + size_t to_unpin = min_t(size_t, npages, + batch->npfns[cur] - first_page_off); + + unpin_user_page_range_dirty_lock( + pfn_to_page(batch->pfns[cur] + first_page_off), + to_unpin, pages->writable); + iopt_pages_sub_npinned(pages, to_unpin); + cur++; + first_page_off = 0; + npages -= to_unpin; + } +} + +static void copy_data_page(struct page *page, void *data, unsigned long offset, + size_t length, unsigned int flags) +{ + void *mem; + + mem = kmap_local_page(page); + if (flags & IOMMUFD_ACCESS_RW_WRITE) { + memcpy(mem + offset, data, length); + set_page_dirty_lock(page); + } else { + memcpy(data, mem + offset, length); + } + kunmap_local(mem); +} + +static unsigned long batch_rw(struct pfn_batch *batch, void *data, + unsigned long offset, unsigned long length, + unsigned int flags) +{ + unsigned long copied = 0; + unsigned int npage = 0; + unsigned int cur = 0; + + while (cur < batch->end) { + unsigned long bytes = min(length, PAGE_SIZE - offset); + + copy_data_page(pfn_to_page(batch->pfns[cur] + npage), data, + offset, bytes, flags); + offset = 0; + length -= bytes; + data += bytes; + copied += bytes; + npage++; + if (npage == batch->npfns[cur]) { + npage = 0; + cur++; + } + if (!length) + break; + } + return copied; +} + +/* pfn_reader_user is just the pin_user_pages() path */ +struct pfn_reader_user { + struct page **upages; + size_t upages_len; + unsigned long upages_start; + unsigned long upages_end; + unsigned int gup_flags; + /* + * 1 means mmget() and mmap_read_lock(), 0 means only mmget(), -1 is + * neither + */ + int locked; +}; + +static void pfn_reader_user_init(struct pfn_reader_user *user, + struct iopt_pages *pages) +{ + user->upages = NULL; + user->upages_start = 0; + user->upages_end = 0; + user->locked = -1; + + user->gup_flags = FOLL_LONGTERM; + if (pages->writable) + user->gup_flags |= FOLL_WRITE; +} + +static void pfn_reader_user_destroy(struct pfn_reader_user *user, + struct iopt_pages *pages) +{ + if (user->locked != -1) { + if (user->locked) + mmap_read_unlock(pages->source_mm); + if (pages->source_mm != current->mm) + mmput(pages->source_mm); + user->locked = -1; + } + + kfree(user->upages); + user->upages = NULL; +} + +static int pfn_reader_user_pin(struct pfn_reader_user *user, + struct iopt_pages *pages, + unsigned long start_index, + unsigned long last_index) +{ + bool remote_mm = pages->source_mm != current->mm; + unsigned long npages; + uintptr_t uptr; + long rc; + + if (IS_ENABLED(CONFIG_IOMMUFD_TEST) && + WARN_ON(last_index < start_index)) + return -EINVAL; + + if (!user->upages) { + /* All undone in pfn_reader_destroy() */ + user->upages_len = + (last_index - start_index + 1) * sizeof(*user->upages); + user->upages = temp_kmalloc(&user->upages_len, NULL, 0); + if (!user->upages) + return -ENOMEM; + } + + if (user->locked == -1) { + /* + * The majority of usages will run the map task within the mm + * providing the pages, so we can optimize into + * get_user_pages_fast() + */ + if (remote_mm) { + if (!mmget_not_zero(pages->source_mm)) + return -EFAULT; + } + user->locked = 0; + } + + npages = min_t(unsigned long, last_index - start_index + 1, + user->upages_len / sizeof(*user->upages)); + + + if (iommufd_should_fail()) + return -EFAULT; + + uptr = (uintptr_t)(pages->uptr + start_index * PAGE_SIZE); + if (!remote_mm) + rc = pin_user_pages_fast(uptr, npages, user->gup_flags, + user->upages); + else { + if (!user->locked) { + mmap_read_lock(pages->source_mm); + user->locked = 1; + } + rc = pin_user_pages_remote(pages->source_mm, uptr, npages, + user->gup_flags, user->upages, NULL, + &user->locked); + } + if (rc <= 0) { + if (WARN_ON(!rc)) + return -EFAULT; + return rc; + } + iopt_pages_add_npinned(pages, rc); + user->upages_start = start_index; + user->upages_end = start_index + rc; + return 0; +} + +/* This is the "modern" and faster accounting method used by io_uring */ +static int incr_user_locked_vm(struct iopt_pages *pages, unsigned long npages) +{ + unsigned long lock_limit; + unsigned long cur_pages; + unsigned long new_pages; + + lock_limit = task_rlimit(pages->source_task, RLIMIT_MEMLOCK) >> + PAGE_SHIFT; + do { + cur_pages = atomic_long_read(&pages->source_user->locked_vm); + new_pages = cur_pages + npages; + if (new_pages > lock_limit) + return -ENOMEM; + } while (atomic_long_cmpxchg(&pages->source_user->locked_vm, cur_pages, + new_pages) != cur_pages); + return 0; +} + +static void decr_user_locked_vm(struct iopt_pages *pages, unsigned long npages) +{ + if (WARN_ON(atomic_long_read(&pages->source_user->locked_vm) < npages)) + return; + atomic_long_sub(npages, &pages->source_user->locked_vm); +} + +/* This is the accounting method used for compatibility with VFIO */ +static int update_mm_locked_vm(struct iopt_pages *pages, unsigned long npages, + bool inc, struct pfn_reader_user *user) +{ + bool do_put = false; + int rc; + + if (user && user->locked) { + mmap_read_unlock(pages->source_mm); + user->locked = 0; + /* If we had the lock then we also have a get */ + } else if ((!user || !user->upages) && + pages->source_mm != current->mm) { + if (!mmget_not_zero(pages->source_mm)) + return -EINVAL; + do_put = true; + } + + mmap_write_lock(pages->source_mm); + rc = __account_locked_vm(pages->source_mm, npages, inc, + pages->source_task, false); + mmap_write_unlock(pages->source_mm); + + if (do_put) + mmput(pages->source_mm); + return rc; +} + +static int do_update_pinned(struct iopt_pages *pages, unsigned long npages, + bool inc, struct pfn_reader_user *user) +{ + int rc = 0; + + switch (pages->account_mode) { + case IOPT_PAGES_ACCOUNT_NONE: + break; + case IOPT_PAGES_ACCOUNT_USER: + if (inc) + rc = incr_user_locked_vm(pages, npages); + else + decr_user_locked_vm(pages, npages); + break; + case IOPT_PAGES_ACCOUNT_MM: + rc = update_mm_locked_vm(pages, npages, inc, user); + break; + } + if (rc) + return rc; + + pages->last_npinned = pages->npinned; + if (inc) + atomic64_add(npages, &pages->source_mm->pinned_vm); + else + atomic64_sub(npages, &pages->source_mm->pinned_vm); + return 0; +} + +static void update_unpinned(struct iopt_pages *pages) +{ + if (WARN_ON(pages->npinned > pages->last_npinned)) + return; + if (pages->npinned == pages->last_npinned) + return; + do_update_pinned(pages, pages->last_npinned - pages->npinned, false, + NULL); +} + +/* + * Changes in the number of pages pinned is done after the pages have been read + * and processed. If the user lacked the limit then the error unwind will unpin + * everything that was just pinned. This is because it is expensive to calculate + * how many pages we have already pinned within a range to generate an accurate + * prediction in advance of doing the work to actually pin them. + */ +static int pfn_reader_user_update_pinned(struct pfn_reader_user *user, + struct iopt_pages *pages) +{ + unsigned long npages; + bool inc; + + lockdep_assert_held(&pages->mutex); + + if (pages->npinned == pages->last_npinned) + return 0; + + if (pages->npinned < pages->last_npinned) { + npages = pages->last_npinned - pages->npinned; + inc = false; + } else { + if (iommufd_should_fail()) + return -ENOMEM; + npages = pages->npinned - pages->last_npinned; + inc = true; + } + return do_update_pinned(pages, npages, inc, user); +} + +/* + * PFNs are stored in three places, in order of preference: + * - The iopt_pages xarray. This is only populated if there is a + * iopt_pages_access + * - The iommu_domain under an area + * - The original PFN source, ie pages->source_mm + * + * This iterator reads the pfns optimizing to load according to the + * above order. + */ +struct pfn_reader { + struct iopt_pages *pages; + struct interval_tree_double_span_iter span; + struct pfn_batch batch; + unsigned long batch_start_index; + unsigned long batch_end_index; + unsigned long last_index; + + struct pfn_reader_user user; +}; + +static int pfn_reader_update_pinned(struct pfn_reader *pfns) +{ + return pfn_reader_user_update_pinned(&pfns->user, pfns->pages); +} + +/* + * The batch can contain a mixture of pages that are still in use and pages that + * need to be unpinned. Unpin only pages that are not held anywhere else. + */ +static void pfn_reader_unpin(struct pfn_reader *pfns) +{ + unsigned long last = pfns->batch_end_index - 1; + unsigned long start = pfns->batch_start_index; + struct interval_tree_double_span_iter span; + struct iopt_pages *pages = pfns->pages; + + lockdep_assert_held(&pages->mutex); + + interval_tree_for_each_double_span(&span, &pages->access_itree, + &pages->domains_itree, start, last) { + if (span.is_used) + continue; + + batch_unpin(&pfns->batch, pages, span.start_hole - start, + span.last_hole - span.start_hole + 1); + } +} + +/* Process a single span to load it from the proper storage */ +static int pfn_reader_fill_span(struct pfn_reader *pfns) +{ + struct interval_tree_double_span_iter *span = &pfns->span; + unsigned long start_index = pfns->batch_end_index; + struct iopt_area *area; + int rc; + + if (IS_ENABLED(CONFIG_IOMMUFD_TEST) && + WARN_ON(span->last_used < start_index)) + return -EINVAL; + + if (span->is_used == 1) { + batch_from_xarray(&pfns->batch, &pfns->pages->pinned_pfns, + start_index, span->last_used); + return 0; + } + + if (span->is_used == 2) { + /* + * Pull as many pages from the first domain we find in the + * target span. If it is too small then we will be called again + * and we'll find another area. + */ + area = iopt_pages_find_domain_area(pfns->pages, start_index); + if (WARN_ON(!area)) + return -EINVAL; + + /* The storage_domain cannot change without the pages mutex */ + batch_from_domain( + &pfns->batch, area->storage_domain, area, start_index, + min(iopt_area_last_index(area), span->last_used)); + return 0; + } + + if (start_index >= pfns->user.upages_end) { + rc = pfn_reader_user_pin(&pfns->user, pfns->pages, start_index, + span->last_hole); + if (rc) + return rc; + } + + batch_from_pages(&pfns->batch, + pfns->user.upages + + (start_index - pfns->user.upages_start), + pfns->user.upages_end - start_index); + return 0; +} + +static bool pfn_reader_done(struct pfn_reader *pfns) +{ + return pfns->batch_start_index == pfns->last_index + 1; +} + +static int pfn_reader_next(struct pfn_reader *pfns) +{ + int rc; + + batch_clear(&pfns->batch); + pfns->batch_start_index = pfns->batch_end_index; + + while (pfns->batch_end_index != pfns->last_index + 1) { + unsigned int npfns = pfns->batch.total_pfns; + + if (IS_ENABLED(CONFIG_IOMMUFD_TEST) && + WARN_ON(interval_tree_double_span_iter_done(&pfns->span))) + return -EINVAL; + + rc = pfn_reader_fill_span(pfns); + if (rc) + return rc; + + if (WARN_ON(!pfns->batch.total_pfns)) + return -EINVAL; + + pfns->batch_end_index = + pfns->batch_start_index + pfns->batch.total_pfns; + if (pfns->batch_end_index == pfns->span.last_used + 1) + interval_tree_double_span_iter_next(&pfns->span); + + /* Batch is full */ + if (npfns == pfns->batch.total_pfns) + return 0; + } + return 0; +} + +static int pfn_reader_init(struct pfn_reader *pfns, struct iopt_pages *pages, + unsigned long start_index, unsigned long last_index) +{ + int rc; + + lockdep_assert_held(&pages->mutex); + + pfns->pages = pages; + pfns->batch_start_index = start_index; + pfns->batch_end_index = start_index; + pfns->last_index = last_index; + pfn_reader_user_init(&pfns->user, pages); + rc = batch_init(&pfns->batch, last_index - start_index + 1); + if (rc) + return rc; + interval_tree_double_span_iter_first(&pfns->span, &pages->access_itree, + &pages->domains_itree, start_index, + last_index); + return 0; +} + +/* + * There are many assertions regarding the state of pages->npinned vs + * pages->last_pinned, for instance something like unmapping a domain must only + * decrement the npinned, and pfn_reader_destroy() must be called only after all + * the pins are updated. This is fine for success flows, but error flows + * sometimes need to release the pins held inside the pfn_reader before going on + * to complete unmapping and releasing pins held in domains. + */ +static void pfn_reader_release_pins(struct pfn_reader *pfns) +{ + struct iopt_pages *pages = pfns->pages; + + if (pfns->user.upages_end > pfns->batch_end_index) { + size_t npages = pfns->user.upages_end - pfns->batch_end_index; + + /* Any pages not transferred to the batch are just unpinned */ + unpin_user_pages(pfns->user.upages + (pfns->batch_end_index - + pfns->user.upages_start), + npages); + iopt_pages_sub_npinned(pages, npages); + pfns->user.upages_end = pfns->batch_end_index; + } + if (pfns->batch_start_index != pfns->batch_end_index) { + pfn_reader_unpin(pfns); + pfns->batch_start_index = pfns->batch_end_index; + } +} + +static void pfn_reader_destroy(struct pfn_reader *pfns) +{ + struct iopt_pages *pages = pfns->pages; + + pfn_reader_release_pins(pfns); + pfn_reader_user_destroy(&pfns->user, pfns->pages); + batch_destroy(&pfns->batch, NULL); + WARN_ON(pages->last_npinned != pages->npinned); +} + +static int pfn_reader_first(struct pfn_reader *pfns, struct iopt_pages *pages, + unsigned long start_index, unsigned long last_index) +{ + int rc; + + if (IS_ENABLED(CONFIG_IOMMUFD_TEST) && + WARN_ON(last_index < start_index)) + return -EINVAL; + + rc = pfn_reader_init(pfns, pages, start_index, last_index); + if (rc) + return rc; + rc = pfn_reader_next(pfns); + if (rc) { + pfn_reader_destroy(pfns); + return rc; + } + return 0; +} + +struct iopt_pages *iopt_alloc_pages(void __user *uptr, unsigned long length, + bool writable) +{ + struct iopt_pages *pages; + + /* + * The iommu API uses size_t as the length, and protect the DIV_ROUND_UP + * below from overflow + */ + if (length > SIZE_MAX - PAGE_SIZE || length == 0) + return ERR_PTR(-EINVAL); + + pages = kzalloc(sizeof(*pages), GFP_KERNEL_ACCOUNT); + if (!pages) + return ERR_PTR(-ENOMEM); + + kref_init(&pages->kref); + xa_init_flags(&pages->pinned_pfns, XA_FLAGS_ACCOUNT); + mutex_init(&pages->mutex); + pages->source_mm = current->mm; + mmgrab(pages->source_mm); + pages->uptr = (void __user *)ALIGN_DOWN((uintptr_t)uptr, PAGE_SIZE); + pages->npages = DIV_ROUND_UP(length + (uptr - pages->uptr), PAGE_SIZE); + pages->access_itree = RB_ROOT_CACHED; + pages->domains_itree = RB_ROOT_CACHED; + pages->writable = writable; + if (capable(CAP_IPC_LOCK)) + pages->account_mode = IOPT_PAGES_ACCOUNT_NONE; + else + pages->account_mode = IOPT_PAGES_ACCOUNT_USER; + pages->source_task = current->group_leader; + get_task_struct(current->group_leader); + pages->source_user = get_uid(current_user()); + return pages; +} + +void iopt_release_pages(struct kref *kref) +{ + struct iopt_pages *pages = container_of(kref, struct iopt_pages, kref); + + WARN_ON(!RB_EMPTY_ROOT(&pages->access_itree.rb_root)); + WARN_ON(!RB_EMPTY_ROOT(&pages->domains_itree.rb_root)); + WARN_ON(pages->npinned); + WARN_ON(!xa_empty(&pages->pinned_pfns)); + mmdrop(pages->source_mm); + mutex_destroy(&pages->mutex); + put_task_struct(pages->source_task); + free_uid(pages->source_user); + kfree(pages); +} + +static void +iopt_area_unpin_domain(struct pfn_batch *batch, struct iopt_area *area, + struct iopt_pages *pages, struct iommu_domain *domain, + unsigned long start_index, unsigned long last_index, + unsigned long *unmapped_end_index, + unsigned long real_last_index) +{ + while (start_index <= last_index) { + unsigned long batch_last_index; + + if (*unmapped_end_index <= last_index) { + unsigned long start = + max(start_index, *unmapped_end_index); + + batch_from_domain(batch, domain, area, start, + last_index); + batch_last_index = start + batch->total_pfns - 1; + } else { + batch_last_index = last_index; + } + + /* + * unmaps must always 'cut' at a place where the pfns are not + * contiguous to pair with the maps that always install + * contiguous pages. Thus, if we have to stop unpinning in the + * middle of the domains we need to keep reading pfns until we + * find a cut point to do the unmap. The pfns we read are + * carried over and either skipped or integrated into the next + * batch. + */ + if (batch_last_index == last_index && + last_index != real_last_index) + batch_from_domain_continue(batch, domain, area, + last_index + 1, + real_last_index); + + if (*unmapped_end_index <= batch_last_index) { + iopt_area_unmap_domain_range( + area, domain, *unmapped_end_index, + start_index + batch->total_pfns - 1); + *unmapped_end_index = start_index + batch->total_pfns; + } + + /* unpin must follow unmap */ + batch_unpin(batch, pages, 0, + batch_last_index - start_index + 1); + start_index = batch_last_index + 1; + + batch_clear_carry(batch, + *unmapped_end_index - batch_last_index - 1); + } +} + +static void __iopt_area_unfill_domain(struct iopt_area *area, + struct iopt_pages *pages, + struct iommu_domain *domain, + unsigned long last_index) +{ + struct interval_tree_double_span_iter span; + unsigned long start_index = iopt_area_index(area); + unsigned long unmapped_end_index = start_index; + u64 backup[BATCH_BACKUP_SIZE]; + struct pfn_batch batch; + + lockdep_assert_held(&pages->mutex); + + /* + * For security we must not unpin something that is still DMA mapped, + * so this must unmap any IOVA before we go ahead and unpin the pages. + * This creates a complexity where we need to skip over unpinning pages + * held in the xarray, but continue to unmap from the domain. + * + * The domain unmap cannot stop in the middle of a contiguous range of + * PFNs. To solve this problem the unpinning step will read ahead to the + * end of any contiguous span, unmap that whole span, and then only + * unpin the leading part that does not have any accesses. The residual + * PFNs that were unmapped but not unpinned are called a "carry" in the + * batch as they are moved to the front of the PFN list and continue on + * to the next iteration(s). + */ + batch_init_backup(&batch, last_index + 1, backup, sizeof(backup)); + interval_tree_for_each_double_span(&span, &pages->domains_itree, + &pages->access_itree, start_index, + last_index) { + if (span.is_used) { + batch_skip_carry(&batch, + span.last_used - span.start_used + 1); + continue; + } + iopt_area_unpin_domain(&batch, area, pages, domain, + span.start_hole, span.last_hole, + &unmapped_end_index, last_index); + } + /* + * If the range ends in a access then we do the residual unmap without + * any unpins. + */ + if (unmapped_end_index != last_index + 1) + iopt_area_unmap_domain_range(area, domain, unmapped_end_index, + last_index); + WARN_ON(batch.total_pfns); + batch_destroy(&batch, backup); + update_unpinned(pages); +} + +static void iopt_area_unfill_partial_domain(struct iopt_area *area, + struct iopt_pages *pages, + struct iommu_domain *domain, + unsigned long end_index) +{ + if (end_index != iopt_area_index(area)) + __iopt_area_unfill_domain(area, pages, domain, end_index - 1); +} + +/** + * iopt_area_unmap_domain() - Unmap without unpinning PFNs in a domain + * @area: The IOVA range to unmap + * @domain: The domain to unmap + * + * The caller must know that unpinning is not required, usually because there + * are other domains in the iopt. + */ +void iopt_area_unmap_domain(struct iopt_area *area, struct iommu_domain *domain) +{ + iommu_unmap_nofail(domain, iopt_area_iova(area), + iopt_area_length(area)); +} + +/** + * iopt_area_unfill_domain() - Unmap and unpin PFNs in a domain + * @area: IOVA area to use + * @pages: page supplier for the area (area->pages is NULL) + * @domain: Domain to unmap from + * + * The domain should be removed from the domains_itree before calling. The + * domain will always be unmapped, but the PFNs may not be unpinned if there are + * still accesses. + */ +void iopt_area_unfill_domain(struct iopt_area *area, struct iopt_pages *pages, + struct iommu_domain *domain) +{ + __iopt_area_unfill_domain(area, pages, domain, + iopt_area_last_index(area)); +} + +/** + * iopt_area_fill_domain() - Map PFNs from the area into a domain + * @area: IOVA area to use + * @domain: Domain to load PFNs into + * + * Read the pfns from the area's underlying iopt_pages and map them into the + * given domain. Called when attaching a new domain to an io_pagetable. + */ +int iopt_area_fill_domain(struct iopt_area *area, struct iommu_domain *domain) +{ + unsigned long done_end_index; + struct pfn_reader pfns; + int rc; + + lockdep_assert_held(&area->pages->mutex); + + rc = pfn_reader_first(&pfns, area->pages, iopt_area_index(area), + iopt_area_last_index(area)); + if (rc) + return rc; + + while (!pfn_reader_done(&pfns)) { + done_end_index = pfns.batch_start_index; + rc = batch_to_domain(&pfns.batch, domain, area, + pfns.batch_start_index); + if (rc) + goto out_unmap; + done_end_index = pfns.batch_end_index; + + rc = pfn_reader_next(&pfns); + if (rc) + goto out_unmap; + } + + rc = pfn_reader_update_pinned(&pfns); + if (rc) + goto out_unmap; + goto out_destroy; + +out_unmap: + pfn_reader_release_pins(&pfns); + iopt_area_unfill_partial_domain(area, area->pages, domain, + done_end_index); +out_destroy: + pfn_reader_destroy(&pfns); + return rc; +} + +/** + * iopt_area_fill_domains() - Install PFNs into the area's domains + * @area: The area to act on + * @pages: The pages associated with the area (area->pages is NULL) + * + * Called during area creation. The area is freshly created and not inserted in + * the domains_itree yet. PFNs are read and loaded into every domain held in the + * area's io_pagetable and the area is installed in the domains_itree. + * + * On failure all domains are left unchanged. + */ +int iopt_area_fill_domains(struct iopt_area *area, struct iopt_pages *pages) +{ + unsigned long done_first_end_index; + unsigned long done_all_end_index; + struct iommu_domain *domain; + unsigned long unmap_index; + struct pfn_reader pfns; + unsigned long index; + int rc; + + lockdep_assert_held(&area->iopt->domains_rwsem); + + if (xa_empty(&area->iopt->domains)) + return 0; + + mutex_lock(&pages->mutex); + rc = pfn_reader_first(&pfns, pages, iopt_area_index(area), + iopt_area_last_index(area)); + if (rc) + goto out_unlock; + + while (!pfn_reader_done(&pfns)) { + done_first_end_index = pfns.batch_end_index; + done_all_end_index = pfns.batch_start_index; + xa_for_each(&area->iopt->domains, index, domain) { + rc = batch_to_domain(&pfns.batch, domain, area, + pfns.batch_start_index); + if (rc) + goto out_unmap; + } + done_all_end_index = done_first_end_index; + + rc = pfn_reader_next(&pfns); + if (rc) + goto out_unmap; + } + rc = pfn_reader_update_pinned(&pfns); + if (rc) + goto out_unmap; + + area->storage_domain = xa_load(&area->iopt->domains, 0); + interval_tree_insert(&area->pages_node, &pages->domains_itree); + goto out_destroy; + +out_unmap: + pfn_reader_release_pins(&pfns); + xa_for_each(&area->iopt->domains, unmap_index, domain) { + unsigned long end_index; + + if (unmap_index < index) + end_index = done_first_end_index; + else + end_index = done_all_end_index; + + /* + * The area is not yet part of the domains_itree so we have to + * manage the unpinning specially. The last domain does the + * unpin, every other domain is just unmapped. + */ + if (unmap_index != area->iopt->next_domain_id - 1) { + if (end_index != iopt_area_index(area)) + iopt_area_unmap_domain_range( + area, domain, iopt_area_index(area), + end_index - 1); + } else { + iopt_area_unfill_partial_domain(area, pages, domain, + end_index); + } + } +out_destroy: + pfn_reader_destroy(&pfns); +out_unlock: + mutex_unlock(&pages->mutex); + return rc; +} + +/** + * iopt_area_unfill_domains() - unmap PFNs from the area's domains + * @area: The area to act on + * @pages: The pages associated with the area (area->pages is NULL) + * + * Called during area destruction. This unmaps the iova's covered by all the + * area's domains and releases the PFNs. + */ +void iopt_area_unfill_domains(struct iopt_area *area, struct iopt_pages *pages) +{ + struct io_pagetable *iopt = area->iopt; + struct iommu_domain *domain; + unsigned long index; + + lockdep_assert_held(&iopt->domains_rwsem); + + mutex_lock(&pages->mutex); + if (!area->storage_domain) + goto out_unlock; + + xa_for_each(&iopt->domains, index, domain) + if (domain != area->storage_domain) + iopt_area_unmap_domain_range( + area, domain, iopt_area_index(area), + iopt_area_last_index(area)); + + interval_tree_remove(&area->pages_node, &pages->domains_itree); + iopt_area_unfill_domain(area, pages, area->storage_domain); + area->storage_domain = NULL; +out_unlock: + mutex_unlock(&pages->mutex); +} + +static void iopt_pages_unpin_xarray(struct pfn_batch *batch, + struct iopt_pages *pages, + unsigned long start_index, + unsigned long end_index) +{ + while (start_index <= end_index) { + batch_from_xarray_clear(batch, &pages->pinned_pfns, start_index, + end_index); + batch_unpin(batch, pages, 0, batch->total_pfns); + start_index += batch->total_pfns; + batch_clear(batch); + } +} + +/** + * iopt_pages_unfill_xarray() - Update the xarry after removing an access + * @pages: The pages to act on + * @start_index: Starting PFN index + * @last_index: Last PFN index + * + * Called when an iopt_pages_access is removed, removes pages from the itree. + * The access should already be removed from the access_itree. + */ +void iopt_pages_unfill_xarray(struct iopt_pages *pages, + unsigned long start_index, + unsigned long last_index) +{ + struct interval_tree_double_span_iter span; + u64 backup[BATCH_BACKUP_SIZE]; + struct pfn_batch batch; + bool batch_inited = false; + + lockdep_assert_held(&pages->mutex); + + interval_tree_for_each_double_span(&span, &pages->access_itree, + &pages->domains_itree, start_index, + last_index) { + if (!span.is_used) { + if (!batch_inited) { + batch_init_backup(&batch, + last_index - start_index + 1, + backup, sizeof(backup)); + batch_inited = true; + } + iopt_pages_unpin_xarray(&batch, pages, span.start_hole, + span.last_hole); + } else if (span.is_used == 2) { + /* Covered by a domain */ + clear_xarray(&pages->pinned_pfns, span.start_used, + span.last_used); + } + /* Otherwise covered by an existing access */ + } + if (batch_inited) + batch_destroy(&batch, backup); + update_unpinned(pages); +} + +/** + * iopt_pages_fill_from_xarray() - Fast path for reading PFNs + * @pages: The pages to act on + * @start_index: The first page index in the range + * @last_index: The last page index in the range + * @out_pages: The output array to return the pages + * + * This can be called if the caller is holding a refcount on an + * iopt_pages_access that is known to have already been filled. It quickly reads + * the pages directly from the xarray. + * + * This is part of the SW iommu interface to read pages for in-kernel use. + */ +void iopt_pages_fill_from_xarray(struct iopt_pages *pages, + unsigned long start_index, + unsigned long last_index, + struct page **out_pages) +{ + XA_STATE(xas, &pages->pinned_pfns, start_index); + void *entry; + + rcu_read_lock(); + while (start_index <= last_index) { + entry = xas_next(&xas); + if (xas_retry(&xas, entry)) + continue; + WARN_ON(!xa_is_value(entry)); + *(out_pages++) = pfn_to_page(xa_to_value(entry)); + start_index++; + } + rcu_read_unlock(); +} + +static int iopt_pages_fill_from_domain(struct iopt_pages *pages, + unsigned long start_index, + unsigned long last_index, + struct page **out_pages) +{ + while (start_index != last_index + 1) { + unsigned long domain_last; + struct iopt_area *area; + + area = iopt_pages_find_domain_area(pages, start_index); + if (WARN_ON(!area)) + return -EINVAL; + + domain_last = min(iopt_area_last_index(area), last_index); + out_pages = raw_pages_from_domain(area->storage_domain, area, + start_index, domain_last, + out_pages); + start_index = domain_last + 1; + } + return 0; +} + +static int iopt_pages_fill_from_mm(struct iopt_pages *pages, + struct pfn_reader_user *user, + unsigned long start_index, + unsigned long last_index, + struct page **out_pages) +{ + unsigned long cur_index = start_index; + int rc; + + while (cur_index != last_index + 1) { + user->upages = out_pages + (cur_index - start_index); + rc = pfn_reader_user_pin(user, pages, cur_index, last_index); + if (rc) + goto out_unpin; + cur_index = user->upages_end; + } + return 0; + +out_unpin: + if (start_index != cur_index) + iopt_pages_err_unpin(pages, start_index, cur_index - 1, + out_pages); + return rc; +} + +/** + * iopt_pages_fill_xarray() - Read PFNs + * @pages: The pages to act on + * @start_index: The first page index in the range + * @last_index: The last page index in the range + * @out_pages: The output array to return the pages, may be NULL + * + * This populates the xarray and returns the pages in out_pages. As the slow + * path this is able to copy pages from other storage tiers into the xarray. + * + * On failure the xarray is left unchanged. + * + * This is part of the SW iommu interface to read pages for in-kernel use. + */ +int iopt_pages_fill_xarray(struct iopt_pages *pages, unsigned long start_index, + unsigned long last_index, struct page **out_pages) +{ + struct interval_tree_double_span_iter span; + unsigned long xa_end = start_index; + struct pfn_reader_user user; + int rc; + + lockdep_assert_held(&pages->mutex); + + pfn_reader_user_init(&user, pages); + user.upages_len = (last_index - start_index + 1) * sizeof(*out_pages); + interval_tree_for_each_double_span(&span, &pages->access_itree, + &pages->domains_itree, start_index, + last_index) { + struct page **cur_pages; + + if (span.is_used == 1) { + cur_pages = out_pages + (span.start_used - start_index); + iopt_pages_fill_from_xarray(pages, span.start_used, + span.last_used, cur_pages); + continue; + } + + if (span.is_used == 2) { + cur_pages = out_pages + (span.start_used - start_index); + iopt_pages_fill_from_domain(pages, span.start_used, + span.last_used, cur_pages); + rc = pages_to_xarray(&pages->pinned_pfns, + span.start_used, span.last_used, + cur_pages); + if (rc) + goto out_clean_xa; + xa_end = span.last_used + 1; + continue; + } + + /* hole */ + cur_pages = out_pages + (span.start_hole - start_index); + rc = iopt_pages_fill_from_mm(pages, &user, span.start_hole, + span.last_hole, cur_pages); + if (rc) + goto out_clean_xa; + rc = pages_to_xarray(&pages->pinned_pfns, span.start_hole, + span.last_hole, cur_pages); + if (rc) { + iopt_pages_err_unpin(pages, span.start_hole, + span.last_hole, cur_pages); + goto out_clean_xa; + } + xa_end = span.last_hole + 1; + } + rc = pfn_reader_user_update_pinned(&user, pages); + if (rc) + goto out_clean_xa; + user.upages = NULL; + pfn_reader_user_destroy(&user, pages); + return 0; + +out_clean_xa: + if (start_index != xa_end) + iopt_pages_unfill_xarray(pages, start_index, xa_end - 1); + user.upages = NULL; + pfn_reader_user_destroy(&user, pages); + return rc; +} + +/* + * This uses the pfn_reader instead of taking a shortcut by using the mm. It can + * do every scenario and is fully consistent with what an iommu_domain would + * see. + */ +static int iopt_pages_rw_slow(struct iopt_pages *pages, + unsigned long start_index, + unsigned long last_index, unsigned long offset, + void *data, unsigned long length, + unsigned int flags) +{ + struct pfn_reader pfns; + int rc; + + mutex_lock(&pages->mutex); + + rc = pfn_reader_first(&pfns, pages, start_index, last_index); + if (rc) + goto out_unlock; + + while (!pfn_reader_done(&pfns)) { + unsigned long done; + + done = batch_rw(&pfns.batch, data, offset, length, flags); + data += done; + length -= done; + offset = 0; + pfn_reader_unpin(&pfns); + + rc = pfn_reader_next(&pfns); + if (rc) + goto out_destroy; + } + if (WARN_ON(length != 0)) + rc = -EINVAL; +out_destroy: + pfn_reader_destroy(&pfns); +out_unlock: + mutex_unlock(&pages->mutex); + return rc; +} + +/* + * A medium speed path that still allows DMA inconsistencies, but doesn't do any + * memory allocations or interval tree searches. + */ +static int iopt_pages_rw_page(struct iopt_pages *pages, unsigned long index, + unsigned long offset, void *data, + unsigned long length, unsigned int flags) +{ + struct page *page = NULL; + int rc; + + if (!mmget_not_zero(pages->source_mm)) + return iopt_pages_rw_slow(pages, index, index, offset, data, + length, flags); + + if (iommufd_should_fail()) { + rc = -EINVAL; + goto out_mmput; + } + + mmap_read_lock(pages->source_mm); + rc = pin_user_pages_remote( + pages->source_mm, (uintptr_t)(pages->uptr + index * PAGE_SIZE), + 1, (flags & IOMMUFD_ACCESS_RW_WRITE) ? FOLL_WRITE : 0, &page, + NULL, NULL); + mmap_read_unlock(pages->source_mm); + if (rc != 1) { + if (WARN_ON(rc >= 0)) + rc = -EINVAL; + goto out_mmput; + } + copy_data_page(page, data, offset, length, flags); + unpin_user_page(page); + rc = 0; + +out_mmput: + mmput(pages->source_mm); + return rc; +} + +/** + * iopt_pages_rw_access - Copy to/from a linear slice of the pages + * @pages: pages to act on + * @start_byte: First byte of pages to copy to/from + * @data: Kernel buffer to get/put the data + * @length: Number of bytes to copy + * @flags: IOMMUFD_ACCESS_RW_* flags + * + * This will find each page in the range, kmap it and then memcpy to/from + * the given kernel buffer. + */ +int iopt_pages_rw_access(struct iopt_pages *pages, unsigned long start_byte, + void *data, unsigned long length, unsigned int flags) +{ + unsigned long start_index = start_byte / PAGE_SIZE; + unsigned long last_index = (start_byte + length - 1) / PAGE_SIZE; + bool change_mm = current->mm != pages->source_mm; + int rc = 0; + + if (IS_ENABLED(CONFIG_IOMMUFD_TEST) && + (flags & __IOMMUFD_ACCESS_RW_SLOW_PATH)) + change_mm = true; + + if ((flags & IOMMUFD_ACCESS_RW_WRITE) && !pages->writable) + return -EPERM; + + if (!(flags & IOMMUFD_ACCESS_RW_KTHREAD) && change_mm) { + if (start_index == last_index) + return iopt_pages_rw_page(pages, start_index, + start_byte % PAGE_SIZE, data, + length, flags); + return iopt_pages_rw_slow(pages, start_index, last_index, + start_byte % PAGE_SIZE, data, length, + flags); + } + + /* + * Try to copy using copy_to_user(). We do this as a fast path and + * ignore any pinning inconsistencies, unlike a real DMA path. + */ + if (change_mm) { + if (!mmget_not_zero(pages->source_mm)) + return iopt_pages_rw_slow(pages, start_index, + last_index, + start_byte % PAGE_SIZE, data, + length, flags); + kthread_use_mm(pages->source_mm); + } + + if (flags & IOMMUFD_ACCESS_RW_WRITE) { + if (copy_to_user(pages->uptr + start_byte, data, length)) + rc = -EFAULT; + } else { + if (copy_from_user(data, pages->uptr + start_byte, length)) + rc = -EFAULT; + } + + if (change_mm) { + kthread_unuse_mm(pages->source_mm); + mmput(pages->source_mm); + } + + return rc; +} + +static struct iopt_pages_access * +iopt_pages_get_exact_access(struct iopt_pages *pages, unsigned long index, + unsigned long last) +{ + struct interval_tree_node *node; + + lockdep_assert_held(&pages->mutex); + + /* There can be overlapping ranges in this interval tree */ + for (node = interval_tree_iter_first(&pages->access_itree, index, last); + node; node = interval_tree_iter_next(node, index, last)) + if (node->start == index && node->last == last) + return container_of(node, struct iopt_pages_access, + node); + return NULL; +} + +/** + * iopt_area_add_access() - Record an in-knerel access for PFNs + * @area: The source of PFNs + * @start_index: First page index + * @last_index: Inclusive last page index + * @out_pages: Output list of struct page's representing the PFNs + * @flags: IOMMUFD_ACCESS_RW_* flags + * + * Record that an in-kernel access will be accessing the pages, ensure they are + * pinned, and return the PFNs as a simple list of 'struct page *'. + * + * This should be undone through a matching call to iopt_area_remove_access() + */ +int iopt_area_add_access(struct iopt_area *area, unsigned long start_index, + unsigned long last_index, struct page **out_pages, + unsigned int flags) +{ + struct iopt_pages *pages = area->pages; + struct iopt_pages_access *access; + int rc; + + if ((flags & IOMMUFD_ACCESS_RW_WRITE) && !pages->writable) + return -EPERM; + + mutex_lock(&pages->mutex); + access = iopt_pages_get_exact_access(pages, start_index, last_index); + if (access) { + area->num_accesses++; + access->users++; + iopt_pages_fill_from_xarray(pages, start_index, last_index, + out_pages); + mutex_unlock(&pages->mutex); + return 0; + } + + access = kzalloc(sizeof(*access), GFP_KERNEL_ACCOUNT); + if (!access) { + rc = -ENOMEM; + goto err_unlock; + } + + rc = iopt_pages_fill_xarray(pages, start_index, last_index, out_pages); + if (rc) + goto err_free; + + access->node.start = start_index; + access->node.last = last_index; + access->users = 1; + area->num_accesses++; + interval_tree_insert(&access->node, &pages->access_itree); + mutex_unlock(&pages->mutex); + return 0; + +err_free: + kfree(access); +err_unlock: + mutex_unlock(&pages->mutex); + return rc; +} + +/** + * iopt_area_remove_access() - Release an in-kernel access for PFNs + * @area: The source of PFNs + * @start_index: First page index + * @last_index: Inclusive last page index + * + * Undo iopt_area_add_access() and unpin the pages if necessary. The caller + * must stop using the PFNs before calling this. + */ +void iopt_area_remove_access(struct iopt_area *area, unsigned long start_index, + unsigned long last_index) +{ + struct iopt_pages *pages = area->pages; + struct iopt_pages_access *access; + + mutex_lock(&pages->mutex); + access = iopt_pages_get_exact_access(pages, start_index, last_index); + if (WARN_ON(!access)) + goto out_unlock; + + WARN_ON(area->num_accesses == 0 || access->users == 0); + area->num_accesses--; + access->users--; + if (access->users) + goto out_unlock; + + interval_tree_remove(&access->node, &pages->access_itree); + iopt_pages_unfill_xarray(pages, start_index, last_index); + kfree(access); +out_unlock: + mutex_unlock(&pages->mutex); +} diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c new file mode 100644 index 000000000000..cfb5fe9a5e0e --- /dev/null +++ b/drivers/iommu/iommufd/selftest.c @@ -0,0 +1,853 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. + * + * Kernel side components to support tools/testing/selftests/iommu + */ +#include <linux/slab.h> +#include <linux/iommu.h> +#include <linux/xarray.h> +#include <linux/file.h> +#include <linux/anon_inodes.h> +#include <linux/fault-inject.h> +#include <uapi/linux/iommufd.h> + +#include "io_pagetable.h" +#include "iommufd_private.h" +#include "iommufd_test.h" + +static DECLARE_FAULT_ATTR(fail_iommufd); +static struct dentry *dbgfs_root; + +size_t iommufd_test_memory_limit = 65536; + +enum { + MOCK_IO_PAGE_SIZE = PAGE_SIZE / 2, + + /* + * Like a real page table alignment requires the low bits of the address + * to be zero. xarray also requires the high bit to be zero, so we store + * the pfns shifted. The upper bits are used for metadata. + */ + MOCK_PFN_MASK = ULONG_MAX / MOCK_IO_PAGE_SIZE, + + _MOCK_PFN_START = MOCK_PFN_MASK + 1, + MOCK_PFN_START_IOVA = _MOCK_PFN_START, + MOCK_PFN_LAST_IOVA = _MOCK_PFN_START, +}; + +/* + * Syzkaller has trouble randomizing the correct iova to use since it is linked + * to the map ioctl's output, and it has no ide about that. So, simplify things. + * In syzkaller mode the 64 bit IOVA is converted into an nth area and offset + * value. This has a much smaller randomization space and syzkaller can hit it. + */ +static unsigned long iommufd_test_syz_conv_iova(struct io_pagetable *iopt, + u64 *iova) +{ + struct syz_layout { + __u32 nth_area; + __u32 offset; + }; + struct syz_layout *syz = (void *)iova; + unsigned int nth = syz->nth_area; + struct iopt_area *area; + + down_read(&iopt->iova_rwsem); + for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area; + area = iopt_area_iter_next(area, 0, ULONG_MAX)) { + if (nth == 0) { + up_read(&iopt->iova_rwsem); + return iopt_area_iova(area) + syz->offset; + } + nth--; + } + up_read(&iopt->iova_rwsem); + + return 0; +} + +void iommufd_test_syz_conv_iova_id(struct iommufd_ucmd *ucmd, + unsigned int ioas_id, u64 *iova, u32 *flags) +{ + struct iommufd_ioas *ioas; + + if (!(*flags & MOCK_FLAGS_ACCESS_SYZ)) + return; + *flags &= ~(u32)MOCK_FLAGS_ACCESS_SYZ; + + ioas = iommufd_get_ioas(ucmd, ioas_id); + if (IS_ERR(ioas)) + return; + *iova = iommufd_test_syz_conv_iova(&ioas->iopt, iova); + iommufd_put_object(&ioas->obj); +} + +struct mock_iommu_domain { + struct iommu_domain domain; + struct xarray pfns; +}; + +enum selftest_obj_type { + TYPE_IDEV, +}; + +struct selftest_obj { + struct iommufd_object obj; + enum selftest_obj_type type; + + union { + struct { + struct iommufd_hw_pagetable *hwpt; + struct iommufd_ctx *ictx; + struct device mock_dev; + } idev; + }; +}; + +static struct iommu_domain *mock_domain_alloc(unsigned int iommu_domain_type) +{ + struct mock_iommu_domain *mock; + + if (WARN_ON(iommu_domain_type != IOMMU_DOMAIN_UNMANAGED)) + return NULL; + + mock = kzalloc(sizeof(*mock), GFP_KERNEL); + if (!mock) + return NULL; + mock->domain.geometry.aperture_start = MOCK_APERTURE_START; + mock->domain.geometry.aperture_end = MOCK_APERTURE_LAST; + mock->domain.pgsize_bitmap = MOCK_IO_PAGE_SIZE; + xa_init(&mock->pfns); + return &mock->domain; +} + +static void mock_domain_free(struct iommu_domain *domain) +{ + struct mock_iommu_domain *mock = + container_of(domain, struct mock_iommu_domain, domain); + + WARN_ON(!xa_empty(&mock->pfns)); + kfree(mock); +} + +static int mock_domain_map_pages(struct iommu_domain *domain, + unsigned long iova, phys_addr_t paddr, + size_t pgsize, size_t pgcount, int prot, + gfp_t gfp, size_t *mapped) +{ + struct mock_iommu_domain *mock = + container_of(domain, struct mock_iommu_domain, domain); + unsigned long flags = MOCK_PFN_START_IOVA; + unsigned long start_iova = iova; + + /* + * xarray does not reliably work with fault injection because it does a + * retry allocation, so put our own failure point. + */ + if (iommufd_should_fail()) + return -ENOENT; + + WARN_ON(iova % MOCK_IO_PAGE_SIZE); + WARN_ON(pgsize % MOCK_IO_PAGE_SIZE); + for (; pgcount; pgcount--) { + size_t cur; + + for (cur = 0; cur != pgsize; cur += MOCK_IO_PAGE_SIZE) { + void *old; + + if (pgcount == 1 && cur + MOCK_IO_PAGE_SIZE == pgsize) + flags = MOCK_PFN_LAST_IOVA; + old = xa_store(&mock->pfns, iova / MOCK_IO_PAGE_SIZE, + xa_mk_value((paddr / MOCK_IO_PAGE_SIZE) | + flags), + gfp); + if (xa_is_err(old)) { + for (; start_iova != iova; + start_iova += MOCK_IO_PAGE_SIZE) + xa_erase(&mock->pfns, + start_iova / + MOCK_IO_PAGE_SIZE); + return xa_err(old); + } + WARN_ON(old); + iova += MOCK_IO_PAGE_SIZE; + paddr += MOCK_IO_PAGE_SIZE; + *mapped += MOCK_IO_PAGE_SIZE; + flags = 0; + } + } + return 0; +} + +static size_t mock_domain_unmap_pages(struct iommu_domain *domain, + unsigned long iova, size_t pgsize, + size_t pgcount, + struct iommu_iotlb_gather *iotlb_gather) +{ + struct mock_iommu_domain *mock = + container_of(domain, struct mock_iommu_domain, domain); + bool first = true; + size_t ret = 0; + void *ent; + + WARN_ON(iova % MOCK_IO_PAGE_SIZE); + WARN_ON(pgsize % MOCK_IO_PAGE_SIZE); + + for (; pgcount; pgcount--) { + size_t cur; + + for (cur = 0; cur != pgsize; cur += MOCK_IO_PAGE_SIZE) { + ent = xa_erase(&mock->pfns, iova / MOCK_IO_PAGE_SIZE); + WARN_ON(!ent); + /* + * iommufd generates unmaps that must be a strict + * superset of the map's performend So every starting + * IOVA should have been an iova passed to map, and the + * + * First IOVA must be present and have been a first IOVA + * passed to map_pages + */ + if (first) { + WARN_ON(!(xa_to_value(ent) & + MOCK_PFN_START_IOVA)); + first = false; + } + if (pgcount == 1 && cur + MOCK_IO_PAGE_SIZE == pgsize) + WARN_ON(!(xa_to_value(ent) & + MOCK_PFN_LAST_IOVA)); + + iova += MOCK_IO_PAGE_SIZE; + ret += MOCK_IO_PAGE_SIZE; + } + } + return ret; +} + +static phys_addr_t mock_domain_iova_to_phys(struct iommu_domain *domain, + dma_addr_t iova) +{ + struct mock_iommu_domain *mock = + container_of(domain, struct mock_iommu_domain, domain); + void *ent; + + WARN_ON(iova % MOCK_IO_PAGE_SIZE); + ent = xa_load(&mock->pfns, iova / MOCK_IO_PAGE_SIZE); + WARN_ON(!ent); + return (xa_to_value(ent) & MOCK_PFN_MASK) * MOCK_IO_PAGE_SIZE; +} + +static const struct iommu_ops mock_ops = { + .owner = THIS_MODULE, + .pgsize_bitmap = MOCK_IO_PAGE_SIZE, + .domain_alloc = mock_domain_alloc, + .default_domain_ops = + &(struct iommu_domain_ops){ + .free = mock_domain_free, + .map_pages = mock_domain_map_pages, + .unmap_pages = mock_domain_unmap_pages, + .iova_to_phys = mock_domain_iova_to_phys, + }, +}; + +static inline struct iommufd_hw_pagetable * +get_md_pagetable(struct iommufd_ucmd *ucmd, u32 mockpt_id, + struct mock_iommu_domain **mock) +{ + struct iommufd_hw_pagetable *hwpt; + struct iommufd_object *obj; + + obj = iommufd_get_object(ucmd->ictx, mockpt_id, + IOMMUFD_OBJ_HW_PAGETABLE); + if (IS_ERR(obj)) + return ERR_CAST(obj); + hwpt = container_of(obj, struct iommufd_hw_pagetable, obj); + if (hwpt->domain->ops != mock_ops.default_domain_ops) { + iommufd_put_object(&hwpt->obj); + return ERR_PTR(-EINVAL); + } + *mock = container_of(hwpt->domain, struct mock_iommu_domain, domain); + return hwpt; +} + +/* Create an hw_pagetable with the mock domain so we can test the domain ops */ +static int iommufd_test_mock_domain(struct iommufd_ucmd *ucmd, + struct iommu_test_cmd *cmd) +{ + static struct bus_type mock_bus = { .iommu_ops = &mock_ops }; + struct iommufd_hw_pagetable *hwpt; + struct selftest_obj *sobj; + struct iommufd_ioas *ioas; + int rc; + + ioas = iommufd_get_ioas(ucmd, cmd->id); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + + sobj = iommufd_object_alloc(ucmd->ictx, sobj, IOMMUFD_OBJ_SELFTEST); + if (IS_ERR(sobj)) { + rc = PTR_ERR(sobj); + goto out_ioas; + } + sobj->idev.ictx = ucmd->ictx; + sobj->type = TYPE_IDEV; + sobj->idev.mock_dev.bus = &mock_bus; + + hwpt = iommufd_device_selftest_attach(ucmd->ictx, ioas, + &sobj->idev.mock_dev); + if (IS_ERR(hwpt)) { + rc = PTR_ERR(hwpt); + goto out_sobj; + } + sobj->idev.hwpt = hwpt; + + /* Userspace must destroy both of these IDs to destroy the object */ + cmd->mock_domain.out_hwpt_id = hwpt->obj.id; + cmd->mock_domain.out_device_id = sobj->obj.id; + iommufd_object_finalize(ucmd->ictx, &sobj->obj); + iommufd_put_object(&ioas->obj); + return iommufd_ucmd_respond(ucmd, sizeof(*cmd)); + +out_sobj: + iommufd_object_abort(ucmd->ictx, &sobj->obj); +out_ioas: + iommufd_put_object(&ioas->obj); + return rc; +} + +/* Add an additional reserved IOVA to the IOAS */ +static int iommufd_test_add_reserved(struct iommufd_ucmd *ucmd, + unsigned int mockpt_id, + unsigned long start, size_t length) +{ + struct iommufd_ioas *ioas; + int rc; + + ioas = iommufd_get_ioas(ucmd, mockpt_id); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + down_write(&ioas->iopt.iova_rwsem); + rc = iopt_reserve_iova(&ioas->iopt, start, start + length - 1, NULL); + up_write(&ioas->iopt.iova_rwsem); + iommufd_put_object(&ioas->obj); + return rc; +} + +/* Check that every pfn under each iova matches the pfn under a user VA */ +static int iommufd_test_md_check_pa(struct iommufd_ucmd *ucmd, + unsigned int mockpt_id, unsigned long iova, + size_t length, void __user *uptr) +{ + struct iommufd_hw_pagetable *hwpt; + struct mock_iommu_domain *mock; + int rc; + + if (iova % MOCK_IO_PAGE_SIZE || length % MOCK_IO_PAGE_SIZE || + (uintptr_t)uptr % MOCK_IO_PAGE_SIZE) + return -EINVAL; + + hwpt = get_md_pagetable(ucmd, mockpt_id, &mock); + if (IS_ERR(hwpt)) + return PTR_ERR(hwpt); + + for (; length; length -= MOCK_IO_PAGE_SIZE) { + struct page *pages[1]; + unsigned long pfn; + long npages; + void *ent; + + npages = get_user_pages_fast((uintptr_t)uptr & PAGE_MASK, 1, 0, + pages); + if (npages < 0) { + rc = npages; + goto out_put; + } + if (WARN_ON(npages != 1)) { + rc = -EFAULT; + goto out_put; + } + pfn = page_to_pfn(pages[0]); + put_page(pages[0]); + + ent = xa_load(&mock->pfns, iova / MOCK_IO_PAGE_SIZE); + if (!ent || + (xa_to_value(ent) & MOCK_PFN_MASK) * MOCK_IO_PAGE_SIZE != + pfn * PAGE_SIZE + ((uintptr_t)uptr % PAGE_SIZE)) { + rc = -EINVAL; + goto out_put; + } + iova += MOCK_IO_PAGE_SIZE; + uptr += MOCK_IO_PAGE_SIZE; + } + rc = 0; + +out_put: + iommufd_put_object(&hwpt->obj); + return rc; +} + +/* Check that the page ref count matches, to look for missing pin/unpins */ +static int iommufd_test_md_check_refs(struct iommufd_ucmd *ucmd, + void __user *uptr, size_t length, + unsigned int refs) +{ + if (length % PAGE_SIZE || (uintptr_t)uptr % PAGE_SIZE) + return -EINVAL; + + for (; length; length -= PAGE_SIZE) { + struct page *pages[1]; + long npages; + + npages = get_user_pages_fast((uintptr_t)uptr, 1, 0, pages); + if (npages < 0) + return npages; + if (WARN_ON(npages != 1)) + return -EFAULT; + if (!PageCompound(pages[0])) { + unsigned int count; + + count = page_ref_count(pages[0]); + if (count / GUP_PIN_COUNTING_BIAS != refs) { + put_page(pages[0]); + return -EIO; + } + } + put_page(pages[0]); + uptr += PAGE_SIZE; + } + return 0; +} + +struct selftest_access { + struct iommufd_access *access; + struct file *file; + struct mutex lock; + struct list_head items; + unsigned int next_id; + bool destroying; +}; + +struct selftest_access_item { + struct list_head items_elm; + unsigned long iova; + size_t length; + unsigned int id; +}; + +static const struct file_operations iommfd_test_staccess_fops; + +static struct selftest_access *iommufd_access_get(int fd) +{ + struct file *file; + + file = fget(fd); + if (!file) + return ERR_PTR(-EBADFD); + + if (file->f_op != &iommfd_test_staccess_fops) { + fput(file); + return ERR_PTR(-EBADFD); + } + return file->private_data; +} + +static void iommufd_test_access_unmap(void *data, unsigned long iova, + unsigned long length) +{ + unsigned long iova_last = iova + length - 1; + struct selftest_access *staccess = data; + struct selftest_access_item *item; + struct selftest_access_item *tmp; + + mutex_lock(&staccess->lock); + list_for_each_entry_safe(item, tmp, &staccess->items, items_elm) { + if (iova > item->iova + item->length - 1 || + iova_last < item->iova) + continue; + list_del(&item->items_elm); + iommufd_access_unpin_pages(staccess->access, item->iova, + item->length); + kfree(item); + } + mutex_unlock(&staccess->lock); +} + +static int iommufd_test_access_item_destroy(struct iommufd_ucmd *ucmd, + unsigned int access_id, + unsigned int item_id) +{ + struct selftest_access_item *item; + struct selftest_access *staccess; + + staccess = iommufd_access_get(access_id); + if (IS_ERR(staccess)) + return PTR_ERR(staccess); + + mutex_lock(&staccess->lock); + list_for_each_entry(item, &staccess->items, items_elm) { + if (item->id == item_id) { + list_del(&item->items_elm); + iommufd_access_unpin_pages(staccess->access, item->iova, + item->length); + mutex_unlock(&staccess->lock); + kfree(item); + fput(staccess->file); + return 0; + } + } + mutex_unlock(&staccess->lock); + fput(staccess->file); + return -ENOENT; +} + +static int iommufd_test_staccess_release(struct inode *inode, + struct file *filep) +{ + struct selftest_access *staccess = filep->private_data; + + if (staccess->access) { + iommufd_test_access_unmap(staccess, 0, ULONG_MAX); + iommufd_access_destroy(staccess->access); + } + mutex_destroy(&staccess->lock); + kfree(staccess); + return 0; +} + +static const struct iommufd_access_ops selftest_access_ops_pin = { + .needs_pin_pages = 1, + .unmap = iommufd_test_access_unmap, +}; + +static const struct iommufd_access_ops selftest_access_ops = { + .unmap = iommufd_test_access_unmap, +}; + +static const struct file_operations iommfd_test_staccess_fops = { + .release = iommufd_test_staccess_release, +}; + +static struct selftest_access *iommufd_test_alloc_access(void) +{ + struct selftest_access *staccess; + struct file *filep; + + staccess = kzalloc(sizeof(*staccess), GFP_KERNEL_ACCOUNT); + if (!staccess) + return ERR_PTR(-ENOMEM); + INIT_LIST_HEAD(&staccess->items); + mutex_init(&staccess->lock); + + filep = anon_inode_getfile("[iommufd_test_staccess]", + &iommfd_test_staccess_fops, staccess, + O_RDWR); + if (IS_ERR(filep)) { + kfree(staccess); + return ERR_CAST(filep); + } + staccess->file = filep; + return staccess; +} + +static int iommufd_test_create_access(struct iommufd_ucmd *ucmd, + unsigned int ioas_id, unsigned int flags) +{ + struct iommu_test_cmd *cmd = ucmd->cmd; + struct selftest_access *staccess; + struct iommufd_access *access; + int fdno; + int rc; + + if (flags & ~MOCK_FLAGS_ACCESS_CREATE_NEEDS_PIN_PAGES) + return -EOPNOTSUPP; + + staccess = iommufd_test_alloc_access(); + if (IS_ERR(staccess)) + return PTR_ERR(staccess); + + fdno = get_unused_fd_flags(O_CLOEXEC); + if (fdno < 0) { + rc = -ENOMEM; + goto out_free_staccess; + } + + access = iommufd_access_create( + ucmd->ictx, ioas_id, + (flags & MOCK_FLAGS_ACCESS_CREATE_NEEDS_PIN_PAGES) ? + &selftest_access_ops_pin : + &selftest_access_ops, + staccess); + if (IS_ERR(access)) { + rc = PTR_ERR(access); + goto out_put_fdno; + } + cmd->create_access.out_access_fd = fdno; + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); + if (rc) + goto out_destroy; + + staccess->access = access; + fd_install(fdno, staccess->file); + return 0; + +out_destroy: + iommufd_access_destroy(access); +out_put_fdno: + put_unused_fd(fdno); +out_free_staccess: + fput(staccess->file); + return rc; +} + +/* Check that the pages in a page array match the pages in the user VA */ +static int iommufd_test_check_pages(void __user *uptr, struct page **pages, + size_t npages) +{ + for (; npages; npages--) { + struct page *tmp_pages[1]; + long rc; + + rc = get_user_pages_fast((uintptr_t)uptr, 1, 0, tmp_pages); + if (rc < 0) + return rc; + if (WARN_ON(rc != 1)) + return -EFAULT; + put_page(tmp_pages[0]); + if (tmp_pages[0] != *pages) + return -EBADE; + pages++; + uptr += PAGE_SIZE; + } + return 0; +} + +static int iommufd_test_access_pages(struct iommufd_ucmd *ucmd, + unsigned int access_id, unsigned long iova, + size_t length, void __user *uptr, + u32 flags) +{ + struct iommu_test_cmd *cmd = ucmd->cmd; + struct selftest_access_item *item; + struct selftest_access *staccess; + struct page **pages; + size_t npages; + int rc; + + /* Prevent syzkaller from triggering a WARN_ON in kvzalloc() */ + if (length > 16*1024*1024) + return -ENOMEM; + + if (flags & ~(MOCK_FLAGS_ACCESS_WRITE | MOCK_FLAGS_ACCESS_SYZ)) + return -EOPNOTSUPP; + + staccess = iommufd_access_get(access_id); + if (IS_ERR(staccess)) + return PTR_ERR(staccess); + + if (staccess->access->ops != &selftest_access_ops_pin) { + rc = -EOPNOTSUPP; + goto out_put; + } + + if (flags & MOCK_FLAGS_ACCESS_SYZ) + iova = iommufd_test_syz_conv_iova(&staccess->access->ioas->iopt, + &cmd->access_pages.iova); + + npages = (ALIGN(iova + length, PAGE_SIZE) - + ALIGN_DOWN(iova, PAGE_SIZE)) / + PAGE_SIZE; + pages = kvcalloc(npages, sizeof(*pages), GFP_KERNEL_ACCOUNT); + if (!pages) { + rc = -ENOMEM; + goto out_put; + } + + /* + * Drivers will need to think very carefully about this locking. The + * core code can do multiple unmaps instantaneously after + * iommufd_access_pin_pages() and *all* the unmaps must not return until + * the range is unpinned. This simple implementation puts a global lock + * around the pin, which may not suit drivers that want this to be a + * performance path. drivers that get this wrong will trigger WARN_ON + * races and cause EDEADLOCK failures to userspace. + */ + mutex_lock(&staccess->lock); + rc = iommufd_access_pin_pages(staccess->access, iova, length, pages, + flags & MOCK_FLAGS_ACCESS_WRITE); + if (rc) + goto out_unlock; + + /* For syzkaller allow uptr to be NULL to skip this check */ + if (uptr) { + rc = iommufd_test_check_pages( + uptr - (iova - ALIGN_DOWN(iova, PAGE_SIZE)), pages, + npages); + if (rc) + goto out_unaccess; + } + + item = kzalloc(sizeof(*item), GFP_KERNEL_ACCOUNT); + if (!item) { + rc = -ENOMEM; + goto out_unaccess; + } + + item->iova = iova; + item->length = length; + item->id = staccess->next_id++; + list_add_tail(&item->items_elm, &staccess->items); + + cmd->access_pages.out_access_pages_id = item->id; + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); + if (rc) + goto out_free_item; + goto out_unlock; + +out_free_item: + list_del(&item->items_elm); + kfree(item); +out_unaccess: + iommufd_access_unpin_pages(staccess->access, iova, length); +out_unlock: + mutex_unlock(&staccess->lock); + kvfree(pages); +out_put: + fput(staccess->file); + return rc; +} + +static int iommufd_test_access_rw(struct iommufd_ucmd *ucmd, + unsigned int access_id, unsigned long iova, + size_t length, void __user *ubuf, + unsigned int flags) +{ + struct iommu_test_cmd *cmd = ucmd->cmd; + struct selftest_access *staccess; + void *tmp; + int rc; + + /* Prevent syzkaller from triggering a WARN_ON in kvzalloc() */ + if (length > 16*1024*1024) + return -ENOMEM; + + if (flags & ~(MOCK_ACCESS_RW_WRITE | MOCK_ACCESS_RW_SLOW_PATH | + MOCK_FLAGS_ACCESS_SYZ)) + return -EOPNOTSUPP; + + staccess = iommufd_access_get(access_id); + if (IS_ERR(staccess)) + return PTR_ERR(staccess); + + tmp = kvzalloc(length, GFP_KERNEL_ACCOUNT); + if (!tmp) { + rc = -ENOMEM; + goto out_put; + } + + if (flags & MOCK_ACCESS_RW_WRITE) { + if (copy_from_user(tmp, ubuf, length)) { + rc = -EFAULT; + goto out_free; + } + } + + if (flags & MOCK_FLAGS_ACCESS_SYZ) + iova = iommufd_test_syz_conv_iova(&staccess->access->ioas->iopt, + &cmd->access_rw.iova); + + rc = iommufd_access_rw(staccess->access, iova, tmp, length, flags); + if (rc) + goto out_free; + if (!(flags & MOCK_ACCESS_RW_WRITE)) { + if (copy_to_user(ubuf, tmp, length)) { + rc = -EFAULT; + goto out_free; + } + } + +out_free: + kvfree(tmp); +out_put: + fput(staccess->file); + return rc; +} +static_assert((unsigned int)MOCK_ACCESS_RW_WRITE == IOMMUFD_ACCESS_RW_WRITE); +static_assert((unsigned int)MOCK_ACCESS_RW_SLOW_PATH == + __IOMMUFD_ACCESS_RW_SLOW_PATH); + +void iommufd_selftest_destroy(struct iommufd_object *obj) +{ + struct selftest_obj *sobj = container_of(obj, struct selftest_obj, obj); + + switch (sobj->type) { + case TYPE_IDEV: + iommufd_device_selftest_detach(sobj->idev.ictx, + sobj->idev.hwpt); + break; + } +} + +int iommufd_test(struct iommufd_ucmd *ucmd) +{ + struct iommu_test_cmd *cmd = ucmd->cmd; + + switch (cmd->op) { + case IOMMU_TEST_OP_ADD_RESERVED: + return iommufd_test_add_reserved(ucmd, cmd->id, + cmd->add_reserved.start, + cmd->add_reserved.length); + case IOMMU_TEST_OP_MOCK_DOMAIN: + return iommufd_test_mock_domain(ucmd, cmd); + case IOMMU_TEST_OP_MD_CHECK_MAP: + return iommufd_test_md_check_pa( + ucmd, cmd->id, cmd->check_map.iova, + cmd->check_map.length, + u64_to_user_ptr(cmd->check_map.uptr)); + case IOMMU_TEST_OP_MD_CHECK_REFS: + return iommufd_test_md_check_refs( + ucmd, u64_to_user_ptr(cmd->check_refs.uptr), + cmd->check_refs.length, cmd->check_refs.refs); + case IOMMU_TEST_OP_CREATE_ACCESS: + return iommufd_test_create_access(ucmd, cmd->id, + cmd->create_access.flags); + case IOMMU_TEST_OP_ACCESS_PAGES: + return iommufd_test_access_pages( + ucmd, cmd->id, cmd->access_pages.iova, + cmd->access_pages.length, + u64_to_user_ptr(cmd->access_pages.uptr), + cmd->access_pages.flags); + case IOMMU_TEST_OP_ACCESS_RW: + return iommufd_test_access_rw( + ucmd, cmd->id, cmd->access_rw.iova, + cmd->access_rw.length, + u64_to_user_ptr(cmd->access_rw.uptr), + cmd->access_rw.flags); + case IOMMU_TEST_OP_DESTROY_ACCESS_PAGES: + return iommufd_test_access_item_destroy( + ucmd, cmd->id, cmd->destroy_access_pages.access_pages_id); + case IOMMU_TEST_OP_SET_TEMP_MEMORY_LIMIT: + /* Protect _batch_init(), can not be less than elmsz */ + if (cmd->memory_limit.limit < + sizeof(unsigned long) + sizeof(u32)) + return -EINVAL; + iommufd_test_memory_limit = cmd->memory_limit.limit; + return 0; + default: + return -EOPNOTSUPP; + } +} + +bool iommufd_should_fail(void) +{ + return should_fail(&fail_iommufd, 1); +} + +void __init iommufd_test_init(void) +{ + dbgfs_root = + fault_create_debugfs_attr("fail_iommufd", NULL, &fail_iommufd); +} + +void iommufd_test_exit(void) +{ + debugfs_remove_recursive(dbgfs_root); +} diff --git a/drivers/iommu/iommufd/vfio_compat.c b/drivers/iommu/iommufd/vfio_compat.c new file mode 100644 index 000000000000..3ceca0e8311c --- /dev/null +++ b/drivers/iommu/iommufd/vfio_compat.c @@ -0,0 +1,472 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES + */ +#include <linux/file.h> +#include <linux/interval_tree.h> +#include <linux/iommu.h> +#include <linux/iommufd.h> +#include <linux/slab.h> +#include <linux/vfio.h> +#include <uapi/linux/vfio.h> +#include <uapi/linux/iommufd.h> + +#include "iommufd_private.h" + +static struct iommufd_ioas *get_compat_ioas(struct iommufd_ctx *ictx) +{ + struct iommufd_ioas *ioas = ERR_PTR(-ENODEV); + + xa_lock(&ictx->objects); + if (!ictx->vfio_ioas || !iommufd_lock_obj(&ictx->vfio_ioas->obj)) + goto out_unlock; + ioas = ictx->vfio_ioas; +out_unlock: + xa_unlock(&ictx->objects); + return ioas; +} + +/** + * iommufd_vfio_compat_ioas_id - Return the IOAS ID that vfio should use + * @ictx: Context to operate on + * @out_ioas_id: The ioas_id the caller should use + * + * The compatibility IOAS is the IOAS that the vfio compatibility ioctls operate + * on since they do not have an IOAS ID input in their ABI. Only attaching a + * group should cause a default creation of the internal ioas, this returns the + * existing ioas if it has already been assigned somehow. + */ +int iommufd_vfio_compat_ioas_id(struct iommufd_ctx *ictx, u32 *out_ioas_id) +{ + struct iommufd_ioas *ioas = NULL; + struct iommufd_ioas *out_ioas; + + ioas = iommufd_ioas_alloc(ictx); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + + xa_lock(&ictx->objects); + if (ictx->vfio_ioas && iommufd_lock_obj(&ictx->vfio_ioas->obj)) + out_ioas = ictx->vfio_ioas; + else { + out_ioas = ioas; + ictx->vfio_ioas = ioas; + } + xa_unlock(&ictx->objects); + + *out_ioas_id = out_ioas->obj.id; + if (out_ioas != ioas) { + iommufd_put_object(&out_ioas->obj); + iommufd_object_abort(ictx, &ioas->obj); + return 0; + } + /* + * An automatically created compat IOAS is treated as a userspace + * created object. Userspace can learn the ID via IOMMU_VFIO_IOAS_GET, + * and if not manually destroyed it will be destroyed automatically + * at iommufd release. + */ + iommufd_object_finalize(ictx, &ioas->obj); + return 0; +} +EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_ioas_id, IOMMUFD_VFIO); + +int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd) +{ + struct iommu_vfio_ioas *cmd = ucmd->cmd; + struct iommufd_ioas *ioas; + + if (cmd->__reserved) + return -EOPNOTSUPP; + switch (cmd->op) { + case IOMMU_VFIO_IOAS_GET: + ioas = get_compat_ioas(ucmd->ictx); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + cmd->ioas_id = ioas->obj.id; + iommufd_put_object(&ioas->obj); + return iommufd_ucmd_respond(ucmd, sizeof(*cmd)); + + case IOMMU_VFIO_IOAS_SET: + ioas = iommufd_get_ioas(ucmd, cmd->ioas_id); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + xa_lock(&ucmd->ictx->objects); + ucmd->ictx->vfio_ioas = ioas; + xa_unlock(&ucmd->ictx->objects); + iommufd_put_object(&ioas->obj); + return 0; + + case IOMMU_VFIO_IOAS_CLEAR: + xa_lock(&ucmd->ictx->objects); + ucmd->ictx->vfio_ioas = NULL; + xa_unlock(&ucmd->ictx->objects); + return 0; + default: + return -EOPNOTSUPP; + } +} + +static int iommufd_vfio_map_dma(struct iommufd_ctx *ictx, unsigned int cmd, + void __user *arg) +{ + u32 supported_flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE; + size_t minsz = offsetofend(struct vfio_iommu_type1_dma_map, size); + struct vfio_iommu_type1_dma_map map; + int iommu_prot = IOMMU_CACHE; + struct iommufd_ioas *ioas; + unsigned long iova; + int rc; + + if (copy_from_user(&map, arg, minsz)) + return -EFAULT; + + if (map.argsz < minsz || map.flags & ~supported_flags) + return -EINVAL; + + if (map.flags & VFIO_DMA_MAP_FLAG_READ) + iommu_prot |= IOMMU_READ; + if (map.flags & VFIO_DMA_MAP_FLAG_WRITE) + iommu_prot |= IOMMU_WRITE; + + ioas = get_compat_ioas(ictx); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + + /* + * Maps created through the legacy interface always use VFIO compatible + * rlimit accounting. If the user wishes to use the faster user based + * rlimit accounting then they must use the new interface. + */ + iova = map.iova; + rc = iopt_map_user_pages(ictx, &ioas->iopt, &iova, u64_to_user_ptr(map.vaddr), + map.size, iommu_prot, 0); + iommufd_put_object(&ioas->obj); + return rc; +} + +static int iommufd_vfio_unmap_dma(struct iommufd_ctx *ictx, unsigned int cmd, + void __user *arg) +{ + size_t minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size); + /* + * VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP is obsoleted by the new + * dirty tracking direction: + * https://lore.kernel.org/kvm/20220731125503.142683-1-yishaih@nvidia.com/ + * https://lore.kernel.org/kvm/20220428210933.3583-1-joao.m.martins@oracle.com/ + */ + u32 supported_flags = VFIO_DMA_UNMAP_FLAG_ALL; + struct vfio_iommu_type1_dma_unmap unmap; + unsigned long unmapped = 0; + struct iommufd_ioas *ioas; + int rc; + + if (copy_from_user(&unmap, arg, minsz)) + return -EFAULT; + + if (unmap.argsz < minsz || unmap.flags & ~supported_flags) + return -EINVAL; + + ioas = get_compat_ioas(ictx); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + + if (unmap.flags & VFIO_DMA_UNMAP_FLAG_ALL) { + if (unmap.iova != 0 || unmap.size != 0) { + rc = -EINVAL; + goto err_put; + } + rc = iopt_unmap_all(&ioas->iopt, &unmapped); + } else { + if (READ_ONCE(ioas->iopt.disable_large_pages)) { + /* + * Create cuts at the start and last of the requested + * range. If the start IOVA is 0 then it doesn't need to + * be cut. + */ + unsigned long iovas[] = { unmap.iova + unmap.size - 1, + unmap.iova - 1 }; + + rc = iopt_cut_iova(&ioas->iopt, iovas, + unmap.iova ? 2 : 1); + if (rc) + goto err_put; + } + rc = iopt_unmap_iova(&ioas->iopt, unmap.iova, unmap.size, + &unmapped); + } + unmap.size = unmapped; + if (copy_to_user(arg, &unmap, minsz)) + rc = -EFAULT; + +err_put: + iommufd_put_object(&ioas->obj); + return rc; +} + +static int iommufd_vfio_cc_iommu(struct iommufd_ctx *ictx) +{ + struct iommufd_hw_pagetable *hwpt; + struct iommufd_ioas *ioas; + int rc = 1; + + ioas = get_compat_ioas(ictx); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + + mutex_lock(&ioas->mutex); + list_for_each_entry(hwpt, &ioas->hwpt_list, hwpt_item) { + if (!hwpt->enforce_cache_coherency) { + rc = 0; + break; + } + } + mutex_unlock(&ioas->mutex); + + iommufd_put_object(&ioas->obj); + return rc; +} + +static int iommufd_vfio_check_extension(struct iommufd_ctx *ictx, + unsigned long type) +{ + switch (type) { + case VFIO_TYPE1_IOMMU: + case VFIO_TYPE1v2_IOMMU: + case VFIO_UNMAP_ALL: + return 1; + + case VFIO_DMA_CC_IOMMU: + return iommufd_vfio_cc_iommu(ictx); + + /* + * This is obsolete, and to be removed from VFIO. It was an incomplete + * idea that got merged. + * https://lore.kernel.org/kvm/0-v1-0093c9b0e345+19-vfio_no_nesting_jgg@nvidia.com/ + */ + case VFIO_TYPE1_NESTING_IOMMU: + return 0; + + /* + * VFIO_DMA_MAP_FLAG_VADDR + * https://lore.kernel.org/kvm/1611939252-7240-1-git-send-email-steven.sistare@oracle.com/ + * https://lore.kernel.org/all/Yz777bJZjTyLrHEQ@nvidia.com/ + * + * It is hard to see how this could be implemented safely. + */ + case VFIO_UPDATE_VADDR: + default: + return 0; + } +} + +static int iommufd_vfio_set_iommu(struct iommufd_ctx *ictx, unsigned long type) +{ + struct iommufd_ioas *ioas = NULL; + int rc = 0; + + if (type != VFIO_TYPE1_IOMMU && type != VFIO_TYPE1v2_IOMMU) + return -EINVAL; + + /* VFIO fails the set_iommu if there is no group */ + ioas = get_compat_ioas(ictx); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + + /* + * The difference between TYPE1 and TYPE1v2 is the ability to unmap in + * the middle of mapped ranges. This is complicated by huge page support + * which creates single large IOPTEs that cannot be split by the iommu + * driver. TYPE1 is very old at this point and likely nothing uses it, + * however it is simple enough to emulate by simply disabling the + * problematic large IOPTEs. Then we can safely unmap within any range. + */ + if (type == VFIO_TYPE1_IOMMU) + rc = iopt_disable_large_pages(&ioas->iopt); + iommufd_put_object(&ioas->obj); + return rc; +} + +static unsigned long iommufd_get_pagesizes(struct iommufd_ioas *ioas) +{ + struct io_pagetable *iopt = &ioas->iopt; + unsigned long pgsize_bitmap = ULONG_MAX; + struct iommu_domain *domain; + unsigned long index; + + down_read(&iopt->domains_rwsem); + xa_for_each(&iopt->domains, index, domain) + pgsize_bitmap &= domain->pgsize_bitmap; + + /* See vfio_update_pgsize_bitmap() */ + if (pgsize_bitmap & ~PAGE_MASK) { + pgsize_bitmap &= PAGE_MASK; + pgsize_bitmap |= PAGE_SIZE; + } + pgsize_bitmap = max(pgsize_bitmap, ioas->iopt.iova_alignment); + up_read(&iopt->domains_rwsem); + return pgsize_bitmap; +} + +static int iommufd_fill_cap_iova(struct iommufd_ioas *ioas, + struct vfio_info_cap_header __user *cur, + size_t avail) +{ + struct vfio_iommu_type1_info_cap_iova_range __user *ucap_iovas = + container_of(cur, + struct vfio_iommu_type1_info_cap_iova_range __user, + header); + struct vfio_iommu_type1_info_cap_iova_range cap_iovas = { + .header = { + .id = VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE, + .version = 1, + }, + }; + struct interval_tree_span_iter span; + + interval_tree_for_each_span(&span, &ioas->iopt.reserved_itree, 0, + ULONG_MAX) { + struct vfio_iova_range range; + + if (!span.is_hole) + continue; + range.start = span.start_hole; + range.end = span.last_hole; + if (avail >= struct_size(&cap_iovas, iova_ranges, + cap_iovas.nr_iovas + 1) && + copy_to_user(&ucap_iovas->iova_ranges[cap_iovas.nr_iovas], + &range, sizeof(range))) + return -EFAULT; + cap_iovas.nr_iovas++; + } + if (avail >= struct_size(&cap_iovas, iova_ranges, cap_iovas.nr_iovas) && + copy_to_user(ucap_iovas, &cap_iovas, sizeof(cap_iovas))) + return -EFAULT; + return struct_size(&cap_iovas, iova_ranges, cap_iovas.nr_iovas); +} + +static int iommufd_fill_cap_dma_avail(struct iommufd_ioas *ioas, + struct vfio_info_cap_header __user *cur, + size_t avail) +{ + struct vfio_iommu_type1_info_dma_avail cap_dma = { + .header = { + .id = VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL, + .version = 1, + }, + /* + * iommufd's limit is based on the cgroup's memory limit. + * Normally vfio would return U16_MAX here, and provide a module + * parameter to adjust it. Since S390 qemu userspace actually + * pays attention and needs a value bigger than U16_MAX return + * U32_MAX. + */ + .avail = U32_MAX, + }; + + if (avail >= sizeof(cap_dma) && + copy_to_user(cur, &cap_dma, sizeof(cap_dma))) + return -EFAULT; + return sizeof(cap_dma); +} + +static int iommufd_vfio_iommu_get_info(struct iommufd_ctx *ictx, + void __user *arg) +{ + typedef int (*fill_cap_fn)(struct iommufd_ioas *ioas, + struct vfio_info_cap_header __user *cur, + size_t avail); + static const fill_cap_fn fill_fns[] = { + iommufd_fill_cap_dma_avail, + iommufd_fill_cap_iova, + }; + size_t minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes); + struct vfio_info_cap_header __user *last_cap = NULL; + struct vfio_iommu_type1_info info; + struct iommufd_ioas *ioas; + size_t total_cap_size; + int rc; + int i; + + if (copy_from_user(&info, arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz) + return -EINVAL; + minsz = min_t(size_t, info.argsz, sizeof(info)); + + ioas = get_compat_ioas(ictx); + if (IS_ERR(ioas)) + return PTR_ERR(ioas); + + info.flags = VFIO_IOMMU_INFO_PGSIZES; + info.iova_pgsizes = iommufd_get_pagesizes(ioas); + info.cap_offset = 0; + + down_read(&ioas->iopt.iova_rwsem); + total_cap_size = sizeof(info); + for (i = 0; i != ARRAY_SIZE(fill_fns); i++) { + int cap_size; + + if (info.argsz > total_cap_size) + cap_size = fill_fns[i](ioas, arg + total_cap_size, + info.argsz - total_cap_size); + else + cap_size = fill_fns[i](ioas, NULL, 0); + if (cap_size < 0) { + rc = cap_size; + goto out_put; + } + if (last_cap && info.argsz >= total_cap_size && + put_user(total_cap_size, &last_cap->next)) { + rc = -EFAULT; + goto out_put; + } + last_cap = arg + total_cap_size; + total_cap_size += cap_size; + } + + /* + * If the user did not provide enough space then only some caps are + * returned and the argsz will be updated to the correct amount to get + * all caps. + */ + if (info.argsz >= total_cap_size) + info.cap_offset = sizeof(info); + info.argsz = total_cap_size; + info.flags |= VFIO_IOMMU_INFO_CAPS; + if (copy_to_user(arg, &info, minsz)) { + rc = -EFAULT; + goto out_put; + } + rc = 0; + +out_put: + up_read(&ioas->iopt.iova_rwsem); + iommufd_put_object(&ioas->obj); + return rc; +} + +int iommufd_vfio_ioctl(struct iommufd_ctx *ictx, unsigned int cmd, + unsigned long arg) +{ + void __user *uarg = (void __user *)arg; + + switch (cmd) { + case VFIO_GET_API_VERSION: + return VFIO_API_VERSION; + case VFIO_SET_IOMMU: + return iommufd_vfio_set_iommu(ictx, arg); + case VFIO_CHECK_EXTENSION: + return iommufd_vfio_check_extension(ictx, arg); + case VFIO_IOMMU_GET_INFO: + return iommufd_vfio_iommu_get_info(ictx, uarg); + case VFIO_IOMMU_MAP_DMA: + return iommufd_vfio_map_dma(ictx, cmd, uarg); + case VFIO_IOMMU_UNMAP_DMA: + return iommufd_vfio_unmap_dma(ictx, cmd, uarg); + case VFIO_IOMMU_DIRTY_PAGES: + default: + return -ENOIOCTLCMD; + } + return -ENOIOCTLCMD; +} diff --git a/drivers/iommu/ipmmu-vmsa.c b/drivers/iommu/ipmmu-vmsa.c index 3b30c0752274..22230cc15dcd 100644 --- a/drivers/iommu/ipmmu-vmsa.c +++ b/drivers/iommu/ipmmu-vmsa.c @@ -628,8 +628,6 @@ static int ipmmu_attach_device(struct iommu_domain *io_domain, * Something is wrong, we can't attach two devices using * different IOMMUs to the same domain. */ - dev_err(dev, "Can't attach IPMMU %s to domain on IPMMU %s\n", - dev_name(mmu->dev), dev_name(domain->mmu->dev)); ret = -EINVAL; } else dev_info(dev, "Reusing IPMMU context %u\n", domain->context_id); diff --git a/drivers/iommu/mtk_iommu.c b/drivers/iommu/mtk_iommu.c index 2ab2ecfe01f8..b383c8327f9c 100644 --- a/drivers/iommu/mtk_iommu.c +++ b/drivers/iommu/mtk_iommu.c @@ -609,7 +609,7 @@ static int mtk_iommu_domain_finalise(struct mtk_iommu_domain *dom, dom->iop = alloc_io_pgtable_ops(ARM_V7S, &dom->cfg, data); if (!dom->iop) { dev_err(data->dev, "Failed to alloc io pgtable\n"); - return -EINVAL; + return -ENOMEM; } /* Update our support page sizes bitmap */ @@ -668,7 +668,7 @@ static int mtk_iommu_attach_device(struct iommu_domain *domain, ret = mtk_iommu_domain_finalise(dom, frstdata, region_id); if (ret) { mutex_unlock(&dom->mutex); - return -ENODEV; + return ret; } dom->bank = &data->bank[bankid]; } diff --git a/drivers/iommu/omap-iommu.c b/drivers/iommu/omap-iommu.c index 07ee2600113c..2fd7702c6709 100644 --- a/drivers/iommu/omap-iommu.c +++ b/drivers/iommu/omap-iommu.c @@ -1414,7 +1414,7 @@ static int omap_iommu_attach_init(struct device *dev, odomain->num_iommus = omap_iommu_count(dev); if (!odomain->num_iommus) - return -EINVAL; + return -ENODEV; odomain->iommus = kcalloc(odomain->num_iommus, sizeof(*iommu), GFP_ATOMIC); @@ -1464,7 +1464,7 @@ omap_iommu_attach_dev(struct iommu_domain *domain, struct device *dev) if (!arch_data || !arch_data->iommu_dev) { dev_err(dev, "device doesn't have an associated iommu\n"); - return -EINVAL; + return -ENODEV; } spin_lock(&omap_domain->lock); @@ -1472,7 +1472,7 @@ omap_iommu_attach_dev(struct iommu_domain *domain, struct device *dev) /* only a single client device can be attached to a domain */ if (omap_domain->dev) { dev_err(dev, "iommu domain is already attached\n"); - ret = -EBUSY; + ret = -EINVAL; goto out; } diff --git a/drivers/iommu/sprd-iommu.c b/drivers/iommu/sprd-iommu.c index fadd2c907222..e02793375598 100644 --- a/drivers/iommu/sprd-iommu.c +++ b/drivers/iommu/sprd-iommu.c @@ -237,10 +237,8 @@ static int sprd_iommu_attach_device(struct iommu_domain *domain, struct sprd_iommu_domain *dom = to_sprd_domain(domain); size_t pgt_size = sprd_iommu_pgt_size(domain); - if (dom->sdev) { - pr_err("There's already a device attached to this domain.\n"); + if (dom->sdev) return -EINVAL; - } dom->pgt_va = dma_alloc_coherent(sdev->dev, pgt_size, &dom->pgt_pa, GFP_KERNEL); if (!dom->pgt_va) diff --git a/drivers/iommu/tegra-gart.c b/drivers/iommu/tegra-gart.c index e5ca3cf1a949..ed53279d1106 100644 --- a/drivers/iommu/tegra-gart.c +++ b/drivers/iommu/tegra-gart.c @@ -112,7 +112,7 @@ static int gart_iommu_attach_dev(struct iommu_domain *domain, spin_lock(&gart->dom_lock); if (gart->active_domain && gart->active_domain != domain) { - ret = -EBUSY; + ret = -EINVAL; } else if (dev_iommu_priv_get(dev) != domain) { dev_iommu_priv_set(dev, domain); gart->active_domain = domain; diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c index 8b1b5c270e50..5b8fe9bfa9a5 100644 --- a/drivers/iommu/virtio-iommu.c +++ b/drivers/iommu/virtio-iommu.c @@ -670,7 +670,7 @@ static int viommu_domain_finalise(struct viommu_endpoint *vdev, dev_err(vdev->dev, "granule 0x%lx larger than system page size 0x%lx\n", viommu_page_size, PAGE_SIZE); - return -EINVAL; + return -ENODEV; } ret = ida_alloc_range(&viommu->domain_ids, viommu->first_domain, @@ -697,7 +697,7 @@ static int viommu_domain_finalise(struct viommu_endpoint *vdev, if (ret) { ida_free(&viommu->domain_ids, vdomain->id); vdomain->viommu = NULL; - return -EOPNOTSUPP; + return ret; } } @@ -734,8 +734,7 @@ static int viommu_attach_dev(struct iommu_domain *domain, struct device *dev) */ ret = viommu_domain_finalise(vdev, domain); } else if (vdomain->viommu != vdev->viommu) { - dev_err(dev, "cannot attach to foreign vIOMMU\n"); - ret = -EXDEV; + ret = -EINVAL; } mutex_unlock(&vdomain->mutex); diff --git a/drivers/misc/uacce/uacce.c b/drivers/misc/uacce/uacce.c index b70a013139c7..905eff1f840e 100644 --- a/drivers/misc/uacce/uacce.c +++ b/drivers/misc/uacce/uacce.c @@ -108,7 +108,7 @@ static int uacce_bind_queue(struct uacce_device *uacce, struct uacce_queue *q) if (!(uacce->flags & UACCE_DEV_SVA)) return 0; - handle = iommu_sva_bind_device(uacce->parent, current->mm, NULL); + handle = iommu_sva_bind_device(uacce->parent, current->mm); if (IS_ERR(handle)) return PTR_ERR(handle); diff --git a/drivers/pci/ats.c b/drivers/pci/ats.c index c967ad6e2626..f9cc2e10b676 100644 --- a/drivers/pci/ats.c +++ b/drivers/pci/ats.c @@ -382,6 +382,9 @@ int pci_enable_pasid(struct pci_dev *pdev, int features) if (!pasid) return -EINVAL; + if (!pci_acs_path_enabled(pdev, NULL, PCI_ACS_RR | PCI_ACS_UF)) + return -EINVAL; + pci_read_config_word(pdev, pasid + PCI_PASID_CAP, &supported); supported &= PCI_PASID_CAP_EXEC | PCI_PASID_CAP_PRIV; diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c index 6ae4d012d800..560453d99c24 100644 --- a/drivers/s390/cio/vfio_ccw_ops.c +++ b/drivers/s390/cio/vfio_ccw_ops.c @@ -588,6 +588,9 @@ static const struct vfio_device_ops vfio_ccw_dev_ops = { .ioctl = vfio_ccw_mdev_ioctl, .request = vfio_ccw_mdev_request, .dma_unmap = vfio_ccw_dma_unmap, + .bind_iommufd = vfio_iommufd_emulated_bind, + .unbind_iommufd = vfio_iommufd_emulated_unbind, + .attach_ioas = vfio_iommufd_emulated_attach_ioas, }; struct mdev_driver vfio_ccw_mdev_driver = { diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c index 0b4cc8c597ae..68eeb25fb661 100644 --- a/drivers/s390/crypto/vfio_ap_ops.c +++ b/drivers/s390/crypto/vfio_ap_ops.c @@ -1535,13 +1535,29 @@ static int vfio_ap_mdev_set_kvm(struct ap_matrix_mdev *matrix_mdev, return 0; } +static void unmap_iova(struct ap_matrix_mdev *matrix_mdev, u64 iova, u64 length) +{ + struct ap_queue_table *qtable = &matrix_mdev->qtable; + struct vfio_ap_queue *q; + int loop_cursor; + + hash_for_each(qtable->queues, loop_cursor, q, mdev_qnode) { + if (q->saved_iova >= iova && q->saved_iova < iova + length) + vfio_ap_irq_disable(q); + } +} + static void vfio_ap_mdev_dma_unmap(struct vfio_device *vdev, u64 iova, u64 length) { struct ap_matrix_mdev *matrix_mdev = container_of(vdev, struct ap_matrix_mdev, vdev); - vfio_unpin_pages(&matrix_mdev->vdev, iova, 1); + mutex_lock(&matrix_dev->mdevs_lock); + + unmap_iova(matrix_mdev, iova, length); + + mutex_unlock(&matrix_dev->mdevs_lock); } /** @@ -1789,6 +1805,9 @@ static const struct vfio_device_ops vfio_ap_matrix_dev_ops = { .close_device = vfio_ap_mdev_close_device, .ioctl = vfio_ap_mdev_ioctl, .dma_unmap = vfio_ap_mdev_dma_unmap, + .bind_iommufd = vfio_iommufd_emulated_bind, + .unbind_iommufd = vfio_iommufd_emulated_unbind, + .attach_ioas = vfio_iommufd_emulated_attach_ioas, }; static struct mdev_driver vfio_ap_matrix_driver = { diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig index 86c381ceb9a1..286c1663bd75 100644 --- a/drivers/vfio/Kconfig +++ b/drivers/vfio/Kconfig @@ -2,8 +2,9 @@ menuconfig VFIO tristate "VFIO Non-Privileged userspace driver framework" select IOMMU_API - select VFIO_IOMMU_TYPE1 if MMU && (X86 || S390 || ARM || ARM64) + depends on IOMMUFD || !IOMMUFD select INTERVAL_TREE + select VFIO_CONTAINER if IOMMUFD=n help VFIO provides a framework for secure userspace device drivers. See Documentation/driver-api/vfio.rst for more details. @@ -11,6 +12,18 @@ menuconfig VFIO If you don't know what to do here, say N. if VFIO +config VFIO_CONTAINER + bool "Support for the VFIO container /dev/vfio/vfio" + select VFIO_IOMMU_TYPE1 if MMU && (X86 || S390 || ARM || ARM64) + default y + help + The VFIO container is the classic interface to VFIO for establishing + IOMMU mappings. If N is selected here then IOMMUFD must be used to + manage the mappings. + + Unless testing IOMMUFD say Y here. + +if VFIO_CONTAINER config VFIO_IOMMU_TYPE1 tristate default n @@ -20,16 +33,6 @@ config VFIO_IOMMU_SPAPR_TCE depends on SPAPR_TCE_IOMMU default VFIO -config VFIO_SPAPR_EEH - tristate - depends on EEH && VFIO_IOMMU_SPAPR_TCE - default VFIO - -config VFIO_VIRQFD - tristate - select EVENTFD - default n - config VFIO_NOIOMMU bool "VFIO No-IOMMU support" help @@ -43,6 +46,17 @@ config VFIO_NOIOMMU this mode since there is no IOMMU to provide DMA translation. If you don't know what to do here, say N. +endif + +config VFIO_SPAPR_EEH + tristate + depends on EEH && VFIO_IOMMU_SPAPR_TCE + default VFIO + +config VFIO_VIRQFD + tristate + select EVENTFD + default n source "drivers/vfio/pci/Kconfig" source "drivers/vfio/platform/Kconfig" diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile index b693a1169286..3783db7e8082 100644 --- a/drivers/vfio/Makefile +++ b/drivers/vfio/Makefile @@ -4,8 +4,10 @@ vfio_virqfd-y := virqfd.o obj-$(CONFIG_VFIO) += vfio.o vfio-y += vfio_main.o \ - iova_bitmap.o \ - container.o + group.o \ + iova_bitmap.o +vfio-$(CONFIG_IOMMUFD) += iommufd.o +vfio-$(CONFIG_VFIO_CONTAINER) += container.o obj-$(CONFIG_VFIO_VIRQFD) += vfio_virqfd.o obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o diff --git a/drivers/vfio/container.c b/drivers/vfio/container.c index d74164abbf40..b7a9560ab25e 100644 --- a/drivers/vfio/container.c +++ b/drivers/vfio/container.c @@ -188,8 +188,9 @@ void vfio_device_container_unregister(struct vfio_device *device) device->group->container->iommu_data, device); } -long vfio_container_ioctl_check_extension(struct vfio_container *container, - unsigned long arg) +static long +vfio_container_ioctl_check_extension(struct vfio_container *container, + unsigned long arg) { struct vfio_iommu_driver *driver; long ret = 0; @@ -511,14 +512,15 @@ void vfio_group_detach_container(struct vfio_group *group) vfio_container_put(container); } -int vfio_device_assign_container(struct vfio_device *device) +int vfio_group_use_container(struct vfio_group *group) { - struct vfio_group *group = device->group; - lockdep_assert_held(&group->group_lock); - if (!group->container || !group->container->iommu_driver || - WARN_ON(!group->container_users)) + /* + * The container fd has been assigned with VFIO_GROUP_SET_CONTAINER but + * VFIO_SET_IOMMU hasn't been done yet. + */ + if (!group->container->iommu_driver) return -EINVAL; if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO)) @@ -529,122 +531,56 @@ int vfio_device_assign_container(struct vfio_device *device) return 0; } -void vfio_device_unassign_container(struct vfio_device *device) +void vfio_group_unuse_container(struct vfio_group *group) { - mutex_lock(&device->group->group_lock); - WARN_ON(device->group->container_users <= 1); - device->group->container_users--; - fput(device->group->opened_file); - mutex_unlock(&device->group->group_lock); + lockdep_assert_held(&group->group_lock); + + WARN_ON(group->container_users <= 1); + group->container_users--; + fput(group->opened_file); } -/* - * Pin contiguous user pages and return their associated host pages for local - * domain only. - * @device [in] : device - * @iova [in] : starting IOVA of user pages to be pinned. - * @npage [in] : count of pages to be pinned. This count should not - * be greater than VFIO_PIN_PAGES_MAX_ENTRIES. - * @prot [in] : protection flags - * @pages[out] : array of host pages - * Return error or number of pages pinned. - * - * A driver may only call this function if the vfio_device was created - * by vfio_register_emulated_iommu_dev(). - */ -int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova, - int npage, int prot, struct page **pages) +int vfio_device_container_pin_pages(struct vfio_device *device, + dma_addr_t iova, int npage, + int prot, struct page **pages) { - struct vfio_container *container; - struct vfio_group *group = device->group; - struct vfio_iommu_driver *driver; - int ret; - - if (!pages || !npage || !vfio_assert_device_open(device)) - return -EINVAL; + struct vfio_container *container = device->group->container; + struct iommu_group *iommu_group = device->group->iommu_group; + struct vfio_iommu_driver *driver = container->iommu_driver; if (npage > VFIO_PIN_PAGES_MAX_ENTRIES) return -E2BIG; - /* group->container cannot change while a vfio device is open */ - container = group->container; - driver = container->iommu_driver; - if (likely(driver && driver->ops->pin_pages)) - ret = driver->ops->pin_pages(container->iommu_data, - group->iommu_group, iova, - npage, prot, pages); - else - ret = -ENOTTY; - - return ret; + if (unlikely(!driver || !driver->ops->pin_pages)) + return -ENOTTY; + return driver->ops->pin_pages(container->iommu_data, iommu_group, iova, + npage, prot, pages); } -EXPORT_SYMBOL(vfio_pin_pages); -/* - * Unpin contiguous host pages for local domain only. - * @device [in] : device - * @iova [in] : starting address of user pages to be unpinned. - * @npage [in] : count of pages to be unpinned. This count should not - * be greater than VFIO_PIN_PAGES_MAX_ENTRIES. - */ -void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage) +void vfio_device_container_unpin_pages(struct vfio_device *device, + dma_addr_t iova, int npage) { - struct vfio_container *container; - struct vfio_iommu_driver *driver; + struct vfio_container *container = device->group->container; if (WARN_ON(npage <= 0 || npage > VFIO_PIN_PAGES_MAX_ENTRIES)) return; - if (WARN_ON(!vfio_assert_device_open(device))) - return; - - /* group->container cannot change while a vfio device is open */ - container = device->group->container; - driver = container->iommu_driver; - - driver->ops->unpin_pages(container->iommu_data, iova, npage); + container->iommu_driver->ops->unpin_pages(container->iommu_data, iova, + npage); } -EXPORT_SYMBOL(vfio_unpin_pages); -/* - * This interface allows the CPUs to perform some sort of virtual DMA on - * behalf of the device. - * - * CPUs read/write from/into a range of IOVAs pointing to user space memory - * into/from a kernel buffer. - * - * As the read/write of user space memory is conducted via the CPUs and is - * not a real device DMA, it is not necessary to pin the user space memory. - * - * @device [in] : VFIO device - * @iova [in] : base IOVA of a user space buffer - * @data [in] : pointer to kernel buffer - * @len [in] : kernel buffer length - * @write : indicate read or write - * Return error code on failure or 0 on success. - */ -int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data, - size_t len, bool write) +int vfio_device_container_dma_rw(struct vfio_device *device, + dma_addr_t iova, void *data, + size_t len, bool write) { - struct vfio_container *container; - struct vfio_iommu_driver *driver; - int ret = 0; - - if (!data || len <= 0 || !vfio_assert_device_open(device)) - return -EINVAL; - - /* group->container cannot change while a vfio device is open */ - container = device->group->container; - driver = container->iommu_driver; + struct vfio_container *container = device->group->container; + struct vfio_iommu_driver *driver = container->iommu_driver; - if (likely(driver && driver->ops->dma_rw)) - ret = driver->ops->dma_rw(container->iommu_data, - iova, data, len, write); - else - ret = -ENOTTY; - return ret; + if (unlikely(!driver || !driver->ops->dma_rw)) + return -ENOTTY; + return driver->ops->dma_rw(container->iommu_data, iova, data, len, + write); } -EXPORT_SYMBOL(vfio_dma_rw); int __init vfio_container_init(void) { @@ -678,3 +614,6 @@ void vfio_container_cleanup(void) misc_deregister(&vfio_dev); mutex_destroy(&vfio.iommu_drivers_lock); } + +MODULE_ALIAS_MISCDEV(VFIO_MINOR); +MODULE_ALIAS("devname:vfio/vfio"); diff --git a/drivers/vfio/fsl-mc/vfio_fsl_mc.c b/drivers/vfio/fsl-mc/vfio_fsl_mc.c index b16874e913e4..5cd4bb476440 100644 --- a/drivers/vfio/fsl-mc/vfio_fsl_mc.c +++ b/drivers/vfio/fsl-mc/vfio_fsl_mc.c @@ -592,6 +592,9 @@ static const struct vfio_device_ops vfio_fsl_mc_ops = { .read = vfio_fsl_mc_read, .write = vfio_fsl_mc_write, .mmap = vfio_fsl_mc_mmap, + .bind_iommufd = vfio_iommufd_physical_bind, + .unbind_iommufd = vfio_iommufd_physical_unbind, + .attach_ioas = vfio_iommufd_physical_attach_ioas, }; static struct fsl_mc_driver vfio_fsl_mc_driver = { diff --git a/drivers/vfio/group.c b/drivers/vfio/group.c new file mode 100644 index 000000000000..c5d8bf10495e --- /dev/null +++ b/drivers/vfio/group.c @@ -0,0 +1,877 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * VFIO core + * + * Copyright (C) 2012 Red Hat, Inc. All rights reserved. + * Author: Alex Williamson <alex.williamson@redhat.com> + * + * Derived from original vfio: + * Copyright 2010 Cisco Systems, Inc. All rights reserved. + * Author: Tom Lyon, pugs@cisco.com + */ + +#include <linux/vfio.h> +#include <linux/iommufd.h> +#include <linux/anon_inodes.h> +#include "vfio.h" + +static struct vfio { + struct class *class; + struct list_head group_list; + struct mutex group_lock; /* locks group_list */ + struct ida group_ida; + dev_t group_devt; +} vfio; + +static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group, + char *buf) +{ + struct vfio_device *it, *device = ERR_PTR(-ENODEV); + + mutex_lock(&group->device_lock); + list_for_each_entry(it, &group->device_list, group_next) { + int ret; + + if (it->ops->match) { + ret = it->ops->match(it, buf); + if (ret < 0) { + device = ERR_PTR(ret); + break; + } + } else { + ret = !strcmp(dev_name(it->dev), buf); + } + + if (ret && vfio_device_try_get_registration(it)) { + device = it; + break; + } + } + mutex_unlock(&group->device_lock); + + return device; +} + +/* + * VFIO Group fd, /dev/vfio/$GROUP + */ +static bool vfio_group_has_iommu(struct vfio_group *group) +{ + lockdep_assert_held(&group->group_lock); + /* + * There can only be users if there is a container, and if there is a + * container there must be users. + */ + WARN_ON(!group->container != !group->container_users); + + return group->container || group->iommufd; +} + +/* + * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or + * if there was no container to unset. Since the ioctl is called on + * the group, we know that still exists, therefore the only valid + * transition here is 1->0. + */ +static int vfio_group_ioctl_unset_container(struct vfio_group *group) +{ + int ret = 0; + + mutex_lock(&group->group_lock); + if (!vfio_group_has_iommu(group)) { + ret = -EINVAL; + goto out_unlock; + } + if (group->container) { + if (group->container_users != 1) { + ret = -EBUSY; + goto out_unlock; + } + vfio_group_detach_container(group); + } + if (group->iommufd) { + iommufd_ctx_put(group->iommufd); + group->iommufd = NULL; + } + +out_unlock: + mutex_unlock(&group->group_lock); + return ret; +} + +static int vfio_group_ioctl_set_container(struct vfio_group *group, + int __user *arg) +{ + struct vfio_container *container; + struct iommufd_ctx *iommufd; + struct fd f; + int ret; + int fd; + + if (get_user(fd, arg)) + return -EFAULT; + + f = fdget(fd); + if (!f.file) + return -EBADF; + + mutex_lock(&group->group_lock); + if (vfio_group_has_iommu(group)) { + ret = -EINVAL; + goto out_unlock; + } + if (!group->iommu_group) { + ret = -ENODEV; + goto out_unlock; + } + + container = vfio_container_from_file(f.file); + if (container) { + ret = vfio_container_attach_group(container, group); + goto out_unlock; + } + + iommufd = iommufd_ctx_from_file(f.file); + if (!IS_ERR(iommufd)) { + u32 ioas_id; + + ret = iommufd_vfio_compat_ioas_id(iommufd, &ioas_id); + if (ret) { + iommufd_ctx_put(group->iommufd); + goto out_unlock; + } + + group->iommufd = iommufd; + goto out_unlock; + } + + /* The FD passed is not recognized. */ + ret = -EBADFD; + +out_unlock: + mutex_unlock(&group->group_lock); + fdput(f); + return ret; +} + +static int vfio_device_group_open(struct vfio_device *device) +{ + int ret; + + mutex_lock(&device->group->group_lock); + if (!vfio_group_has_iommu(device->group)) { + ret = -EINVAL; + goto out_unlock; + } + + /* + * Here we pass the KVM pointer with the group under the lock. If the + * device driver will use it, it must obtain a reference and release it + * during close_device. + */ + ret = vfio_device_open(device, device->group->iommufd, + device->group->kvm); + +out_unlock: + mutex_unlock(&device->group->group_lock); + return ret; +} + +void vfio_device_group_close(struct vfio_device *device) +{ + mutex_lock(&device->group->group_lock); + vfio_device_close(device, device->group->iommufd); + mutex_unlock(&device->group->group_lock); +} + +static struct file *vfio_device_open_file(struct vfio_device *device) +{ + struct file *filep; + int ret; + + ret = vfio_device_group_open(device); + if (ret) + goto err_out; + + /* + * We can't use anon_inode_getfd() because we need to modify + * the f_mode flags directly to allow more than just ioctls + */ + filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops, + device, O_RDWR); + if (IS_ERR(filep)) { + ret = PTR_ERR(filep); + goto err_close_device; + } + + /* + * TODO: add an anon_inode interface to do this. + * Appears to be missing by lack of need rather than + * explicitly prevented. Now there's need. + */ + filep->f_mode |= (FMODE_PREAD | FMODE_PWRITE); + + if (device->group->type == VFIO_NO_IOMMU) + dev_warn(device->dev, "vfio-noiommu device opened by user " + "(%s:%d)\n", current->comm, task_pid_nr(current)); + /* + * On success the ref of device is moved to the file and + * put in vfio_device_fops_release() + */ + return filep; + +err_close_device: + vfio_device_group_close(device); +err_out: + return ERR_PTR(ret); +} + +static int vfio_group_ioctl_get_device_fd(struct vfio_group *group, + char __user *arg) +{ + struct vfio_device *device; + struct file *filep; + char *buf; + int fdno; + int ret; + + buf = strndup_user(arg, PAGE_SIZE); + if (IS_ERR(buf)) + return PTR_ERR(buf); + + device = vfio_device_get_from_name(group, buf); + kfree(buf); + if (IS_ERR(device)) + return PTR_ERR(device); + + fdno = get_unused_fd_flags(O_CLOEXEC); + if (fdno < 0) { + ret = fdno; + goto err_put_device; + } + + filep = vfio_device_open_file(device); + if (IS_ERR(filep)) { + ret = PTR_ERR(filep); + goto err_put_fdno; + } + + fd_install(fdno, filep); + return fdno; + +err_put_fdno: + put_unused_fd(fdno); +err_put_device: + vfio_device_put_registration(device); + return ret; +} + +static int vfio_group_ioctl_get_status(struct vfio_group *group, + struct vfio_group_status __user *arg) +{ + unsigned long minsz = offsetofend(struct vfio_group_status, flags); + struct vfio_group_status status; + + if (copy_from_user(&status, arg, minsz)) + return -EFAULT; + + if (status.argsz < minsz) + return -EINVAL; + + status.flags = 0; + + mutex_lock(&group->group_lock); + if (!group->iommu_group) { + mutex_unlock(&group->group_lock); + return -ENODEV; + } + + /* + * With the container FD the iommu_group_claim_dma_owner() is done + * during SET_CONTAINER but for IOMMFD this is done during + * VFIO_GROUP_GET_DEVICE_FD. Meaning that with iommufd + * VFIO_GROUP_FLAGS_VIABLE could be set but GET_DEVICE_FD will fail due + * to viability. + */ + if (vfio_group_has_iommu(group)) + status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET | + VFIO_GROUP_FLAGS_VIABLE; + else if (!iommu_group_dma_owner_claimed(group->iommu_group)) + status.flags |= VFIO_GROUP_FLAGS_VIABLE; + mutex_unlock(&group->group_lock); + + if (copy_to_user(arg, &status, minsz)) + return -EFAULT; + return 0; +} + +static long vfio_group_fops_unl_ioctl(struct file *filep, + unsigned int cmd, unsigned long arg) +{ + struct vfio_group *group = filep->private_data; + void __user *uarg = (void __user *)arg; + + switch (cmd) { + case VFIO_GROUP_GET_DEVICE_FD: + return vfio_group_ioctl_get_device_fd(group, uarg); + case VFIO_GROUP_GET_STATUS: + return vfio_group_ioctl_get_status(group, uarg); + case VFIO_GROUP_SET_CONTAINER: + return vfio_group_ioctl_set_container(group, uarg); + case VFIO_GROUP_UNSET_CONTAINER: + return vfio_group_ioctl_unset_container(group); + default: + return -ENOTTY; + } +} + +static int vfio_group_fops_open(struct inode *inode, struct file *filep) +{ + struct vfio_group *group = + container_of(inode->i_cdev, struct vfio_group, cdev); + int ret; + + mutex_lock(&group->group_lock); + + /* + * drivers can be zero if this races with vfio_device_remove_group(), it + * will be stable at 0 under the group rwsem + */ + if (refcount_read(&group->drivers) == 0) { + ret = -ENODEV; + goto out_unlock; + } + + if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO)) { + ret = -EPERM; + goto out_unlock; + } + + /* + * Do we need multiple instances of the group open? Seems not. + */ + if (group->opened_file) { + ret = -EBUSY; + goto out_unlock; + } + group->opened_file = filep; + filep->private_data = group; + ret = 0; +out_unlock: + mutex_unlock(&group->group_lock); + return ret; +} + +static int vfio_group_fops_release(struct inode *inode, struct file *filep) +{ + struct vfio_group *group = filep->private_data; + + filep->private_data = NULL; + + mutex_lock(&group->group_lock); + /* + * Device FDs hold a group file reference, therefore the group release + * is only called when there are no open devices. + */ + WARN_ON(group->notifier.head); + if (group->container) + vfio_group_detach_container(group); + if (group->iommufd) { + iommufd_ctx_put(group->iommufd); + group->iommufd = NULL; + } + group->opened_file = NULL; + mutex_unlock(&group->group_lock); + return 0; +} + +static const struct file_operations vfio_group_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = vfio_group_fops_unl_ioctl, + .compat_ioctl = compat_ptr_ioctl, + .open = vfio_group_fops_open, + .release = vfio_group_fops_release, +}; + +/* + * Group objects - create, release, get, put, search + */ +static struct vfio_group * +vfio_group_find_from_iommu(struct iommu_group *iommu_group) +{ + struct vfio_group *group; + + lockdep_assert_held(&vfio.group_lock); + + /* + * group->iommu_group from the vfio.group_list cannot be NULL + * under the vfio.group_lock. + */ + list_for_each_entry(group, &vfio.group_list, vfio_next) { + if (group->iommu_group == iommu_group) + return group; + } + return NULL; +} + +static void vfio_group_release(struct device *dev) +{ + struct vfio_group *group = container_of(dev, struct vfio_group, dev); + + mutex_destroy(&group->device_lock); + mutex_destroy(&group->group_lock); + WARN_ON(group->iommu_group); + ida_free(&vfio.group_ida, MINOR(group->dev.devt)); + kfree(group); +} + +static struct vfio_group *vfio_group_alloc(struct iommu_group *iommu_group, + enum vfio_group_type type) +{ + struct vfio_group *group; + int minor; + + group = kzalloc(sizeof(*group), GFP_KERNEL); + if (!group) + return ERR_PTR(-ENOMEM); + + minor = ida_alloc_max(&vfio.group_ida, MINORMASK, GFP_KERNEL); + if (minor < 0) { + kfree(group); + return ERR_PTR(minor); + } + + device_initialize(&group->dev); + group->dev.devt = MKDEV(MAJOR(vfio.group_devt), minor); + group->dev.class = vfio.class; + group->dev.release = vfio_group_release; + cdev_init(&group->cdev, &vfio_group_fops); + group->cdev.owner = THIS_MODULE; + + refcount_set(&group->drivers, 1); + mutex_init(&group->group_lock); + INIT_LIST_HEAD(&group->device_list); + mutex_init(&group->device_lock); + group->iommu_group = iommu_group; + /* put in vfio_group_release() */ + iommu_group_ref_get(iommu_group); + group->type = type; + BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier); + + return group; +} + +static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group, + enum vfio_group_type type) +{ + struct vfio_group *group; + struct vfio_group *ret; + int err; + + lockdep_assert_held(&vfio.group_lock); + + group = vfio_group_alloc(iommu_group, type); + if (IS_ERR(group)) + return group; + + err = dev_set_name(&group->dev, "%s%d", + group->type == VFIO_NO_IOMMU ? "noiommu-" : "", + iommu_group_id(iommu_group)); + if (err) { + ret = ERR_PTR(err); + goto err_put; + } + + err = cdev_device_add(&group->cdev, &group->dev); + if (err) { + ret = ERR_PTR(err); + goto err_put; + } + + list_add(&group->vfio_next, &vfio.group_list); + + return group; + +err_put: + put_device(&group->dev); + return ret; +} + +static struct vfio_group *vfio_noiommu_group_alloc(struct device *dev, + enum vfio_group_type type) +{ + struct iommu_group *iommu_group; + struct vfio_group *group; + int ret; + + iommu_group = iommu_group_alloc(); + if (IS_ERR(iommu_group)) + return ERR_CAST(iommu_group); + + ret = iommu_group_set_name(iommu_group, "vfio-noiommu"); + if (ret) + goto out_put_group; + ret = iommu_group_add_device(iommu_group, dev); + if (ret) + goto out_put_group; + + mutex_lock(&vfio.group_lock); + group = vfio_create_group(iommu_group, type); + mutex_unlock(&vfio.group_lock); + if (IS_ERR(group)) { + ret = PTR_ERR(group); + goto out_remove_device; + } + iommu_group_put(iommu_group); + return group; + +out_remove_device: + iommu_group_remove_device(dev); +out_put_group: + iommu_group_put(iommu_group); + return ERR_PTR(ret); +} + +static bool vfio_group_has_device(struct vfio_group *group, struct device *dev) +{ + struct vfio_device *device; + + mutex_lock(&group->device_lock); + list_for_each_entry(device, &group->device_list, group_next) { + if (device->dev == dev) { + mutex_unlock(&group->device_lock); + return true; + } + } + mutex_unlock(&group->device_lock); + return false; +} + +static struct vfio_group *vfio_group_find_or_alloc(struct device *dev) +{ + struct iommu_group *iommu_group; + struct vfio_group *group; + + iommu_group = iommu_group_get(dev); + if (!iommu_group && vfio_noiommu) { + /* + * With noiommu enabled, create an IOMMU group for devices that + * don't already have one, implying no IOMMU hardware/driver + * exists. Taint the kernel because we're about to give a DMA + * capable device to a user without IOMMU protection. + */ + group = vfio_noiommu_group_alloc(dev, VFIO_NO_IOMMU); + if (!IS_ERR(group)) { + add_taint(TAINT_USER, LOCKDEP_STILL_OK); + dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n"); + } + return group; + } + + if (!iommu_group) + return ERR_PTR(-EINVAL); + + /* + * VFIO always sets IOMMU_CACHE because we offer no way for userspace to + * restore cache coherency. It has to be checked here because it is only + * valid for cases where we are using iommu groups. + */ + if (!device_iommu_capable(dev, IOMMU_CAP_CACHE_COHERENCY)) { + iommu_group_put(iommu_group); + return ERR_PTR(-EINVAL); + } + + mutex_lock(&vfio.group_lock); + group = vfio_group_find_from_iommu(iommu_group); + if (group) { + if (WARN_ON(vfio_group_has_device(group, dev))) + group = ERR_PTR(-EINVAL); + else + refcount_inc(&group->drivers); + } else { + group = vfio_create_group(iommu_group, VFIO_IOMMU); + } + mutex_unlock(&vfio.group_lock); + + /* The vfio_group holds a reference to the iommu_group */ + iommu_group_put(iommu_group); + return group; +} + +int vfio_device_set_group(struct vfio_device *device, + enum vfio_group_type type) +{ + struct vfio_group *group; + + if (type == VFIO_IOMMU) + group = vfio_group_find_or_alloc(device->dev); + else + group = vfio_noiommu_group_alloc(device->dev, type); + + if (IS_ERR(group)) + return PTR_ERR(group); + + /* Our reference on group is moved to the device */ + device->group = group; + return 0; +} + +void vfio_device_remove_group(struct vfio_device *device) +{ + struct vfio_group *group = device->group; + struct iommu_group *iommu_group; + + if (group->type == VFIO_NO_IOMMU || group->type == VFIO_EMULATED_IOMMU) + iommu_group_remove_device(device->dev); + + /* Pairs with vfio_create_group() / vfio_group_get_from_iommu() */ + if (!refcount_dec_and_mutex_lock(&group->drivers, &vfio.group_lock)) + return; + list_del(&group->vfio_next); + + /* + * We could concurrently probe another driver in the group that might + * race vfio_device_remove_group() with vfio_get_group(), so we have to + * ensure that the sysfs is all cleaned up under lock otherwise the + * cdev_device_add() will fail due to the name aready existing. + */ + cdev_device_del(&group->cdev, &group->dev); + + mutex_lock(&group->group_lock); + /* + * These data structures all have paired operations that can only be + * undone when the caller holds a live reference on the device. Since + * all pairs must be undone these WARN_ON's indicate some caller did not + * properly hold the group reference. + */ + WARN_ON(!list_empty(&group->device_list)); + WARN_ON(group->notifier.head); + + /* + * Revoke all users of group->iommu_group. At this point we know there + * are no devices active because we are unplugging the last one. Setting + * iommu_group to NULL blocks all new users. + */ + if (group->container) + vfio_group_detach_container(group); + iommu_group = group->iommu_group; + group->iommu_group = NULL; + mutex_unlock(&group->group_lock); + mutex_unlock(&vfio.group_lock); + + iommu_group_put(iommu_group); + put_device(&group->dev); +} + +void vfio_device_group_register(struct vfio_device *device) +{ + mutex_lock(&device->group->device_lock); + list_add(&device->group_next, &device->group->device_list); + mutex_unlock(&device->group->device_lock); +} + +void vfio_device_group_unregister(struct vfio_device *device) +{ + mutex_lock(&device->group->device_lock); + list_del(&device->group_next); + mutex_unlock(&device->group->device_lock); +} + +int vfio_device_group_use_iommu(struct vfio_device *device) +{ + struct vfio_group *group = device->group; + int ret = 0; + + lockdep_assert_held(&group->group_lock); + + if (WARN_ON(!group->container)) + return -EINVAL; + + ret = vfio_group_use_container(group); + if (ret) + return ret; + vfio_device_container_register(device); + return 0; +} + +void vfio_device_group_unuse_iommu(struct vfio_device *device) +{ + struct vfio_group *group = device->group; + + lockdep_assert_held(&group->group_lock); + + if (WARN_ON(!group->container)) + return; + + vfio_device_container_unregister(device); + vfio_group_unuse_container(group); +} + +bool vfio_device_has_container(struct vfio_device *device) +{ + return device->group->container; +} + +/** + * vfio_file_iommu_group - Return the struct iommu_group for the vfio group file + * @file: VFIO group file + * + * The returned iommu_group is valid as long as a ref is held on the file. This + * returns a reference on the group. This function is deprecated, only the SPAPR + * path in kvm should call it. + */ +struct iommu_group *vfio_file_iommu_group(struct file *file) +{ + struct vfio_group *group = file->private_data; + struct iommu_group *iommu_group = NULL; + + if (!IS_ENABLED(CONFIG_SPAPR_TCE_IOMMU)) + return NULL; + + if (!vfio_file_is_group(file)) + return NULL; + + mutex_lock(&group->group_lock); + if (group->iommu_group) { + iommu_group = group->iommu_group; + iommu_group_ref_get(iommu_group); + } + mutex_unlock(&group->group_lock); + return iommu_group; +} +EXPORT_SYMBOL_GPL(vfio_file_iommu_group); + +/** + * vfio_file_is_group - True if the file is usable with VFIO aPIS + * @file: VFIO group file + */ +bool vfio_file_is_group(struct file *file) +{ + return file->f_op == &vfio_group_fops; +} +EXPORT_SYMBOL_GPL(vfio_file_is_group); + +/** + * vfio_file_enforced_coherent - True if the DMA associated with the VFIO file + * is always CPU cache coherent + * @file: VFIO group file + * + * Enforced coherency means that the IOMMU ignores things like the PCIe no-snoop + * bit in DMA transactions. A return of false indicates that the user has + * rights to access additional instructions such as wbinvd on x86. + */ +bool vfio_file_enforced_coherent(struct file *file) +{ + struct vfio_group *group = file->private_data; + struct vfio_device *device; + bool ret = true; + + if (!vfio_file_is_group(file)) + return true; + + /* + * If the device does not have IOMMU_CAP_ENFORCE_CACHE_COHERENCY then + * any domain later attached to it will also not support it. If the cap + * is set then the iommu_domain eventually attached to the device/group + * must use a domain with enforce_cache_coherency(). + */ + mutex_lock(&group->device_lock); + list_for_each_entry(device, &group->device_list, group_next) { + if (!device_iommu_capable(device->dev, + IOMMU_CAP_ENFORCE_CACHE_COHERENCY)) { + ret = false; + break; + } + } + mutex_unlock(&group->device_lock); + return ret; +} +EXPORT_SYMBOL_GPL(vfio_file_enforced_coherent); + +/** + * vfio_file_set_kvm - Link a kvm with VFIO drivers + * @file: VFIO group file + * @kvm: KVM to link + * + * When a VFIO device is first opened the KVM will be available in + * device->kvm if one was associated with the group. + */ +void vfio_file_set_kvm(struct file *file, struct kvm *kvm) +{ + struct vfio_group *group = file->private_data; + + if (!vfio_file_is_group(file)) + return; + + mutex_lock(&group->group_lock); + group->kvm = kvm; + mutex_unlock(&group->group_lock); +} +EXPORT_SYMBOL_GPL(vfio_file_set_kvm); + +/** + * vfio_file_has_dev - True if the VFIO file is a handle for device + * @file: VFIO file to check + * @device: Device that must be part of the file + * + * Returns true if given file has permission to manipulate the given device. + */ +bool vfio_file_has_dev(struct file *file, struct vfio_device *device) +{ + struct vfio_group *group = file->private_data; + + if (!vfio_file_is_group(file)) + return false; + + return group == device->group; +} +EXPORT_SYMBOL_GPL(vfio_file_has_dev); + +static char *vfio_devnode(struct device *dev, umode_t *mode) +{ + return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev)); +} + +int __init vfio_group_init(void) +{ + int ret; + + ida_init(&vfio.group_ida); + mutex_init(&vfio.group_lock); + INIT_LIST_HEAD(&vfio.group_list); + + ret = vfio_container_init(); + if (ret) + return ret; + + /* /dev/vfio/$GROUP */ + vfio.class = class_create(THIS_MODULE, "vfio"); + if (IS_ERR(vfio.class)) { + ret = PTR_ERR(vfio.class); + goto err_group_class; + } + + vfio.class->devnode = vfio_devnode; + + ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK + 1, "vfio"); + if (ret) + goto err_alloc_chrdev; + return 0; + +err_alloc_chrdev: + class_destroy(vfio.class); + vfio.class = NULL; +err_group_class: + vfio_container_cleanup(); + return ret; +} + +void vfio_group_cleanup(void) +{ + WARN_ON(!list_empty(&vfio.group_list)); + ida_destroy(&vfio.group_ida); + unregister_chrdev_region(vfio.group_devt, MINORMASK + 1); + class_destroy(vfio.class); + vfio.class = NULL; + vfio_container_cleanup(); +} diff --git a/drivers/vfio/iommufd.c b/drivers/vfio/iommufd.c new file mode 100644 index 000000000000..4f82a6fa7c6c --- /dev/null +++ b/drivers/vfio/iommufd.c @@ -0,0 +1,158 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES + */ +#include <linux/vfio.h> +#include <linux/iommufd.h> + +#include "vfio.h" + +MODULE_IMPORT_NS(IOMMUFD); +MODULE_IMPORT_NS(IOMMUFD_VFIO); + +int vfio_iommufd_bind(struct vfio_device *vdev, struct iommufd_ctx *ictx) +{ + u32 ioas_id; + u32 device_id; + int ret; + + lockdep_assert_held(&vdev->dev_set->lock); + + /* + * If the driver doesn't provide this op then it means the device does + * not do DMA at all. So nothing to do. + */ + if (!vdev->ops->bind_iommufd) + return 0; + + ret = vdev->ops->bind_iommufd(vdev, ictx, &device_id); + if (ret) + return ret; + + ret = iommufd_vfio_compat_ioas_id(ictx, &ioas_id); + if (ret) + goto err_unbind; + ret = vdev->ops->attach_ioas(vdev, &ioas_id); + if (ret) + goto err_unbind; + + /* + * The legacy path has no way to return the device id or the selected + * pt_id + */ + return 0; + +err_unbind: + if (vdev->ops->unbind_iommufd) + vdev->ops->unbind_iommufd(vdev); + return ret; +} + +void vfio_iommufd_unbind(struct vfio_device *vdev) +{ + lockdep_assert_held(&vdev->dev_set->lock); + + if (vdev->ops->unbind_iommufd) + vdev->ops->unbind_iommufd(vdev); +} + +/* + * The physical standard ops mean that the iommufd_device is bound to the + * physical device vdev->dev that was provided to vfio_init_group_dev(). Drivers + * using this ops set should call vfio_register_group_dev() + */ +int vfio_iommufd_physical_bind(struct vfio_device *vdev, + struct iommufd_ctx *ictx, u32 *out_device_id) +{ + struct iommufd_device *idev; + + idev = iommufd_device_bind(ictx, vdev->dev, out_device_id); + if (IS_ERR(idev)) + return PTR_ERR(idev); + vdev->iommufd_device = idev; + return 0; +} +EXPORT_SYMBOL_GPL(vfio_iommufd_physical_bind); + +void vfio_iommufd_physical_unbind(struct vfio_device *vdev) +{ + lockdep_assert_held(&vdev->dev_set->lock); + + if (vdev->iommufd_attached) { + iommufd_device_detach(vdev->iommufd_device); + vdev->iommufd_attached = false; + } + iommufd_device_unbind(vdev->iommufd_device); + vdev->iommufd_device = NULL; +} +EXPORT_SYMBOL_GPL(vfio_iommufd_physical_unbind); + +int vfio_iommufd_physical_attach_ioas(struct vfio_device *vdev, u32 *pt_id) +{ + int rc; + + rc = iommufd_device_attach(vdev->iommufd_device, pt_id); + if (rc) + return rc; + vdev->iommufd_attached = true; + return 0; +} +EXPORT_SYMBOL_GPL(vfio_iommufd_physical_attach_ioas); + +/* + * The emulated standard ops mean that vfio_device is going to use the + * "mdev path" and will call vfio_pin_pages()/vfio_dma_rw(). Drivers using this + * ops set should call vfio_register_emulated_iommu_dev(). + */ + +static void vfio_emulated_unmap(void *data, unsigned long iova, + unsigned long length) +{ + struct vfio_device *vdev = data; + + vdev->ops->dma_unmap(vdev, iova, length); +} + +static const struct iommufd_access_ops vfio_user_ops = { + .needs_pin_pages = 1, + .unmap = vfio_emulated_unmap, +}; + +int vfio_iommufd_emulated_bind(struct vfio_device *vdev, + struct iommufd_ctx *ictx, u32 *out_device_id) +{ + lockdep_assert_held(&vdev->dev_set->lock); + + vdev->iommufd_ictx = ictx; + iommufd_ctx_get(ictx); + return 0; +} +EXPORT_SYMBOL_GPL(vfio_iommufd_emulated_bind); + +void vfio_iommufd_emulated_unbind(struct vfio_device *vdev) +{ + lockdep_assert_held(&vdev->dev_set->lock); + + if (vdev->iommufd_access) { + iommufd_access_destroy(vdev->iommufd_access); + vdev->iommufd_access = NULL; + } + iommufd_ctx_put(vdev->iommufd_ictx); + vdev->iommufd_ictx = NULL; +} +EXPORT_SYMBOL_GPL(vfio_iommufd_emulated_unbind); + +int vfio_iommufd_emulated_attach_ioas(struct vfio_device *vdev, u32 *pt_id) +{ + struct iommufd_access *user; + + lockdep_assert_held(&vdev->dev_set->lock); + + user = iommufd_access_create(vdev->iommufd_ictx, *pt_id, &vfio_user_ops, + vdev); + if (IS_ERR(user)) + return PTR_ERR(user); + vdev->iommufd_access = user; + return 0; +} +EXPORT_SYMBOL_GPL(vfio_iommufd_emulated_attach_ioas); diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index 39eeca18a0f7..40019b11c5a9 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -1246,6 +1246,9 @@ static const struct vfio_device_ops hisi_acc_vfio_pci_migrn_ops = { .mmap = hisi_acc_vfio_pci_mmap, .request = vfio_pci_core_request, .match = vfio_pci_core_match, + .bind_iommufd = vfio_iommufd_physical_bind, + .unbind_iommufd = vfio_iommufd_physical_unbind, + .attach_ioas = vfio_iommufd_physical_attach_ioas, }; static const struct vfio_device_ops hisi_acc_vfio_pci_ops = { @@ -1261,6 +1264,9 @@ static const struct vfio_device_ops hisi_acc_vfio_pci_ops = { .mmap = vfio_pci_core_mmap, .request = vfio_pci_core_request, .match = vfio_pci_core_match, + .bind_iommufd = vfio_iommufd_physical_bind, + .unbind_iommufd = vfio_iommufd_physical_unbind, + .attach_ioas = vfio_iommufd_physical_attach_ioas, }; static int hisi_acc_vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index fd6ccb8454a2..32d1f38d351e 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -623,6 +623,9 @@ static const struct vfio_device_ops mlx5vf_pci_ops = { .mmap = vfio_pci_core_mmap, .request = vfio_pci_core_request, .match = vfio_pci_core_match, + .bind_iommufd = vfio_iommufd_physical_bind, + .unbind_iommufd = vfio_iommufd_physical_unbind, + .attach_ioas = vfio_iommufd_physical_attach_ioas, }; static int mlx5vf_pci_probe(struct pci_dev *pdev, diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 1d4919edfbde..29091ee2e984 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -138,6 +138,9 @@ static const struct vfio_device_ops vfio_pci_ops = { .mmap = vfio_pci_core_mmap, .request = vfio_pci_core_request, .match = vfio_pci_core_match, + .bind_iommufd = vfio_iommufd_physical_bind, + .unbind_iommufd = vfio_iommufd_physical_unbind, + .attach_ioas = vfio_iommufd_physical_attach_ioas, }; static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) diff --git a/drivers/vfio/platform/vfio_amba.c b/drivers/vfio/platform/vfio_amba.c index eaea63e5294c..5a046098d0bd 100644 --- a/drivers/vfio/platform/vfio_amba.c +++ b/drivers/vfio/platform/vfio_amba.c @@ -117,6 +117,9 @@ static const struct vfio_device_ops vfio_amba_ops = { .read = vfio_platform_read, .write = vfio_platform_write, .mmap = vfio_platform_mmap, + .bind_iommufd = vfio_iommufd_physical_bind, + .unbind_iommufd = vfio_iommufd_physical_unbind, + .attach_ioas = vfio_iommufd_physical_attach_ioas, }; static const struct amba_id pl330_ids[] = { diff --git a/drivers/vfio/platform/vfio_platform.c b/drivers/vfio/platform/vfio_platform.c index 82cedcebfd90..b87c3b708783 100644 --- a/drivers/vfio/platform/vfio_platform.c +++ b/drivers/vfio/platform/vfio_platform.c @@ -106,6 +106,9 @@ static const struct vfio_device_ops vfio_platform_ops = { .read = vfio_platform_read, .write = vfio_platform_write, .mmap = vfio_platform_mmap, + .bind_iommufd = vfio_iommufd_physical_bind, + .unbind_iommufd = vfio_iommufd_physical_unbind, + .attach_ioas = vfio_iommufd_physical_attach_ioas, }; static struct platform_driver vfio_platform_driver = { diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h index bcad54bbab08..2e05418fd18d 100644 --- a/drivers/vfio/vfio.h +++ b/drivers/vfio/vfio.h @@ -6,14 +6,25 @@ #ifndef __VFIO_VFIO_H__ #define __VFIO_VFIO_H__ +#include <linux/file.h> #include <linux/device.h> #include <linux/cdev.h> #include <linux/module.h> +struct iommufd_ctx; struct iommu_group; struct vfio_device; struct vfio_container; +void vfio_device_put_registration(struct vfio_device *device); +bool vfio_device_try_get_registration(struct vfio_device *device); +int vfio_device_open(struct vfio_device *device, + struct iommufd_ctx *iommufd, struct kvm *kvm); +void vfio_device_close(struct vfio_device *device, + struct iommufd_ctx *iommufd); + +extern const struct file_operations vfio_device_fops; + enum vfio_group_type { /* * Physical device with IOMMU backing. @@ -54,14 +65,30 @@ struct vfio_group { struct list_head device_list; struct mutex device_lock; struct list_head vfio_next; +#if IS_ENABLED(CONFIG_VFIO_CONTAINER) struct list_head container_next; +#endif enum vfio_group_type type; struct mutex group_lock; struct kvm *kvm; struct file *opened_file; struct blocking_notifier_head notifier; + struct iommufd_ctx *iommufd; }; +int vfio_device_set_group(struct vfio_device *device, + enum vfio_group_type type); +void vfio_device_remove_group(struct vfio_device *device); +void vfio_device_group_register(struct vfio_device *device); +void vfio_device_group_unregister(struct vfio_device *device); +int vfio_device_group_use_iommu(struct vfio_device *device); +void vfio_device_group_unuse_iommu(struct vfio_device *device); +void vfio_device_group_close(struct vfio_device *device); +bool vfio_device_has_container(struct vfio_device *device); +int __init vfio_group_init(void); +void vfio_group_cleanup(void); + +#if IS_ENABLED(CONFIG_VFIO_CONTAINER) /* events for the backend driver notify callback */ enum vfio_iommu_notify_type { VFIO_IOMMU_CONTAINER_CLOSE = 0, @@ -109,20 +136,101 @@ struct vfio_iommu_driver { int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops); void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops); -bool vfio_assert_device_open(struct vfio_device *device); - struct vfio_container *vfio_container_from_file(struct file *filep); -int vfio_device_assign_container(struct vfio_device *device); -void vfio_device_unassign_container(struct vfio_device *device); +int vfio_group_use_container(struct vfio_group *group); +void vfio_group_unuse_container(struct vfio_group *group); int vfio_container_attach_group(struct vfio_container *container, struct vfio_group *group); void vfio_group_detach_container(struct vfio_group *group); void vfio_device_container_register(struct vfio_device *device); void vfio_device_container_unregister(struct vfio_device *device); -long vfio_container_ioctl_check_extension(struct vfio_container *container, - unsigned long arg); +int vfio_device_container_pin_pages(struct vfio_device *device, + dma_addr_t iova, int npage, + int prot, struct page **pages); +void vfio_device_container_unpin_pages(struct vfio_device *device, + dma_addr_t iova, int npage); +int vfio_device_container_dma_rw(struct vfio_device *device, + dma_addr_t iova, void *data, + size_t len, bool write); + int __init vfio_container_init(void); void vfio_container_cleanup(void); +#else +static inline struct vfio_container * +vfio_container_from_file(struct file *filep) +{ + return NULL; +} + +static inline int vfio_group_use_container(struct vfio_group *group) +{ + return -EOPNOTSUPP; +} + +static inline void vfio_group_unuse_container(struct vfio_group *group) +{ +} + +static inline int vfio_container_attach_group(struct vfio_container *container, + struct vfio_group *group) +{ + return -EOPNOTSUPP; +} + +static inline void vfio_group_detach_container(struct vfio_group *group) +{ +} + +static inline void vfio_device_container_register(struct vfio_device *device) +{ +} + +static inline void vfio_device_container_unregister(struct vfio_device *device) +{ +} + +static inline int vfio_device_container_pin_pages(struct vfio_device *device, + dma_addr_t iova, int npage, + int prot, struct page **pages) +{ + return -EOPNOTSUPP; +} + +static inline void vfio_device_container_unpin_pages(struct vfio_device *device, + dma_addr_t iova, int npage) +{ +} + +static inline int vfio_device_container_dma_rw(struct vfio_device *device, + dma_addr_t iova, void *data, + size_t len, bool write) +{ + return -EOPNOTSUPP; +} + +static inline int vfio_container_init(void) +{ + return 0; +} +static inline void vfio_container_cleanup(void) +{ +} +#endif + +#if IS_ENABLED(CONFIG_IOMMUFD) +int vfio_iommufd_bind(struct vfio_device *device, struct iommufd_ctx *ictx); +void vfio_iommufd_unbind(struct vfio_device *device); +#else +static inline int vfio_iommufd_bind(struct vfio_device *device, + struct iommufd_ctx *ictx) +{ + return -EOPNOTSUPP; +} + +static inline void vfio_iommufd_unbind(struct vfio_device *device) +{ +} +#endif #ifdef CONFIG_VFIO_NOIOMMU extern bool vfio_noiommu __read_mostly; diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 6e8804fe0095..e21ff965141e 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -13,8 +13,6 @@ #include <linux/cdev.h> #include <linux/compat.h> #include <linux/device.h> -#include <linux/file.h> -#include <linux/anon_inodes.h> #include <linux/fs.h> #include <linux/idr.h> #include <linux/iommu.h> @@ -35,6 +33,7 @@ #include <linux/pm_runtime.h> #include <linux/interval_tree.h> #include <linux/iova_bitmap.h> +#include <linux/iommufd.h> #include "vfio.h" #define DRIVER_VERSION "0.3" @@ -42,17 +41,11 @@ #define DRIVER_DESC "VFIO - User Level meta-driver" static struct vfio { - struct class *class; - struct list_head group_list; - struct mutex group_lock; /* locks group_list */ - struct ida group_ida; - dev_t group_devt; struct class *device_class; struct ida device_ida; } vfio; static DEFINE_XARRAY(vfio_device_set_xa); -static const struct file_operations vfio_group_fops; int vfio_assign_device_set(struct vfio_device *device, void *set_id) { @@ -139,207 +132,20 @@ unsigned int vfio_device_set_open_count(struct vfio_device_set *dev_set) EXPORT_SYMBOL_GPL(vfio_device_set_open_count); /* - * Group objects - create, release, get, put, search - */ -static struct vfio_group * -__vfio_group_get_from_iommu(struct iommu_group *iommu_group) -{ - struct vfio_group *group; - - /* - * group->iommu_group from the vfio.group_list cannot be NULL - * under the vfio.group_lock. - */ - list_for_each_entry(group, &vfio.group_list, vfio_next) { - if (group->iommu_group == iommu_group) { - refcount_inc(&group->drivers); - return group; - } - } - return NULL; -} - -static struct vfio_group * -vfio_group_get_from_iommu(struct iommu_group *iommu_group) -{ - struct vfio_group *group; - - mutex_lock(&vfio.group_lock); - group = __vfio_group_get_from_iommu(iommu_group); - mutex_unlock(&vfio.group_lock); - return group; -} - -static void vfio_group_release(struct device *dev) -{ - struct vfio_group *group = container_of(dev, struct vfio_group, dev); - - mutex_destroy(&group->device_lock); - mutex_destroy(&group->group_lock); - WARN_ON(group->iommu_group); - ida_free(&vfio.group_ida, MINOR(group->dev.devt)); - kfree(group); -} - -static struct vfio_group *vfio_group_alloc(struct iommu_group *iommu_group, - enum vfio_group_type type) -{ - struct vfio_group *group; - int minor; - - group = kzalloc(sizeof(*group), GFP_KERNEL); - if (!group) - return ERR_PTR(-ENOMEM); - - minor = ida_alloc_max(&vfio.group_ida, MINORMASK, GFP_KERNEL); - if (minor < 0) { - kfree(group); - return ERR_PTR(minor); - } - - device_initialize(&group->dev); - group->dev.devt = MKDEV(MAJOR(vfio.group_devt), minor); - group->dev.class = vfio.class; - group->dev.release = vfio_group_release; - cdev_init(&group->cdev, &vfio_group_fops); - group->cdev.owner = THIS_MODULE; - - refcount_set(&group->drivers, 1); - mutex_init(&group->group_lock); - INIT_LIST_HEAD(&group->device_list); - mutex_init(&group->device_lock); - group->iommu_group = iommu_group; - /* put in vfio_group_release() */ - iommu_group_ref_get(iommu_group); - group->type = type; - BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier); - - return group; -} - -static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group, - enum vfio_group_type type) -{ - struct vfio_group *group; - struct vfio_group *ret; - int err; - - group = vfio_group_alloc(iommu_group, type); - if (IS_ERR(group)) - return group; - - err = dev_set_name(&group->dev, "%s%d", - group->type == VFIO_NO_IOMMU ? "noiommu-" : "", - iommu_group_id(iommu_group)); - if (err) { - ret = ERR_PTR(err); - goto err_put; - } - - mutex_lock(&vfio.group_lock); - - /* Did we race creating this group? */ - ret = __vfio_group_get_from_iommu(iommu_group); - if (ret) - goto err_unlock; - - err = cdev_device_add(&group->cdev, &group->dev); - if (err) { - ret = ERR_PTR(err); - goto err_unlock; - } - - list_add(&group->vfio_next, &vfio.group_list); - - mutex_unlock(&vfio.group_lock); - return group; - -err_unlock: - mutex_unlock(&vfio.group_lock); -err_put: - put_device(&group->dev); - return ret; -} - -static void vfio_device_remove_group(struct vfio_device *device) -{ - struct vfio_group *group = device->group; - struct iommu_group *iommu_group; - - if (group->type == VFIO_NO_IOMMU || group->type == VFIO_EMULATED_IOMMU) - iommu_group_remove_device(device->dev); - - /* Pairs with vfio_create_group() / vfio_group_get_from_iommu() */ - if (!refcount_dec_and_mutex_lock(&group->drivers, &vfio.group_lock)) - return; - list_del(&group->vfio_next); - - /* - * We could concurrently probe another driver in the group that might - * race vfio_device_remove_group() with vfio_get_group(), so we have to - * ensure that the sysfs is all cleaned up under lock otherwise the - * cdev_device_add() will fail due to the name aready existing. - */ - cdev_device_del(&group->cdev, &group->dev); - - mutex_lock(&group->group_lock); - /* - * These data structures all have paired operations that can only be - * undone when the caller holds a live reference on the device. Since - * all pairs must be undone these WARN_ON's indicate some caller did not - * properly hold the group reference. - */ - WARN_ON(!list_empty(&group->device_list)); - WARN_ON(group->notifier.head); - - /* - * Revoke all users of group->iommu_group. At this point we know there - * are no devices active because we are unplugging the last one. Setting - * iommu_group to NULL blocks all new users. - */ - if (group->container) - vfio_group_detach_container(group); - iommu_group = group->iommu_group; - group->iommu_group = NULL; - mutex_unlock(&group->group_lock); - mutex_unlock(&vfio.group_lock); - - iommu_group_put(iommu_group); - put_device(&group->dev); -} - -/* * Device objects - create, release, get, put, search */ /* Device reference always implies a group reference */ -static void vfio_device_put_registration(struct vfio_device *device) +void vfio_device_put_registration(struct vfio_device *device) { if (refcount_dec_and_test(&device->refcount)) complete(&device->comp); } -static bool vfio_device_try_get_registration(struct vfio_device *device) +bool vfio_device_try_get_registration(struct vfio_device *device) { return refcount_inc_not_zero(&device->refcount); } -static struct vfio_device *vfio_group_get_device(struct vfio_group *group, - struct device *dev) -{ - struct vfio_device *device; - - mutex_lock(&group->device_lock); - list_for_each_entry(device, &group->device_list, group_next) { - if (device->dev == dev && - vfio_device_try_get_registration(device)) { - mutex_unlock(&group->device_lock); - return device; - } - } - mutex_unlock(&group->device_lock); - return NULL; -} - /* * VFIO driver API */ @@ -448,94 +254,15 @@ void vfio_free_device(struct vfio_device *device) } EXPORT_SYMBOL_GPL(vfio_free_device); -static struct vfio_group *vfio_noiommu_group_alloc(struct device *dev, - enum vfio_group_type type) -{ - struct iommu_group *iommu_group; - struct vfio_group *group; - int ret; - - iommu_group = iommu_group_alloc(); - if (IS_ERR(iommu_group)) - return ERR_CAST(iommu_group); - - ret = iommu_group_set_name(iommu_group, "vfio-noiommu"); - if (ret) - goto out_put_group; - ret = iommu_group_add_device(iommu_group, dev); - if (ret) - goto out_put_group; - - group = vfio_create_group(iommu_group, type); - if (IS_ERR(group)) { - ret = PTR_ERR(group); - goto out_remove_device; - } - iommu_group_put(iommu_group); - return group; - -out_remove_device: - iommu_group_remove_device(dev); -out_put_group: - iommu_group_put(iommu_group); - return ERR_PTR(ret); -} - -static struct vfio_group *vfio_group_find_or_alloc(struct device *dev) -{ - struct iommu_group *iommu_group; - struct vfio_group *group; - - iommu_group = iommu_group_get(dev); - if (!iommu_group && vfio_noiommu) { - /* - * With noiommu enabled, create an IOMMU group for devices that - * don't already have one, implying no IOMMU hardware/driver - * exists. Taint the kernel because we're about to give a DMA - * capable device to a user without IOMMU protection. - */ - group = vfio_noiommu_group_alloc(dev, VFIO_NO_IOMMU); - if (!IS_ERR(group)) { - add_taint(TAINT_USER, LOCKDEP_STILL_OK); - dev_warn(dev, "Adding kernel taint for vfio-noiommu group on device\n"); - } - return group; - } - - if (!iommu_group) - return ERR_PTR(-EINVAL); - - /* - * VFIO always sets IOMMU_CACHE because we offer no way for userspace to - * restore cache coherency. It has to be checked here because it is only - * valid for cases where we are using iommu groups. - */ - if (!device_iommu_capable(dev, IOMMU_CAP_CACHE_COHERENCY)) { - iommu_group_put(iommu_group); - return ERR_PTR(-EINVAL); - } - - group = vfio_group_get_from_iommu(iommu_group); - if (!group) - group = vfio_create_group(iommu_group, VFIO_IOMMU); - - /* The vfio_group holds a reference to the iommu_group */ - iommu_group_put(iommu_group); - return group; -} - static int __vfio_register_dev(struct vfio_device *device, - struct vfio_group *group) + enum vfio_group_type type) { - struct vfio_device *existing_device; int ret; - /* - * In all cases group is the output of one of the group allocation - * functions and we have group->drivers incremented for us. - */ - if (IS_ERR(group)) - return PTR_ERR(group); + if (WARN_ON(device->ops->bind_iommufd && + (!device->ops->unbind_iommufd || + !device->ops->attach_ioas))) + return -EINVAL; /* * If the driver doesn't specify a set then the device is added to a @@ -544,25 +271,13 @@ static int __vfio_register_dev(struct vfio_device *device, if (!device->dev_set) vfio_assign_device_set(device, device); - existing_device = vfio_group_get_device(group, device->dev); - if (existing_device) { - /* - * group->iommu_group is non-NULL because we hold the drivers - * refcount. - */ - dev_WARN(device->dev, "Device already exists on group %d\n", - iommu_group_id(group->iommu_group)); - vfio_device_put_registration(existing_device); - ret = -EBUSY; - goto err_out; - } - - /* Our reference on group is moved to the device */ - device->group = group; - ret = dev_set_name(&device->device, "vfio%d", device->index); if (ret) - goto err_out; + return ret; + + ret = vfio_device_set_group(device, type); + if (ret) + return ret; ret = device_add(&device->device); if (ret) @@ -571,9 +286,7 @@ static int __vfio_register_dev(struct vfio_device *device, /* Refcounting can't start until the driver calls register */ refcount_set(&device->refcount, 1); - mutex_lock(&group->device_lock); - list_add(&device->group_next, &group->device_list); - mutex_unlock(&group->device_lock); + vfio_device_group_register(device); return 0; err_out: @@ -583,8 +296,7 @@ err_out: int vfio_register_group_dev(struct vfio_device *device) { - return __vfio_register_dev(device, - vfio_group_find_or_alloc(device->dev)); + return __vfio_register_dev(device, VFIO_IOMMU); } EXPORT_SYMBOL_GPL(vfio_register_group_dev); @@ -594,46 +306,15 @@ EXPORT_SYMBOL_GPL(vfio_register_group_dev); */ int vfio_register_emulated_iommu_dev(struct vfio_device *device) { - return __vfio_register_dev(device, - vfio_noiommu_group_alloc(device->dev, VFIO_EMULATED_IOMMU)); + return __vfio_register_dev(device, VFIO_EMULATED_IOMMU); } EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev); -static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group, - char *buf) -{ - struct vfio_device *it, *device = ERR_PTR(-ENODEV); - - mutex_lock(&group->device_lock); - list_for_each_entry(it, &group->device_list, group_next) { - int ret; - - if (it->ops->match) { - ret = it->ops->match(it, buf); - if (ret < 0) { - device = ERR_PTR(ret); - break; - } - } else { - ret = !strcmp(dev_name(it->dev), buf); - } - - if (ret && vfio_device_try_get_registration(it)) { - device = it; - break; - } - } - mutex_unlock(&group->device_lock); - - return device; -} - /* * Decrement the device reference count and wait for the device to be * removed. Open file descriptors for the device... */ void vfio_unregister_group_dev(struct vfio_device *device) { - struct vfio_group *group = device->group; unsigned int i = 0; bool interrupted = false; long rc; @@ -661,333 +342,101 @@ void vfio_unregister_group_dev(struct vfio_device *device) } } - mutex_lock(&group->device_lock); - list_del(&device->group_next); - mutex_unlock(&group->device_lock); + vfio_device_group_unregister(device); /* Balances device_add in register path */ device_del(&device->device); + /* Balances vfio_device_set_group in register path */ vfio_device_remove_group(device); } EXPORT_SYMBOL_GPL(vfio_unregister_group_dev); -/* - * VFIO Group fd, /dev/vfio/$GROUP - */ -/* - * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or - * if there was no container to unset. Since the ioctl is called on - * the group, we know that still exists, therefore the only valid - * transition here is 1->0. - */ -static int vfio_group_ioctl_unset_container(struct vfio_group *group) +/* true if the vfio_device has open_device() called but not close_device() */ +static bool vfio_assert_device_open(struct vfio_device *device) { - int ret = 0; - - mutex_lock(&group->group_lock); - if (!group->container) { - ret = -EINVAL; - goto out_unlock; - } - if (group->container_users != 1) { - ret = -EBUSY; - goto out_unlock; - } - vfio_group_detach_container(group); - -out_unlock: - mutex_unlock(&group->group_lock); - return ret; + return !WARN_ON_ONCE(!READ_ONCE(device->open_count)); } -static int vfio_group_ioctl_set_container(struct vfio_group *group, - int __user *arg) +static int vfio_device_first_open(struct vfio_device *device, + struct iommufd_ctx *iommufd, struct kvm *kvm) { - struct vfio_container *container; - struct fd f; int ret; - int fd; - if (get_user(fd, arg)) - return -EFAULT; + lockdep_assert_held(&device->dev_set->lock); - f = fdget(fd); - if (!f.file) - return -EBADF; + if (!try_module_get(device->dev->driver->owner)) + return -ENODEV; - mutex_lock(&group->group_lock); - if (group->container || WARN_ON(group->container_users)) { - ret = -EINVAL; - goto out_unlock; - } - if (!group->iommu_group) { - ret = -ENODEV; - goto out_unlock; - } + if (iommufd) + ret = vfio_iommufd_bind(device, iommufd); + else + ret = vfio_device_group_use_iommu(device); + if (ret) + goto err_module_put; - container = vfio_container_from_file(f.file); - ret = -EINVAL; - if (container) { - ret = vfio_container_attach_group(container, group); - goto out_unlock; + device->kvm = kvm; + if (device->ops->open_device) { + ret = device->ops->open_device(device); + if (ret) + goto err_unuse_iommu; } + return 0; -out_unlock: - mutex_unlock(&group->group_lock); - fdput(f); +err_unuse_iommu: + device->kvm = NULL; + if (iommufd) + vfio_iommufd_unbind(device); + else + vfio_device_group_unuse_iommu(device); +err_module_put: + module_put(device->dev->driver->owner); return ret; } -static const struct file_operations vfio_device_fops; - -/* true if the vfio_device has open_device() called but not close_device() */ -bool vfio_assert_device_open(struct vfio_device *device) +static void vfio_device_last_close(struct vfio_device *device, + struct iommufd_ctx *iommufd) { - return !WARN_ON_ONCE(!READ_ONCE(device->open_count)); + lockdep_assert_held(&device->dev_set->lock); + + if (device->ops->close_device) + device->ops->close_device(device); + device->kvm = NULL; + if (iommufd) + vfio_iommufd_unbind(device); + else + vfio_device_group_unuse_iommu(device); + module_put(device->dev->driver->owner); } -static struct file *vfio_device_open(struct vfio_device *device) +int vfio_device_open(struct vfio_device *device, + struct iommufd_ctx *iommufd, struct kvm *kvm) { - struct file *filep; - int ret; - - mutex_lock(&device->group->group_lock); - ret = vfio_device_assign_container(device); - mutex_unlock(&device->group->group_lock); - if (ret) - return ERR_PTR(ret); - - if (!try_module_get(device->dev->driver->owner)) { - ret = -ENODEV; - goto err_unassign_container; - } + int ret = 0; mutex_lock(&device->dev_set->lock); device->open_count++; if (device->open_count == 1) { - /* - * Here we pass the KVM pointer with the group under the read - * lock. If the device driver will use it, it must obtain a - * reference and release it during close_device. - */ - mutex_lock(&device->group->group_lock); - device->kvm = device->group->kvm; - - if (device->ops->open_device) { - ret = device->ops->open_device(device); - if (ret) - goto err_undo_count; - } - vfio_device_container_register(device); - mutex_unlock(&device->group->group_lock); - } - mutex_unlock(&device->dev_set->lock); - - /* - * We can't use anon_inode_getfd() because we need to modify - * the f_mode flags directly to allow more than just ioctls - */ - filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops, - device, O_RDWR); - if (IS_ERR(filep)) { - ret = PTR_ERR(filep); - goto err_close_device; - } - - /* - * TODO: add an anon_inode interface to do this. - * Appears to be missing by lack of need rather than - * explicitly prevented. Now there's need. - */ - filep->f_mode |= (FMODE_PREAD | FMODE_PWRITE); - - if (device->group->type == VFIO_NO_IOMMU) - dev_warn(device->dev, "vfio-noiommu device opened by user " - "(%s:%d)\n", current->comm, task_pid_nr(current)); - /* - * On success the ref of device is moved to the file and - * put in vfio_device_fops_release() - */ - return filep; - -err_close_device: - mutex_lock(&device->dev_set->lock); - mutex_lock(&device->group->group_lock); - if (device->open_count == 1) { - if (device->ops->close_device) - device->ops->close_device(device); - - vfio_device_container_unregister(device); + ret = vfio_device_first_open(device, iommufd, kvm); + if (ret) + device->open_count--; } -err_undo_count: - mutex_unlock(&device->group->group_lock); - device->open_count--; - if (device->open_count == 0 && device->kvm) - device->kvm = NULL; mutex_unlock(&device->dev_set->lock); - module_put(device->dev->driver->owner); -err_unassign_container: - vfio_device_unassign_container(device); - return ERR_PTR(ret); -} - -static int vfio_group_ioctl_get_device_fd(struct vfio_group *group, - char __user *arg) -{ - struct vfio_device *device; - struct file *filep; - char *buf; - int fdno; - int ret; - - buf = strndup_user(arg, PAGE_SIZE); - if (IS_ERR(buf)) - return PTR_ERR(buf); - - device = vfio_device_get_from_name(group, buf); - kfree(buf); - if (IS_ERR(device)) - return PTR_ERR(device); - - fdno = get_unused_fd_flags(O_CLOEXEC); - if (fdno < 0) { - ret = fdno; - goto err_put_device; - } - - filep = vfio_device_open(device); - if (IS_ERR(filep)) { - ret = PTR_ERR(filep); - goto err_put_fdno; - } - - fd_install(fdno, filep); - return fdno; - -err_put_fdno: - put_unused_fd(fdno); -err_put_device: - vfio_device_put_registration(device); - return ret; -} - -static int vfio_group_ioctl_get_status(struct vfio_group *group, - struct vfio_group_status __user *arg) -{ - unsigned long minsz = offsetofend(struct vfio_group_status, flags); - struct vfio_group_status status; - - if (copy_from_user(&status, arg, minsz)) - return -EFAULT; - - if (status.argsz < minsz) - return -EINVAL; - - status.flags = 0; - - mutex_lock(&group->group_lock); - if (!group->iommu_group) { - mutex_unlock(&group->group_lock); - return -ENODEV; - } - - if (group->container) - status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET | - VFIO_GROUP_FLAGS_VIABLE; - else if (!iommu_group_dma_owner_claimed(group->iommu_group)) - status.flags |= VFIO_GROUP_FLAGS_VIABLE; - mutex_unlock(&group->group_lock); - - if (copy_to_user(arg, &status, minsz)) - return -EFAULT; - return 0; -} - -static long vfio_group_fops_unl_ioctl(struct file *filep, - unsigned int cmd, unsigned long arg) -{ - struct vfio_group *group = filep->private_data; - void __user *uarg = (void __user *)arg; - - switch (cmd) { - case VFIO_GROUP_GET_DEVICE_FD: - return vfio_group_ioctl_get_device_fd(group, uarg); - case VFIO_GROUP_GET_STATUS: - return vfio_group_ioctl_get_status(group, uarg); - case VFIO_GROUP_SET_CONTAINER: - return vfio_group_ioctl_set_container(group, uarg); - case VFIO_GROUP_UNSET_CONTAINER: - return vfio_group_ioctl_unset_container(group); - default: - return -ENOTTY; - } -} - -static int vfio_group_fops_open(struct inode *inode, struct file *filep) -{ - struct vfio_group *group = - container_of(inode->i_cdev, struct vfio_group, cdev); - int ret; - - mutex_lock(&group->group_lock); - /* - * drivers can be zero if this races with vfio_device_remove_group(), it - * will be stable at 0 under the group rwsem - */ - if (refcount_read(&group->drivers) == 0) { - ret = -ENODEV; - goto out_unlock; - } - - if (group->type == VFIO_NO_IOMMU && !capable(CAP_SYS_RAWIO)) { - ret = -EPERM; - goto out_unlock; - } - - /* - * Do we need multiple instances of the group open? Seems not. - */ - if (group->opened_file) { - ret = -EBUSY; - goto out_unlock; - } - group->opened_file = filep; - filep->private_data = group; - ret = 0; -out_unlock: - mutex_unlock(&group->group_lock); return ret; } -static int vfio_group_fops_release(struct inode *inode, struct file *filep) +void vfio_device_close(struct vfio_device *device, + struct iommufd_ctx *iommufd) { - struct vfio_group *group = filep->private_data; - - filep->private_data = NULL; - - mutex_lock(&group->group_lock); - /* - * Device FDs hold a group file reference, therefore the group release - * is only called when there are no open devices. - */ - WARN_ON(group->notifier.head); - if (group->container) - vfio_group_detach_container(group); - group->opened_file = NULL; - mutex_unlock(&group->group_lock); - return 0; + mutex_lock(&device->dev_set->lock); + vfio_assert_device_open(device); + if (device->open_count == 1) + vfio_device_last_close(device, iommufd); + device->open_count--; + mutex_unlock(&device->dev_set->lock); } -static const struct file_operations vfio_group_fops = { - .owner = THIS_MODULE, - .unlocked_ioctl = vfio_group_fops_unl_ioctl, - .compat_ioctl = compat_ptr_ioctl, - .open = vfio_group_fops_open, - .release = vfio_group_fops_release, -}; - /* * Wrapper around pm_runtime_resume_and_get(). * Return error code on failure or 0 on success. @@ -1028,24 +477,7 @@ static int vfio_device_fops_release(struct inode *inode, struct file *filep) { struct vfio_device *device = filep->private_data; - mutex_lock(&device->dev_set->lock); - vfio_assert_device_open(device); - mutex_lock(&device->group->group_lock); - if (device->open_count == 1) { - if (device->ops->close_device) - device->ops->close_device(device); - - vfio_device_container_unregister(device); - } - mutex_unlock(&device->group->group_lock); - device->open_count--; - if (device->open_count == 0) - device->kvm = NULL; - mutex_unlock(&device->dev_set->lock); - - module_put(device->dev->driver->owner); - - vfio_device_unassign_container(device); + vfio_device_group_close(device); vfio_device_put_registration(device); @@ -1568,7 +1000,7 @@ static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma) return device->ops->mmap(device, vma); } -static const struct file_operations vfio_device_fops = { +const struct file_operations vfio_device_fops = { .owner = THIS_MODULE, .release = vfio_device_fops_release, .read = vfio_device_fops_read, @@ -1578,118 +1010,6 @@ static const struct file_operations vfio_device_fops = { .mmap = vfio_device_fops_mmap, }; -/** - * vfio_file_iommu_group - Return the struct iommu_group for the vfio group file - * @file: VFIO group file - * - * The returned iommu_group is valid as long as a ref is held on the file. This - * returns a reference on the group. This function is deprecated, only the SPAPR - * path in kvm should call it. - */ -struct iommu_group *vfio_file_iommu_group(struct file *file) -{ - struct vfio_group *group = file->private_data; - struct iommu_group *iommu_group = NULL; - - if (!IS_ENABLED(CONFIG_SPAPR_TCE_IOMMU)) - return NULL; - - if (!vfio_file_is_group(file)) - return NULL; - - mutex_lock(&group->group_lock); - if (group->iommu_group) { - iommu_group = group->iommu_group; - iommu_group_ref_get(iommu_group); - } - mutex_unlock(&group->group_lock); - return iommu_group; -} -EXPORT_SYMBOL_GPL(vfio_file_iommu_group); - -/** - * vfio_file_is_group - True if the file is usable with VFIO aPIS - * @file: VFIO group file - */ -bool vfio_file_is_group(struct file *file) -{ - return file->f_op == &vfio_group_fops; -} -EXPORT_SYMBOL_GPL(vfio_file_is_group); - -/** - * vfio_file_enforced_coherent - True if the DMA associated with the VFIO file - * is always CPU cache coherent - * @file: VFIO group file - * - * Enforced coherency means that the IOMMU ignores things like the PCIe no-snoop - * bit in DMA transactions. A return of false indicates that the user has - * rights to access additional instructions such as wbinvd on x86. - */ -bool vfio_file_enforced_coherent(struct file *file) -{ - struct vfio_group *group = file->private_data; - bool ret; - - if (!vfio_file_is_group(file)) - return true; - - mutex_lock(&group->group_lock); - if (group->container) { - ret = vfio_container_ioctl_check_extension(group->container, - VFIO_DMA_CC_IOMMU); - } else { - /* - * Since the coherency state is determined only once a container - * is attached the user must do so before they can prove they - * have permission. - */ - ret = true; - } - mutex_unlock(&group->group_lock); - return ret; -} -EXPORT_SYMBOL_GPL(vfio_file_enforced_coherent); - -/** - * vfio_file_set_kvm - Link a kvm with VFIO drivers - * @file: VFIO group file - * @kvm: KVM to link - * - * When a VFIO device is first opened the KVM will be available in - * device->kvm if one was associated with the group. - */ -void vfio_file_set_kvm(struct file *file, struct kvm *kvm) -{ - struct vfio_group *group = file->private_data; - - if (!vfio_file_is_group(file)) - return; - - mutex_lock(&group->group_lock); - group->kvm = kvm; - mutex_unlock(&group->group_lock); -} -EXPORT_SYMBOL_GPL(vfio_file_set_kvm); - -/** - * vfio_file_has_dev - True if the VFIO file is a handle for device - * @file: VFIO file to check - * @device: Device that must be part of the file - * - * Returns true if given file has permission to manipulate the given device. - */ -bool vfio_file_has_dev(struct file *file, struct vfio_device *device) -{ - struct vfio_group *group = file->private_data; - - if (!vfio_file_is_group(file)) - return false; - - return group == device->group; -} -EXPORT_SYMBOL_GPL(vfio_file_has_dev); - /* * Sub-module support */ @@ -1810,35 +1130,136 @@ int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs, EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare); /* - * Module/class support + * Pin contiguous user pages and return their associated host pages for local + * domain only. + * @device [in] : device + * @iova [in] : starting IOVA of user pages to be pinned. + * @npage [in] : count of pages to be pinned. This count should not + * be greater than VFIO_PIN_PAGES_MAX_ENTRIES. + * @prot [in] : protection flags + * @pages[out] : array of host pages + * Return error or number of pages pinned. + * + * A driver may only call this function if the vfio_device was created + * by vfio_register_emulated_iommu_dev() due to vfio_device_container_pin_pages(). + */ +int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova, + int npage, int prot, struct page **pages) +{ + /* group->container cannot change while a vfio device is open */ + if (!pages || !npage || WARN_ON(!vfio_assert_device_open(device))) + return -EINVAL; + if (vfio_device_has_container(device)) + return vfio_device_container_pin_pages(device, iova, + npage, prot, pages); + if (device->iommufd_access) { + int ret; + + if (iova > ULONG_MAX) + return -EINVAL; + /* + * VFIO ignores the sub page offset, npages is from the start of + * a PAGE_SIZE chunk of IOVA. The caller is expected to recover + * the sub page offset by doing: + * pages[0] + (iova % PAGE_SIZE) + */ + ret = iommufd_access_pin_pages( + device->iommufd_access, ALIGN_DOWN(iova, PAGE_SIZE), + npage * PAGE_SIZE, pages, + (prot & IOMMU_WRITE) ? IOMMUFD_ACCESS_RW_WRITE : 0); + if (ret) + return ret; + return npage; + } + return -EINVAL; +} +EXPORT_SYMBOL(vfio_pin_pages); + +/* + * Unpin contiguous host pages for local domain only. + * @device [in] : device + * @iova [in] : starting address of user pages to be unpinned. + * @npage [in] : count of pages to be unpinned. This count should not + * be greater than VFIO_PIN_PAGES_MAX_ENTRIES. + */ +void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage) +{ + if (WARN_ON(!vfio_assert_device_open(device))) + return; + + if (vfio_device_has_container(device)) { + vfio_device_container_unpin_pages(device, iova, npage); + return; + } + if (device->iommufd_access) { + if (WARN_ON(iova > ULONG_MAX)) + return; + iommufd_access_unpin_pages(device->iommufd_access, + ALIGN_DOWN(iova, PAGE_SIZE), + npage * PAGE_SIZE); + return; + } +} +EXPORT_SYMBOL(vfio_unpin_pages); + +/* + * This interface allows the CPUs to perform some sort of virtual DMA on + * behalf of the device. + * + * CPUs read/write from/into a range of IOVAs pointing to user space memory + * into/from a kernel buffer. + * + * As the read/write of user space memory is conducted via the CPUs and is + * not a real device DMA, it is not necessary to pin the user space memory. + * + * @device [in] : VFIO device + * @iova [in] : base IOVA of a user space buffer + * @data [in] : pointer to kernel buffer + * @len [in] : kernel buffer length + * @write : indicate read or write + * Return error code on failure or 0 on success. */ -static char *vfio_devnode(struct device *dev, umode_t *mode) +int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data, + size_t len, bool write) { - return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev)); + if (!data || len <= 0 || !vfio_assert_device_open(device)) + return -EINVAL; + + if (vfio_device_has_container(device)) + return vfio_device_container_dma_rw(device, iova, + data, len, write); + + if (device->iommufd_access) { + unsigned int flags = 0; + + if (iova > ULONG_MAX) + return -EINVAL; + + /* VFIO historically tries to auto-detect a kthread */ + if (!current->mm) + flags |= IOMMUFD_ACCESS_RW_KTHREAD; + if (write) + flags |= IOMMUFD_ACCESS_RW_WRITE; + return iommufd_access_rw(device->iommufd_access, iova, data, + len, flags); + } + return -EINVAL; } +EXPORT_SYMBOL(vfio_dma_rw); +/* + * Module/class support + */ static int __init vfio_init(void) { int ret; - ida_init(&vfio.group_ida); ida_init(&vfio.device_ida); - mutex_init(&vfio.group_lock); - INIT_LIST_HEAD(&vfio.group_list); - ret = vfio_container_init(); + ret = vfio_group_init(); if (ret) return ret; - /* /dev/vfio/$GROUP */ - vfio.class = class_create(THIS_MODULE, "vfio"); - if (IS_ERR(vfio.class)) { - ret = PTR_ERR(vfio.class); - goto err_group_class; - } - - vfio.class->devnode = vfio_devnode; - /* /sys/class/vfio-dev/vfioX */ vfio.device_class = class_create(THIS_MODULE, "vfio-dev"); if (IS_ERR(vfio.device_class)) { @@ -1846,36 +1267,20 @@ static int __init vfio_init(void) goto err_dev_class; } - ret = alloc_chrdev_region(&vfio.group_devt, 0, MINORMASK + 1, "vfio"); - if (ret) - goto err_alloc_chrdev; - pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n"); return 0; -err_alloc_chrdev: - class_destroy(vfio.device_class); - vfio.device_class = NULL; err_dev_class: - class_destroy(vfio.class); - vfio.class = NULL; -err_group_class: - vfio_container_cleanup(); + vfio_group_cleanup(); return ret; } static void __exit vfio_cleanup(void) { - WARN_ON(!list_empty(&vfio.group_list)); - ida_destroy(&vfio.device_ida); - ida_destroy(&vfio.group_ida); - unregister_chrdev_region(vfio.group_devt, MINORMASK + 1); class_destroy(vfio.device_class); vfio.device_class = NULL; - class_destroy(vfio.class); - vfio_container_cleanup(); - vfio.class = NULL; + vfio_group_cleanup(); xa_destroy(&vfio_device_set_xa); } @@ -1886,6 +1291,4 @@ MODULE_VERSION(DRIVER_VERSION); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR(DRIVER_AUTHOR); MODULE_DESCRIPTION(DRIVER_DESC); -MODULE_ALIAS_MISCDEV(VFIO_MINOR); -MODULE_ALIAS("devname:vfio/vfio"); MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce"); |