summaryrefslogtreecommitdiff
path: root/drivers/iommu/amd_iommu.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2019-01-02 02:55:29 +0300
committerLinus Torvalds <torvalds@linux-foundation.org>2019-01-02 02:55:29 +0300
commit8e143b90e4d45cca3dc53760d3cfab988bc74571 (patch)
treecd924b3abd58786ce1f3f7a41f5f32ff9f3e6af7 /drivers/iommu/amd_iommu.c
parent78e8696c234ab637c4dd516cabeac344d84ec10b (diff)
parent03ebe48e235f17d70f34890d34d8153b8a84c02e (diff)
downloadlinux-8e143b90e4d45cca3dc53760d3cfab988bc74571.tar.xz
Merge tag 'iommu-updates-v4.21' of git://git.kernel.org/pub/scm/linux/kernel/git/joro/iommu
Pull IOMMU updates from Joerg Roedel: - Page table code for AMD IOMMU now supports large pages where smaller page-sizes were mapped before. VFIO had to work around that in the past and I included a patch to remove it (acked by Alex Williamson) - Patches to unmodularize a couple of IOMMU drivers that would never work as modules anyway. - Work to unify the the iommu-related pointers in 'struct device' into one pointer. This work is not finished yet, but will probably be in the next cycle. - NUMA aware allocation in iommu-dma code - Support for r8a774a1 and r8a774c0 in the Renesas IOMMU driver - Scalable mode support for the Intel VT-d driver - PM runtime improvements for the ARM-SMMU driver - Support for the QCOM-SMMUv2 IOMMU hardware from Qualcom - Various smaller fixes and improvements * tag 'iommu-updates-v4.21' of git://git.kernel.org/pub/scm/linux/kernel/git/joro/iommu: (78 commits) iommu: Check for iommu_ops == NULL in iommu_probe_device() ACPI/IORT: Don't call iommu_ops->add_device directly iommu/of: Don't call iommu_ops->add_device directly iommu: Consolitate ->add/remove_device() calls iommu/sysfs: Rename iommu_release_device() dmaengine: sh: rcar-dmac: Use device_iommu_mapped() xhci: Use device_iommu_mapped() powerpc/iommu: Use device_iommu_mapped() ACPI/IORT: Use device_iommu_mapped() iommu/of: Use device_iommu_mapped() driver core: Introduce device_iommu_mapped() function iommu/tegra: Use helper functions to access dev->iommu_fwspec iommu/qcom: Use helper functions to access dev->iommu_fwspec iommu/of: Use helper functions to access dev->iommu_fwspec iommu/mediatek: Use helper functions to access dev->iommu_fwspec iommu/ipmmu-vmsa: Use helper functions to access dev->iommu_fwspec iommu/dma: Use helper functions to access dev->iommu_fwspec iommu/arm-smmu: Use helper functions to access dev->iommu_fwspec ACPI/IORT: Use helper functions to access dev->iommu_fwspec iommu: Introduce wrappers around dev->iommu_fwspec ...
Diffstat (limited to 'drivers/iommu/amd_iommu.c')
-rw-r--r--drivers/iommu/amd_iommu.c275
1 files changed, 173 insertions, 102 deletions
diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index 567221cca13c..87ba23a75b38 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -17,6 +17,8 @@
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
+#define pr_fmt(fmt) "AMD-Vi: " fmt
+
#include <linux/ratelimit.h>
#include <linux/pci.h>
#include <linux/acpi.h>
@@ -277,7 +279,7 @@ static u16 get_alias(struct device *dev)
return pci_alias;
}
- pr_info("AMD-Vi: Using IVRS reported alias %02x:%02x.%d "
+ pr_info("Using IVRS reported alias %02x:%02x.%d "
"for device %s[%04x:%04x], kernel reported alias "
"%02x:%02x.%d\n", PCI_BUS_NUM(ivrs_alias), PCI_SLOT(ivrs_alias),
PCI_FUNC(ivrs_alias), dev_name(dev), pdev->vendor, pdev->device,
@@ -291,7 +293,7 @@ static u16 get_alias(struct device *dev)
if (pci_alias == devid &&
PCI_BUS_NUM(ivrs_alias) == pdev->bus->number) {
pci_add_dma_alias(pdev, ivrs_alias & 0xff);
- pr_info("AMD-Vi: Added PCI DMA alias %02x.%d for %s\n",
+ pr_info("Added PCI DMA alias %02x.%d for %s\n",
PCI_SLOT(ivrs_alias), PCI_FUNC(ivrs_alias),
dev_name(dev));
}
@@ -436,7 +438,14 @@ static int iommu_init_device(struct device *dev)
dev_data->alias = get_alias(dev);
- if (dev_is_pci(dev) && pci_iommuv2_capable(to_pci_dev(dev))) {
+ /*
+ * By default we use passthrough mode for IOMMUv2 capable device.
+ * But if amd_iommu=force_isolation is set (e.g. to debug DMA to
+ * invalid address), we ignore the capability for the device so
+ * it'll be forced to go into translation mode.
+ */
+ if ((iommu_pass_through || !amd_iommu_force_isolation) &&
+ dev_is_pci(dev) && pci_iommuv2_capable(to_pci_dev(dev))) {
struct amd_iommu *iommu;
iommu = amd_iommu_rlookup_table[dev_data->devid];
@@ -511,7 +520,7 @@ static void dump_dte_entry(u16 devid)
int i;
for (i = 0; i < 4; ++i)
- pr_err("AMD-Vi: DTE[%d]: %016llx\n", i,
+ pr_err("DTE[%d]: %016llx\n", i,
amd_iommu_dev_table[devid].data[i]);
}
@@ -521,7 +530,7 @@ static void dump_command(unsigned long phys_addr)
int i;
for (i = 0; i < 4; ++i)
- pr_err("AMD-Vi: CMD[%d]: %08x\n", i, cmd->data[i]);
+ pr_err("CMD[%d]: %08x\n", i, cmd->data[i]);
}
static void amd_iommu_report_page_fault(u16 devid, u16 domain_id,
@@ -536,10 +545,10 @@ static void amd_iommu_report_page_fault(u16 devid, u16 domain_id,
dev_data = get_dev_data(&pdev->dev);
if (dev_data && __ratelimit(&dev_data->rs)) {
- dev_err(&pdev->dev, "AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x%04x address=0x%016llx flags=0x%04x]\n",
+ dev_err(&pdev->dev, "Event logged [IO_PAGE_FAULT domain=0x%04x address=0x%llx flags=0x%04x]\n",
domain_id, address, flags);
} else if (printk_ratelimit()) {
- pr_err("AMD-Vi: Event logged [IO_PAGE_FAULT device=%02x:%02x.%x domain=0x%04x address=0x%016llx flags=0x%04x]\n",
+ pr_err("Event logged [IO_PAGE_FAULT device=%02x:%02x.%x domain=0x%04x address=0x%llx flags=0x%04x]\n",
PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
domain_id, address, flags);
}
@@ -566,7 +575,7 @@ retry:
if (type == 0) {
/* Did we hit the erratum? */
if (++count == LOOP_TIMEOUT) {
- pr_err("AMD-Vi: No event written to event log\n");
+ pr_err("No event written to event log\n");
return;
}
udelay(1);
@@ -576,43 +585,41 @@ retry:
if (type == EVENT_TYPE_IO_FAULT) {
amd_iommu_report_page_fault(devid, pasid, address, flags);
return;
- } else {
- dev_err(dev, "AMD-Vi: Event logged [");
}
switch (type) {
case EVENT_TYPE_ILL_DEV:
- dev_err(dev, "ILLEGAL_DEV_TABLE_ENTRY device=%02x:%02x.%x pasid=0x%05x address=0x%016llx flags=0x%04x]\n",
+ dev_err(dev, "Event logged [ILLEGAL_DEV_TABLE_ENTRY device=%02x:%02x.%x pasid=0x%05x address=0x%llx flags=0x%04x]\n",
PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
pasid, address, flags);
dump_dte_entry(devid);
break;
case EVENT_TYPE_DEV_TAB_ERR:
- dev_err(dev, "DEV_TAB_HARDWARE_ERROR device=%02x:%02x.%x "
- "address=0x%016llx flags=0x%04x]\n",
+ dev_err(dev, "Event logged [DEV_TAB_HARDWARE_ERROR device=%02x:%02x.%x "
+ "address=0x%llx flags=0x%04x]\n",
PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
address, flags);
break;
case EVENT_TYPE_PAGE_TAB_ERR:
- dev_err(dev, "PAGE_TAB_HARDWARE_ERROR device=%02x:%02x.%x domain=0x%04x address=0x%016llx flags=0x%04x]\n",
+ dev_err(dev, "Event logged [PAGE_TAB_HARDWARE_ERROR device=%02x:%02x.%x domain=0x%04x address=0x%llx flags=0x%04x]\n",
PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
pasid, address, flags);
break;
case EVENT_TYPE_ILL_CMD:
- dev_err(dev, "ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address);
+ dev_err(dev, "Event logged [ILLEGAL_COMMAND_ERROR address=0x%llx]\n", address);
dump_command(address);
break;
case EVENT_TYPE_CMD_HARD_ERR:
- dev_err(dev, "COMMAND_HARDWARE_ERROR address=0x%016llx flags=0x%04x]\n",
+ dev_err(dev, "Event logged [COMMAND_HARDWARE_ERROR address=0x%llx flags=0x%04x]\n",
address, flags);
break;
case EVENT_TYPE_IOTLB_INV_TO:
- dev_err(dev, "IOTLB_INV_TIMEOUT device=%02x:%02x.%x address=0x%016llx]\n",
+ dev_err(dev, "Event logged [IOTLB_INV_TIMEOUT device=%02x:%02x.%x address=0x%llx]\n",
PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
address);
break;
case EVENT_TYPE_INV_DEV_REQ:
- dev_err(dev, "INVALID_DEVICE_REQUEST device=%02x:%02x.%x pasid=0x%05x address=0x%016llx flags=0x%04x]\n",
+ dev_err(dev, "Event logged [INVALID_DEVICE_REQUEST device=%02x:%02x.%x pasid=0x%05x address=0x%llx flags=0x%04x]\n",
PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
pasid, address, flags);
break;
@@ -620,12 +627,12 @@ retry:
pasid = ((event[0] >> 16) & 0xFFFF)
| ((event[1] << 6) & 0xF0000);
tag = event[1] & 0x03FF;
- dev_err(dev, "INVALID_PPR_REQUEST device=%02x:%02x.%x pasid=0x%05x address=0x%016llx flags=0x%04x]\n",
+ dev_err(dev, "Event logged [INVALID_PPR_REQUEST device=%02x:%02x.%x pasid=0x%05x address=0x%llx flags=0x%04x]\n",
PCI_BUS_NUM(devid), PCI_SLOT(devid), PCI_FUNC(devid),
pasid, address, flags);
break;
default:
- dev_err(dev, "UNKNOWN event[0]=0x%08x event[1]=0x%08x event[2]=0x%08x event[3]=0x%08x\n",
+ dev_err(dev, "Event logged [UNKNOWN event[0]=0x%08x event[1]=0x%08x event[2]=0x%08x event[3]=0x%08x\n",
event[0], event[1], event[2], event[3]);
}
@@ -652,7 +659,7 @@ static void iommu_handle_ppr_entry(struct amd_iommu *iommu, u64 *raw)
struct amd_iommu_fault fault;
if (PPR_REQ_TYPE(raw[0]) != PPR_REQ_FAULT) {
- pr_err_ratelimited("AMD-Vi: Unknown PPR request received\n");
+ pr_err_ratelimited("Unknown PPR request received\n");
return;
}
@@ -757,12 +764,12 @@ static void iommu_poll_ga_log(struct amd_iommu *iommu)
if (!iommu_ga_log_notifier)
break;
- pr_debug("AMD-Vi: %s: devid=%#x, ga_tag=%#x\n",
+ pr_debug("%s: devid=%#x, ga_tag=%#x\n",
__func__, GA_DEVID(log_entry),
GA_TAG(log_entry));
if (iommu_ga_log_notifier(GA_TAG(log_entry)) != 0)
- pr_err("AMD-Vi: GA log notifier failed.\n");
+ pr_err("GA log notifier failed.\n");
break;
default:
break;
@@ -787,18 +794,18 @@ irqreturn_t amd_iommu_int_thread(int irq, void *data)
iommu->mmio_base + MMIO_STATUS_OFFSET);
if (status & MMIO_STATUS_EVT_INT_MASK) {
- pr_devel("AMD-Vi: Processing IOMMU Event Log\n");
+ pr_devel("Processing IOMMU Event Log\n");
iommu_poll_events(iommu);
}
if (status & MMIO_STATUS_PPR_INT_MASK) {
- pr_devel("AMD-Vi: Processing IOMMU PPR Log\n");
+ pr_devel("Processing IOMMU PPR Log\n");
iommu_poll_ppr_log(iommu);
}
#ifdef CONFIG_IRQ_REMAP
if (status & MMIO_STATUS_GALOG_INT_MASK) {
- pr_devel("AMD-Vi: Processing IOMMU GA Log\n");
+ pr_devel("Processing IOMMU GA Log\n");
iommu_poll_ga_log(iommu);
}
#endif
@@ -842,7 +849,7 @@ static int wait_on_sem(volatile u64 *sem)
}
if (i == LOOP_TIMEOUT) {
- pr_alert("AMD-Vi: Completion-Wait loop timed out\n");
+ pr_alert("Completion-Wait loop timed out\n");
return -EIO;
}
@@ -1034,7 +1041,7 @@ again:
/* Skip udelay() the first time around */
if (count++) {
if (count == LOOP_TIMEOUT) {
- pr_err("AMD-Vi: Command buffer timeout\n");
+ pr_err("Command buffer timeout\n");
return -EIO;
}
@@ -1315,6 +1322,101 @@ static void domain_flush_devices(struct protection_domain *domain)
*
****************************************************************************/
+static void free_page_list(struct page *freelist)
+{
+ while (freelist != NULL) {
+ unsigned long p = (unsigned long)page_address(freelist);
+ freelist = freelist->freelist;
+ free_page(p);
+ }
+}
+
+static struct page *free_pt_page(unsigned long pt, struct page *freelist)
+{
+ struct page *p = virt_to_page((void *)pt);
+
+ p->freelist = freelist;
+
+ return p;
+}
+
+#define DEFINE_FREE_PT_FN(LVL, FN) \
+static struct page *free_pt_##LVL (unsigned long __pt, struct page *freelist) \
+{ \
+ unsigned long p; \
+ u64 *pt; \
+ int i; \
+ \
+ pt = (u64 *)__pt; \
+ \
+ for (i = 0; i < 512; ++i) { \
+ /* PTE present? */ \
+ if (!IOMMU_PTE_PRESENT(pt[i])) \
+ continue; \
+ \
+ /* Large PTE? */ \
+ if (PM_PTE_LEVEL(pt[i]) == 0 || \
+ PM_PTE_LEVEL(pt[i]) == 7) \
+ continue; \
+ \
+ p = (unsigned long)IOMMU_PTE_PAGE(pt[i]); \
+ freelist = FN(p, freelist); \
+ } \
+ \
+ return free_pt_page((unsigned long)pt, freelist); \
+}
+
+DEFINE_FREE_PT_FN(l2, free_pt_page)
+DEFINE_FREE_PT_FN(l3, free_pt_l2)
+DEFINE_FREE_PT_FN(l4, free_pt_l3)
+DEFINE_FREE_PT_FN(l5, free_pt_l4)
+DEFINE_FREE_PT_FN(l6, free_pt_l5)
+
+static struct page *free_sub_pt(unsigned long root, int mode,
+ struct page *freelist)
+{
+ switch (mode) {
+ case PAGE_MODE_NONE:
+ case PAGE_MODE_7_LEVEL:
+ break;
+ case PAGE_MODE_1_LEVEL:
+ freelist = free_pt_page(root, freelist);
+ break;
+ case PAGE_MODE_2_LEVEL:
+ freelist = free_pt_l2(root, freelist);
+ break;
+ case PAGE_MODE_3_LEVEL:
+ freelist = free_pt_l3(root, freelist);
+ break;
+ case PAGE_MODE_4_LEVEL:
+ freelist = free_pt_l4(root, freelist);
+ break;
+ case PAGE_MODE_5_LEVEL:
+ freelist = free_pt_l5(root, freelist);
+ break;
+ case PAGE_MODE_6_LEVEL:
+ freelist = free_pt_l6(root, freelist);
+ break;
+ default:
+ BUG();
+ }
+
+ return freelist;
+}
+
+static void free_pagetable(struct protection_domain *domain)
+{
+ unsigned long root = (unsigned long)domain->pt_root;
+ struct page *freelist = NULL;
+
+ BUG_ON(domain->mode < PAGE_MODE_NONE ||
+ domain->mode > PAGE_MODE_6_LEVEL);
+
+ free_sub_pt(root, domain->mode, freelist);
+
+ free_page_list(freelist);
+}
+
/*
* This function is used to add another level to an IO page table. Adding
* another level increases the size of the address space by 9 bits to a size up
@@ -1363,10 +1465,13 @@ static u64 *alloc_pte(struct protection_domain *domain,
while (level > end_lvl) {
u64 __pte, __npte;
+ int pte_level;
- __pte = *pte;
+ __pte = *pte;
+ pte_level = PM_PTE_LEVEL(__pte);
- if (!IOMMU_PTE_PRESENT(__pte)) {
+ if (!IOMMU_PTE_PRESENT(__pte) ||
+ pte_level == PAGE_MODE_7_LEVEL) {
page = (u64 *)get_zeroed_page(gfp);
if (!page)
return NULL;
@@ -1374,19 +1479,21 @@ static u64 *alloc_pte(struct protection_domain *domain,
__npte = PM_LEVEL_PDE(level, iommu_virt_to_phys(page));
/* pte could have been changed somewhere. */
- if (cmpxchg64(pte, __pte, __npte) != __pte) {
+ if (cmpxchg64(pte, __pte, __npte) != __pte)
free_page((unsigned long)page);
- continue;
- }
+ else if (pte_level == PAGE_MODE_7_LEVEL)
+ domain->updated = true;
+
+ continue;
}
/* No level skipping support yet */
- if (PM_PTE_LEVEL(*pte) != level)
+ if (pte_level != level)
return NULL;
level -= 1;
- pte = IOMMU_PTE_PAGE(*pte);
+ pte = IOMMU_PTE_PAGE(__pte);
if (pte_page && level == end_lvl)
*pte_page = pte;
@@ -1455,6 +1562,25 @@ static u64 *fetch_pte(struct protection_domain *domain,
return pte;
}
+static struct page *free_clear_pte(u64 *pte, u64 pteval, struct page *freelist)
+{
+ unsigned long pt;
+ int mode;
+
+ while (cmpxchg64(pte, pteval, 0) != pteval) {
+ pr_warn("AMD-Vi: IOMMU pte changed since we read it\n");
+ pteval = *pte;
+ }
+
+ if (!IOMMU_PTE_PRESENT(pteval))
+ return freelist;
+
+ pt = (unsigned long)IOMMU_PTE_PAGE(pteval);
+ mode = IOMMU_PTE_MODE(pteval);
+
+ return free_sub_pt(pt, mode, freelist);
+}
+
/*
* Generic mapping functions. It maps a physical address into a DMA
* address space. It allocates the page table pages if necessary.
@@ -1469,6 +1595,7 @@ static int iommu_map_page(struct protection_domain *dom,
int prot,
gfp_t gfp)
{
+ struct page *freelist = NULL;
u64 __pte, *pte;
int i, count;
@@ -1485,8 +1612,10 @@ static int iommu_map_page(struct protection_domain *dom,
return -ENOMEM;
for (i = 0; i < count; ++i)
- if (IOMMU_PTE_PRESENT(pte[i]))
- return -EBUSY;
+ freelist = free_clear_pte(&pte[i], pte[i], freelist);
+
+ if (freelist != NULL)
+ dom->updated = true;
if (count > 1) {
__pte = PAGE_SIZE_PTE(__sme_set(phys_addr), page_size);
@@ -1504,6 +1633,9 @@ static int iommu_map_page(struct protection_domain *dom,
update_domain(dom);
+ /* Everything flushed out, free pages now */
+ free_page_list(freelist);
+
return 0;
}
@@ -1636,67 +1768,6 @@ static void domain_id_free(int id)
spin_unlock(&pd_bitmap_lock);
}
-#define DEFINE_FREE_PT_FN(LVL, FN) \
-static void free_pt_##LVL (unsigned long __pt) \
-{ \
- unsigned long p; \
- u64 *pt; \
- int i; \
- \
- pt = (u64 *)__pt; \
- \
- for (i = 0; i < 512; ++i) { \
- /* PTE present? */ \
- if (!IOMMU_PTE_PRESENT(pt[i])) \
- continue; \
- \
- /* Large PTE? */ \
- if (PM_PTE_LEVEL(pt[i]) == 0 || \
- PM_PTE_LEVEL(pt[i]) == 7) \
- continue; \
- \
- p = (unsigned long)IOMMU_PTE_PAGE(pt[i]); \
- FN(p); \
- } \
- free_page((unsigned long)pt); \
-}
-
-DEFINE_FREE_PT_FN(l2, free_page)
-DEFINE_FREE_PT_FN(l3, free_pt_l2)
-DEFINE_FREE_PT_FN(l4, free_pt_l3)
-DEFINE_FREE_PT_FN(l5, free_pt_l4)
-DEFINE_FREE_PT_FN(l6, free_pt_l5)
-
-static void free_pagetable(struct protection_domain *domain)
-{
- unsigned long root = (unsigned long)domain->pt_root;
-
- switch (domain->mode) {
- case PAGE_MODE_NONE:
- break;
- case PAGE_MODE_1_LEVEL:
- free_page(root);
- break;
- case PAGE_MODE_2_LEVEL:
- free_pt_l2(root);
- break;
- case PAGE_MODE_3_LEVEL:
- free_pt_l3(root);
- break;
- case PAGE_MODE_4_LEVEL:
- free_pt_l4(root);
- break;
- case PAGE_MODE_5_LEVEL:
- free_pt_l5(root);
- break;
- case PAGE_MODE_6_LEVEL:
- free_pt_l6(root);
- break;
- default:
- BUG();
- }
-}
-
static void free_gcr3_tbl_level1(u64 *tbl)
{
u64 *ptr;
@@ -2771,9 +2842,9 @@ int __init amd_iommu_init_dma_ops(void)
iommu_detected = 1;
if (amd_iommu_unmap_flush)
- pr_info("AMD-Vi: IO/TLB flush on unmap enabled\n");
+ pr_info("IO/TLB flush on unmap enabled\n");
else
- pr_info("AMD-Vi: Lazy IO/TLB flushing enabled\n");
+ pr_info("Lazy IO/TLB flushing enabled\n");
return 0;
@@ -2878,7 +2949,7 @@ static struct iommu_domain *amd_iommu_domain_alloc(unsigned type)
case IOMMU_DOMAIN_DMA:
dma_domain = dma_ops_domain_alloc();
if (!dma_domain) {
- pr_err("AMD-Vi: Failed to allocate\n");
+ pr_err("Failed to allocate\n");
return NULL;
}
pdomain = &dma_domain->domain;
@@ -4299,7 +4370,7 @@ static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *vcpu_info)
* legacy mode. So, we force legacy mode instead.
*/
if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)) {
- pr_debug("AMD-Vi: %s: Fall back to using intr legacy remap\n",
+ pr_debug("%s: Fall back to using intr legacy remap\n",
__func__);
pi_data->is_guest_mode = false;
}