diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2009-01-03 23:03:52 +0300 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2009-01-03 23:03:52 +0300 |
commit | 269b012321f2f1f8e4648c43a93bf432b42c6668 (patch) | |
tree | e30ec565db50d4aec570cb52e9b29bd584beb2fd /arch/x86 | |
parent | f60a0a79846abed04ad5abddb5dafd14b66e1ab0 (diff) | |
parent | 065a6d68c71af2a3bdd080fa5aa353b76eede8f5 (diff) | |
download | linux-269b012321f2f1f8e4648c43a93bf432b42c6668.tar.xz |
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/joro/linux-2.6-iommu
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/joro/linux-2.6-iommu: (89 commits)
AMD IOMMU: remove now unnecessary #ifdefs
AMD IOMMU: prealloc_protection_domains should be static
kvm/iommu: fix compile warning
AMD IOMMU: add statistics about total number of map requests
AMD IOMMU: add statistics about allocated io memory
AMD IOMMU: add stats counter for domain tlb flushes
AMD IOMMU: add stats counter for single iommu domain tlb flushes
AMD IOMMU: add stats counter for cross-page request
AMD IOMMU: add stats counter for free_coherent requests
AMD IOMMU: add stats counter for alloc_coherent requests
AMD IOMMU: add stats counter for unmap_sg requests
AMD IOMMU: add stats counter for map_sg requests
AMD IOMMU: add stats counter for unmap_single requests
AMD IOMMU: add stats counter for map_single requests
AMD IOMMU: add stats counter for completion wait events
AMD IOMMU: add init code for statistic collection
AMD IOMMU: add necessary header defines for stats counting
AMD IOMMU: add Kconfig entry for statistic collection code
AMD IOMMU: use dev_name in iommu_enable function
AMD IOMMU: use calc_devid in prealloc_protection_domains
...
Diffstat (limited to 'arch/x86')
-rw-r--r-- | arch/x86/Kconfig | 13 | ||||
-rw-r--r-- | arch/x86/include/asm/amd_iommu_types.h | 61 | ||||
-rw-r--r-- | arch/x86/include/asm/kvm_host.h | 2 | ||||
-rw-r--r-- | arch/x86/kernel/amd_iommu.c | 664 | ||||
-rw-r--r-- | arch/x86/kernel/amd_iommu_init.c | 15 | ||||
-rw-r--r-- | arch/x86/kvm/Makefile | 4 | ||||
-rw-r--r-- | arch/x86/kvm/x86.c | 3 |
7 files changed, 672 insertions, 90 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 249d1e0824b5..862adb9bf0d4 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -586,6 +586,16 @@ config AMD_IOMMU your BIOS for an option to enable it or if you have an IVRS ACPI table. +config AMD_IOMMU_STATS + bool "Export AMD IOMMU statistics to debugfs" + depends on AMD_IOMMU + select DEBUG_FS + help + This option enables code in the AMD IOMMU driver to collect various + statistics about whats happening in the driver and exports that + information to userspace via debugfs. + If unsure, say N. + # need this always selected by IOMMU for the VIA workaround config SWIOTLB def_bool y if X86_64 @@ -599,6 +609,9 @@ config SWIOTLB config IOMMU_HELPER def_bool (CALGARY_IOMMU || GART_IOMMU || SWIOTLB || AMD_IOMMU) +config IOMMU_API + def_bool (AMD_IOMMU || DMAR) + config MAXSMP bool "Configure Maximum number of SMP Processors and NUMA Nodes" depends on X86_64 && SMP && DEBUG_KERNEL && EXPERIMENTAL diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h index ac302a2fa339..95c8cd9d22b5 100644 --- a/arch/x86/include/asm/amd_iommu_types.h +++ b/arch/x86/include/asm/amd_iommu_types.h @@ -190,16 +190,23 @@ /* FIXME: move this macro to <linux/pci.h> */ #define PCI_BUS(x) (((x) >> 8) & 0xff) +/* Protection domain flags */ +#define PD_DMA_OPS_MASK (1UL << 0) /* domain used for dma_ops */ +#define PD_DEFAULT_MASK (1UL << 1) /* domain is a default dma_ops + domain for an IOMMU */ + /* * This structure contains generic data for IOMMU protection domains * independent of their use. */ struct protection_domain { - spinlock_t lock; /* mostly used to lock the page table*/ - u16 id; /* the domain id written to the device table */ - int mode; /* paging mode (0-6 levels) */ - u64 *pt_root; /* page table root pointer */ - void *priv; /* private data */ + spinlock_t lock; /* mostly used to lock the page table*/ + u16 id; /* the domain id written to the device table */ + int mode; /* paging mode (0-6 levels) */ + u64 *pt_root; /* page table root pointer */ + unsigned long flags; /* flags to find out type of domain */ + unsigned dev_cnt; /* devices assigned to this domain */ + void *priv; /* private data */ }; /* @@ -295,7 +302,7 @@ struct amd_iommu { bool int_enabled; /* if one, we need to send a completion wait command */ - int need_sync; + bool need_sync; /* default dma_ops domain for that IOMMU */ struct dma_ops_domain *default_dom; @@ -374,7 +381,7 @@ extern struct protection_domain **amd_iommu_pd_table; extern unsigned long *amd_iommu_pd_alloc_bitmap; /* will be 1 if device isolation is enabled */ -extern int amd_iommu_isolate; +extern bool amd_iommu_isolate; /* * If true, the addresses will be flushed on unmap time, not when @@ -382,18 +389,6 @@ extern int amd_iommu_isolate; */ extern bool amd_iommu_unmap_flush; -/* takes a PCI device id and prints it out in a readable form */ -static inline void print_devid(u16 devid, int nl) -{ - int bus = devid >> 8; - int dev = devid >> 3 & 0x1f; - int fn = devid & 0x07; - - printk("%02x:%02x.%x", bus, dev, fn); - if (nl) - printk("\n"); -} - /* takes bus and device/function and returns the device id * FIXME: should that be in generic PCI code? */ static inline u16 calc_devid(u8 bus, u8 devfn) @@ -401,4 +396,32 @@ static inline u16 calc_devid(u8 bus, u8 devfn) return (((u16)bus) << 8) | devfn; } +#ifdef CONFIG_AMD_IOMMU_STATS + +struct __iommu_counter { + char *name; + struct dentry *dent; + u64 value; +}; + +#define DECLARE_STATS_COUNTER(nm) \ + static struct __iommu_counter nm = { \ + .name = #nm, \ + } + +#define INC_STATS_COUNTER(name) name.value += 1 +#define ADD_STATS_COUNTER(name, x) name.value += (x) +#define SUB_STATS_COUNTER(name, x) name.value -= (x) + +#else /* CONFIG_AMD_IOMMU_STATS */ + +#define DECLARE_STATS_COUNTER(name) +#define INC_STATS_COUNTER(name) +#define ADD_STATS_COUNTER(name, x) +#define SUB_STATS_COUNTER(name, x) + +static inline void amd_iommu_stats_init(void) { } + +#endif /* CONFIG_AMD_IOMMU_STATS */ + #endif /* _ASM_X86_AMD_IOMMU_TYPES_H */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 97215a458e5f..730843d1d2fb 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -360,7 +360,7 @@ struct kvm_arch{ struct list_head active_mmu_pages; struct list_head assigned_dev_head; struct list_head oos_global_pages; - struct dmar_domain *intel_iommu_domain; + struct iommu_domain *iommu_domain; struct kvm_pic *vpic; struct kvm_ioapic *vioapic; struct kvm_pit *vpit; diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 658e29e0f49b..5113c080f0c4 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -20,8 +20,12 @@ #include <linux/pci.h> #include <linux/gfp.h> #include <linux/bitops.h> +#include <linux/debugfs.h> #include <linux/scatterlist.h> #include <linux/iommu-helper.h> +#ifdef CONFIG_IOMMU_API +#include <linux/iommu.h> +#endif #include <asm/proto.h> #include <asm/iommu.h> #include <asm/gart.h> @@ -38,6 +42,10 @@ static DEFINE_RWLOCK(amd_iommu_devtable_lock); static LIST_HEAD(iommu_pd_list); static DEFINE_SPINLOCK(iommu_pd_list_lock); +#ifdef CONFIG_IOMMU_API +static struct iommu_ops amd_iommu_ops; +#endif + /* * general struct to manage commands send to an IOMMU */ @@ -47,6 +55,68 @@ struct iommu_cmd { static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, struct unity_map_entry *e); +static struct dma_ops_domain *find_protection_domain(u16 devid); + + +#ifdef CONFIG_AMD_IOMMU_STATS + +/* + * Initialization code for statistics collection + */ + +DECLARE_STATS_COUNTER(compl_wait); +DECLARE_STATS_COUNTER(cnt_map_single); +DECLARE_STATS_COUNTER(cnt_unmap_single); +DECLARE_STATS_COUNTER(cnt_map_sg); +DECLARE_STATS_COUNTER(cnt_unmap_sg); +DECLARE_STATS_COUNTER(cnt_alloc_coherent); +DECLARE_STATS_COUNTER(cnt_free_coherent); +DECLARE_STATS_COUNTER(cross_page); +DECLARE_STATS_COUNTER(domain_flush_single); +DECLARE_STATS_COUNTER(domain_flush_all); +DECLARE_STATS_COUNTER(alloced_io_mem); +DECLARE_STATS_COUNTER(total_map_requests); + +static struct dentry *stats_dir; +static struct dentry *de_isolate; +static struct dentry *de_fflush; + +static void amd_iommu_stats_add(struct __iommu_counter *cnt) +{ + if (stats_dir == NULL) + return; + + cnt->dent = debugfs_create_u64(cnt->name, 0444, stats_dir, + &cnt->value); +} + +static void amd_iommu_stats_init(void) +{ + stats_dir = debugfs_create_dir("amd-iommu", NULL); + if (stats_dir == NULL) + return; + + de_isolate = debugfs_create_bool("isolation", 0444, stats_dir, + (u32 *)&amd_iommu_isolate); + + de_fflush = debugfs_create_bool("fullflush", 0444, stats_dir, + (u32 *)&amd_iommu_unmap_flush); + + amd_iommu_stats_add(&compl_wait); + amd_iommu_stats_add(&cnt_map_single); + amd_iommu_stats_add(&cnt_unmap_single); + amd_iommu_stats_add(&cnt_map_sg); + amd_iommu_stats_add(&cnt_unmap_sg); + amd_iommu_stats_add(&cnt_alloc_coherent); + amd_iommu_stats_add(&cnt_free_coherent); + amd_iommu_stats_add(&cross_page); + amd_iommu_stats_add(&domain_flush_single); + amd_iommu_stats_add(&domain_flush_all); + amd_iommu_stats_add(&alloced_io_mem); + amd_iommu_stats_add(&total_map_requests); +} + +#endif /* returns !0 if the IOMMU is caching non-present entries in its TLB */ static int iommu_has_npcache(struct amd_iommu *iommu) @@ -189,13 +259,55 @@ static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd) spin_lock_irqsave(&iommu->lock, flags); ret = __iommu_queue_command(iommu, cmd); if (!ret) - iommu->need_sync = 1; + iommu->need_sync = true; spin_unlock_irqrestore(&iommu->lock, flags); return ret; } /* + * This function waits until an IOMMU has completed a completion + * wait command + */ +static void __iommu_wait_for_completion(struct amd_iommu *iommu) +{ + int ready = 0; + unsigned status = 0; + unsigned long i = 0; + + INC_STATS_COUNTER(compl_wait); + + while (!ready && (i < EXIT_LOOP_COUNT)) { + ++i; + /* wait for the bit to become one */ + status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET); + ready = status & MMIO_STATUS_COM_WAIT_INT_MASK; + } + + /* set bit back to zero */ + status &= ~MMIO_STATUS_COM_WAIT_INT_MASK; + writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET); + + if (unlikely(i == EXIT_LOOP_COUNT)) + panic("AMD IOMMU: Completion wait loop failed\n"); +} + +/* + * This function queues a completion wait command into the command + * buffer of an IOMMU + */ +static int __iommu_completion_wait(struct amd_iommu *iommu) +{ + struct iommu_cmd cmd; + + memset(&cmd, 0, sizeof(cmd)); + cmd.data[0] = CMD_COMPL_WAIT_INT_MASK; + CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT); + + return __iommu_queue_command(iommu, &cmd); +} + +/* * This function is called whenever we need to ensure that the IOMMU has * completed execution of all commands we sent. It sends a * COMPLETION_WAIT command and waits for it to finish. The IOMMU informs @@ -204,40 +316,22 @@ static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd) */ static int iommu_completion_wait(struct amd_iommu *iommu) { - int ret = 0, ready = 0; - unsigned status = 0; - struct iommu_cmd cmd; - unsigned long flags, i = 0; - - memset(&cmd, 0, sizeof(cmd)); - cmd.data[0] = CMD_COMPL_WAIT_INT_MASK; - CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT); + int ret = 0; + unsigned long flags; spin_lock_irqsave(&iommu->lock, flags); if (!iommu->need_sync) goto out; - iommu->need_sync = 0; + ret = __iommu_completion_wait(iommu); - ret = __iommu_queue_command(iommu, &cmd); + iommu->need_sync = false; if (ret) goto out; - while (!ready && (i < EXIT_LOOP_COUNT)) { - ++i; - /* wait for the bit to become one */ - status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET); - ready = status & MMIO_STATUS_COM_WAIT_INT_MASK; - } - - /* set bit back to zero */ - status &= ~MMIO_STATUS_COM_WAIT_INT_MASK; - writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET); - - if (unlikely(i == EXIT_LOOP_COUNT)) - panic("AMD IOMMU: Completion wait loop failed\n"); + __iommu_wait_for_completion(iommu); out: spin_unlock_irqrestore(&iommu->lock, flags); @@ -264,6 +358,21 @@ static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid) return ret; } +static void __iommu_build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address, + u16 domid, int pde, int s) +{ + memset(cmd, 0, sizeof(*cmd)); + address &= PAGE_MASK; + CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES); + cmd->data[1] |= domid; + cmd->data[2] = lower_32_bits(address); + cmd->data[3] = upper_32_bits(address); + if (s) /* size bit - we flush more than one 4kb page */ + cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK; + if (pde) /* PDE bit - we wan't flush everything not only the PTEs */ + cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK; +} + /* * Generic command send function for invalidaing TLB entries */ @@ -273,16 +382,7 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu, struct iommu_cmd cmd; int ret; - memset(&cmd, 0, sizeof(cmd)); - address &= PAGE_MASK; - CMD_SET_TYPE(&cmd, CMD_INV_IOMMU_PAGES); - cmd.data[1] |= domid; - cmd.data[2] = lower_32_bits(address); - cmd.data[3] = upper_32_bits(address); - if (s) /* size bit - we flush more than one 4kb page */ - cmd.data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK; - if (pde) /* PDE bit - we wan't flush everything not only the PTEs */ - cmd.data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK; + __iommu_build_inv_iommu_pages(&cmd, address, domid, pde, s); ret = iommu_queue_command(iommu, &cmd); @@ -321,9 +421,35 @@ static void iommu_flush_tlb(struct amd_iommu *iommu, u16 domid) { u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS; + INC_STATS_COUNTER(domain_flush_single); + iommu_queue_inv_iommu_pages(iommu, address, domid, 0, 1); } +/* + * This function is used to flush the IO/TLB for a given protection domain + * on every IOMMU in the system + */ +static void iommu_flush_domain(u16 domid) +{ + unsigned long flags; + struct amd_iommu *iommu; + struct iommu_cmd cmd; + + INC_STATS_COUNTER(domain_flush_all); + + __iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, + domid, 1, 1); + + list_for_each_entry(iommu, &amd_iommu_list, list) { + spin_lock_irqsave(&iommu->lock, flags); + __iommu_queue_command(iommu, &cmd); + __iommu_completion_wait(iommu); + __iommu_wait_for_completion(iommu); + spin_unlock_irqrestore(&iommu->lock, flags); + } +} + /**************************************************************************** * * The functions below are used the create the page table mappings for @@ -338,10 +464,10 @@ static void iommu_flush_tlb(struct amd_iommu *iommu, u16 domid) * supporting all features of AMD IOMMU page tables like level skipping * and full 64 bit address spaces. */ -static int iommu_map(struct protection_domain *dom, - unsigned long bus_addr, - unsigned long phys_addr, - int prot) +static int iommu_map_page(struct protection_domain *dom, + unsigned long bus_addr, + unsigned long phys_addr, + int prot) { u64 __pte, *pte, *page; @@ -388,6 +514,28 @@ static int iommu_map(struct protection_domain *dom, return 0; } +static void iommu_unmap_page(struct protection_domain *dom, + unsigned long bus_addr) +{ + u64 *pte; + + pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(bus_addr)]; + + if (!IOMMU_PTE_PRESENT(*pte)) + return; + + pte = IOMMU_PTE_PAGE(*pte); + pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)]; + + if (!IOMMU_PTE_PRESENT(*pte)) + return; + + pte = IOMMU_PTE_PAGE(*pte); + pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)]; + + *pte = 0; +} + /* * This function checks if a specific unity mapping entry is needed for * this specific IOMMU. @@ -440,7 +588,7 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, for (addr = e->address_start; addr < e->address_end; addr += PAGE_SIZE) { - ret = iommu_map(&dma_dom->domain, addr, addr, e->prot); + ret = iommu_map_page(&dma_dom->domain, addr, addr, e->prot); if (ret) return ret; /* @@ -571,6 +719,16 @@ static u16 domain_id_alloc(void) return id; } +static void domain_id_free(int id) +{ + unsigned long flags; + + write_lock_irqsave(&amd_iommu_devtable_lock, flags); + if (id > 0 && id < MAX_DOMAIN_ID) + __clear_bit(id, amd_iommu_pd_alloc_bitmap); + write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); +} + /* * Used to reserve address ranges in the aperture (e.g. for exclusion * ranges. @@ -587,12 +745,12 @@ static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, iommu_area_reserve(dom->bitmap, start_page, pages); } -static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom) +static void free_pagetable(struct protection_domain *domain) { int i, j; u64 *p1, *p2, *p3; - p1 = dma_dom->domain.pt_root; + p1 = domain->pt_root; if (!p1) return; @@ -613,6 +771,8 @@ static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom) } free_page((unsigned long)p1); + + domain->pt_root = NULL; } /* @@ -624,7 +784,7 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom) if (!dom) return; - dma_ops_free_pagetable(dom); + free_pagetable(&dom->domain); kfree(dom->pte_pages); @@ -663,6 +823,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, goto free_dma_dom; dma_dom->domain.mode = PAGE_MODE_3_LEVEL; dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL); + dma_dom->domain.flags = PD_DMA_OPS_MASK; dma_dom->domain.priv = dma_dom; if (!dma_dom->domain.pt_root) goto free_dma_dom; @@ -725,6 +886,15 @@ free_dma_dom: } /* + * little helper function to check whether a given protection domain is a + * dma_ops domain + */ +static bool dma_ops_domain(struct protection_domain *domain) +{ + return domain->flags & PD_DMA_OPS_MASK; +} + +/* * Find out the protection domain structure for a given PCI device. This * will give us the pointer to the page table root for example. */ @@ -744,14 +914,15 @@ static struct protection_domain *domain_for_device(u16 devid) * If a device is not yet associated with a domain, this function does * assigns it visible for the hardware */ -static void set_device_domain(struct amd_iommu *iommu, - struct protection_domain *domain, - u16 devid) +static void attach_device(struct amd_iommu *iommu, + struct protection_domain *domain, + u16 devid) { unsigned long flags; - u64 pte_root = virt_to_phys(domain->pt_root); + domain->dev_cnt += 1; + pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK) << DEV_ENTRY_MODE_SHIFT; pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV; @@ -767,6 +938,116 @@ static void set_device_domain(struct amd_iommu *iommu, iommu_queue_inv_dev_entry(iommu, devid); } +/* + * Removes a device from a protection domain (unlocked) + */ +static void __detach_device(struct protection_domain *domain, u16 devid) +{ + + /* lock domain */ + spin_lock(&domain->lock); + + /* remove domain from the lookup table */ + amd_iommu_pd_table[devid] = NULL; + + /* remove entry from the device table seen by the hardware */ + amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV; + amd_iommu_dev_table[devid].data[1] = 0; + amd_iommu_dev_table[devid].data[2] = 0; + + /* decrease reference counter */ + domain->dev_cnt -= 1; + + /* ready */ + spin_unlock(&domain->lock); +} + +/* + * Removes a device from a protection domain (with devtable_lock held) + */ +static void detach_device(struct protection_domain *domain, u16 devid) +{ + unsigned long flags; + + /* lock device table */ + write_lock_irqsave(&amd_iommu_devtable_lock, flags); + __detach_device(domain, devid); + write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); +} + +static int device_change_notifier(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct device *dev = data; + struct pci_dev *pdev = to_pci_dev(dev); + u16 devid = calc_devid(pdev->bus->number, pdev->devfn); + struct protection_domain *domain; + struct dma_ops_domain *dma_domain; + struct amd_iommu *iommu; + int order = amd_iommu_aperture_order; + unsigned long flags; + + if (devid > amd_iommu_last_bdf) + goto out; + + devid = amd_iommu_alias_table[devid]; + + iommu = amd_iommu_rlookup_table[devid]; + if (iommu == NULL) + goto out; + + domain = domain_for_device(devid); + + if (domain && !dma_ops_domain(domain)) + WARN_ONCE(1, "AMD IOMMU WARNING: device %s already bound " + "to a non-dma-ops domain\n", dev_name(dev)); + + switch (action) { + case BUS_NOTIFY_BOUND_DRIVER: + if (domain) + goto out; + dma_domain = find_protection_domain(devid); + if (!dma_domain) + dma_domain = iommu->default_dom; + attach_device(iommu, &dma_domain->domain, devid); + printk(KERN_INFO "AMD IOMMU: Using protection domain %d for " + "device %s\n", dma_domain->domain.id, dev_name(dev)); + break; + case BUS_NOTIFY_UNBIND_DRIVER: + if (!domain) + goto out; + detach_device(domain, devid); + break; + case BUS_NOTIFY_ADD_DEVICE: + /* allocate a protection domain if a device is added */ + dma_domain = find_protection_domain(devid); + if (dma_domain) + goto out; + dma_domain = dma_ops_domain_alloc(iommu, order); + if (!dma_domain) + goto out; + dma_domain->target_dev = devid; + + spin_lock_irqsave(&iommu_pd_list_lock, flags); + list_add_tail(&dma_domain->list, &iommu_pd_list); + spin_unlock_irqrestore(&iommu_pd_list_lock, flags); + + break; + default: + goto out; + } + + iommu_queue_inv_dev_entry(iommu, devid); + iommu_completion_wait(iommu); + +out: + return 0; +} + +struct notifier_block device_nb = { + .notifier_call = device_change_notifier, +}; + /***************************************************************************** * * The next functions belong to the dma_ops mapping/unmapping code. @@ -802,7 +1083,6 @@ static struct dma_ops_domain *find_protection_domain(u16 devid) list_for_each_entry(entry, &iommu_pd_list, list) { if (entry->target_dev == devid) { ret = entry; - list_del(&ret->list); break; } } @@ -853,14 +1133,13 @@ static int get_device_resources(struct device *dev, if (!dma_dom) dma_dom = (*iommu)->default_dom; *domain = &dma_dom->domain; - set_device_domain(*iommu, *domain, *bdf); + attach_device(*iommu, *domain, *bdf); printk(KERN_INFO "AMD IOMMU: Using protection domain %d for " - "device ", (*domain)->id); - print_devid(_bdf, 1); + "device %s\n", (*domain)->id, dev_name(dev)); } if (domain_for_device(_bdf) == NULL) - set_device_domain(*iommu, *domain, _bdf); + attach_device(*iommu, *domain, _bdf); return 1; } @@ -946,6 +1225,11 @@ static dma_addr_t __map_single(struct device *dev, pages = iommu_num_pages(paddr, size, PAGE_SIZE); paddr &= PAGE_MASK; + INC_STATS_COUNTER(total_map_requests); + + if (pages > 1) + INC_STATS_COUNTER(cross_page); + if (align) align_mask = (1UL << get_order(size)) - 1; @@ -962,6 +1246,8 @@ static dma_addr_t __map_single(struct device *dev, } address += offset; + ADD_STATS_COUNTER(alloced_io_mem, size); + if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) { iommu_flush_tlb(iommu, dma_dom->domain.id); dma_dom->need_flush = false; @@ -998,6 +1284,8 @@ static void __unmap_single(struct amd_iommu *iommu, start += PAGE_SIZE; } + SUB_STATS_COUNTER(alloced_io_mem, size); + dma_ops_free_addresses(dma_dom, dma_addr, pages); if (amd_iommu_unmap_flush || dma_dom->need_flush) { @@ -1019,6 +1307,8 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr, dma_addr_t addr; u64 dma_mask; + INC_STATS_COUNTER(cnt_map_single); + if (!check_device(dev)) return bad_dma_address; @@ -1030,6 +1320,9 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr, /* device not handled by any AMD IOMMU */ return (dma_addr_t)paddr; + if (!dma_ops_domain(domain)) + return bad_dma_address; + spin_lock_irqsave(&domain->lock, flags); addr = __map_single(dev, iommu, domain->priv, paddr, size, dir, false, dma_mask); @@ -1055,11 +1348,16 @@ static void unmap_single(struct device *dev, dma_addr_t dma_addr, struct protection_domain *domain; u16 devid; + INC_STATS_COUNTER(cnt_unmap_single); + if (!check_device(dev) || !get_device_resources(dev, &iommu, &domain, &devid)) /* device not handled by any AMD IOMMU */ return; + if (!dma_ops_domain(domain)) + return; + spin_lock_irqsave(&domain->lock, flags); __unmap_single(iommu, domain->priv, dma_addr, size, dir); @@ -1104,6 +1402,8 @@ static int map_sg(struct device *dev, struct scatterlist *sglist, int mapped_elems = 0; u64 dma_mask; + INC_STATS_COUNTER(cnt_map_sg); + if (!check_device(dev)) return 0; @@ -1114,6 +1414,9 @@ static int map_sg(struct device *dev, struct scatterlist *sglist, if (!iommu || !domain) return map_sg_no_iommu(dev, sglist, nelems, dir); + if (!dma_ops_domain(domain)) + return 0; + spin_lock_irqsave(&domain->lock, flags); for_each_sg(sglist, s, nelems, i) { @@ -1163,10 +1466,15 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist, u16 devid; int i; + INC_STATS_COUNTER(cnt_unmap_sg); + if (!check_device(dev) || !get_device_resources(dev, &iommu, &domain, &devid)) return; + if (!dma_ops_domain(domain)) + return; + spin_lock_irqsave(&domain->lock, flags); for_each_sg(sglist, s, nelems, i) { @@ -1194,6 +1502,8 @@ static void *alloc_coherent(struct device *dev, size_t size, phys_addr_t paddr; u64 dma_mask = dev->coherent_dma_mask; + INC_STATS_COUNTER(cnt_alloc_coherent); + if (!check_device(dev)) return NULL; @@ -1212,6 +1522,9 @@ static void *alloc_coherent(struct device *dev, size_t size, return virt_addr; } + if (!dma_ops_domain(domain)) + goto out_free; + if (!dma_mask) dma_mask = *dev->dma_mask; @@ -1220,18 +1533,20 @@ static void *alloc_coherent(struct device *dev, size_t size, *dma_addr = __map_single(dev, iommu, domain->priv, paddr, size, DMA_BIDIRECTIONAL, true, dma_mask); - if (*dma_addr == bad_dma_address) { - free_pages((unsigned long)virt_addr, get_order(size)); - virt_addr = NULL; - goto out; - } + if (*dma_addr == bad_dma_address) + goto out_free; iommu_completion_wait(iommu); -out: spin_unlock_irqrestore(&domain->lock, flags); return virt_addr; + +out_free: + + free_pages((unsigned long)virt_addr, get_order(size)); + + return NULL; } /* @@ -1245,6 +1560,8 @@ static void free_coherent(struct device *dev, size_t size, struct protection_domain *domain; u16 devid; + INC_STATS_COUNTER(cnt_free_coherent); + if (!check_device(dev)) return; @@ -1253,6 +1570,9 @@ static void free_coherent(struct device *dev, size_t size, if (!iommu || !domain) goto free_mem; + if (!dma_ops_domain(domain)) + goto free_mem; + spin_lock_irqsave(&domain->lock, flags); __unmap_single(iommu, domain->priv, dma_addr, size, DMA_BIDIRECTIONAL); @@ -1305,7 +1625,7 @@ static void prealloc_protection_domains(void) u16 devid; while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { - devid = (dev->bus->number << 8) | dev->devfn; + devid = calc_devid(dev->bus->number, dev->devfn); if (devid > amd_iommu_last_bdf) continue; devid = amd_iommu_alias_table[devid]; @@ -1352,6 +1672,7 @@ int __init amd_iommu_init_dma_ops(void) iommu->default_dom = dma_ops_domain_alloc(iommu, order); if (iommu->default_dom == NULL) return -ENOMEM; + iommu->default_dom->domain.flags |= PD_DEFAULT_MASK; ret = iommu_init_unity_mappings(iommu); if (ret) goto free_domains; @@ -1375,6 +1696,12 @@ int __init amd_iommu_init_dma_ops(void) /* Make the driver finally visible to the drivers */ dma_ops = &amd_iommu_dma_ops; + register_iommu(&amd_iommu_ops); + + bus_register_notifier(&pci_bus_type, &device_nb); + + amd_iommu_stats_init(); + return 0; free_domains: @@ -1386,3 +1713,224 @@ free_domains: return ret; } + +/***************************************************************************** + * + * The following functions belong to the exported interface of AMD IOMMU + * + * This interface allows access to lower level functions of the IOMMU + * like protection domain handling and assignement of devices to domains + * which is not possible with the dma_ops interface. + * + *****************************************************************************/ + +static void cleanup_domain(struct protection_domain *domain) +{ + unsigned long flags; + u16 devid; + + write_lock_irqsave(&amd_iommu_devtable_lock, flags); + + for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) + if (amd_iommu_pd_table[devid] == domain) + __detach_device(domain, devid); + + write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); +} + +static int amd_iommu_domain_init(struct iommu_domain *dom) +{ + struct protection_domain *domain; + + domain = kzalloc(sizeof(*domain), GFP_KERNEL); + if (!domain) + return -ENOMEM; + + spin_lock_init(&domain->lock); + domain->mode = PAGE_MODE_3_LEVEL; + domain->id = domain_id_alloc(); + if (!domain->id) + goto out_free; + domain->pt_root = (void *)get_zeroed_page(GFP_KERNEL); + if (!domain->pt_root) + goto out_free; + + dom->priv = domain; + + return 0; + +out_free: + kfree(domain); + + return -ENOMEM; +} + +static void amd_iommu_domain_destroy(struct iommu_domain *dom) +{ + struct protection_domain *domain = dom->priv; + + if (!domain) + return; + + if (domain->dev_cnt > 0) + cleanup_domain(domain); + + BUG_ON(domain->dev_cnt != 0); + + free_pagetable(domain); + + domain_id_free(domain->id); + + kfree(domain); + + dom->priv = NULL; +} + +static void amd_iommu_detach_device(struct iommu_domain *dom, + struct device *dev) +{ + struct protection_domain *domain = dom->priv; + struct amd_iommu *iommu; + struct pci_dev *pdev; + u16 devid; + + if (dev->bus != &pci_bus_type) + return; + + pdev = to_pci_dev(dev); + + devid = calc_devid(pdev->bus->number, pdev->devfn); + + if (devid > 0) + detach_device(domain, devid); + + iommu = amd_iommu_rlookup_table[devid]; + if (!iommu) + return; + + iommu_queue_inv_dev_entry(iommu, devid); + iommu_completion_wait(iommu); +} + +static int amd_iommu_attach_device(struct iommu_domain *dom, + struct device *dev) +{ + struct protection_domain *domain = dom->priv; + struct protection_domain *old_domain; + struct amd_iommu *iommu; + struct pci_dev *pdev; + u16 devid; + + if (dev->bus != &pci_bus_type) + return -EINVAL; + + pdev = to_pci_dev(dev); + + devid = calc_devid(pdev->bus->number, pdev->devfn); + + if (devid >= amd_iommu_last_bdf || + devid != amd_iommu_alias_table[devid]) + return -EINVAL; + + iommu = amd_iommu_rlookup_table[devid]; + if (!iommu) + return -EINVAL; + + old_domain = domain_for_device(devid); + if (old_domain) + return -EBUSY; + + attach_device(iommu, domain, devid); + + iommu_completion_wait(iommu); + + return 0; +} + +static int amd_iommu_map_range(struct iommu_domain *dom, + unsigned long iova, phys_addr_t paddr, + size_t size, int iommu_prot) +{ + struct protection_domain *domain = dom->priv; + unsigned long i, npages = iommu_num_pages(paddr, size, PAGE_SIZE); + int prot = 0; + int ret; + + if (iommu_prot & IOMMU_READ) + prot |= IOMMU_PROT_IR; + if (iommu_prot & IOMMU_WRITE) + prot |= IOMMU_PROT_IW; + + iova &= PAGE_MASK; + paddr &= PAGE_MASK; + + for (i = 0; i < npages; ++i) { + ret = iommu_map_page(domain, iova, paddr, prot); + if (ret) + return ret; + + iova += PAGE_SIZE; + paddr += PAGE_SIZE; + } + + return 0; +} + +static void amd_iommu_unmap_range(struct iommu_domain *dom, + unsigned long iova, size_t size) +{ + + struct protection_domain *domain = dom->priv; + unsigned long i, npages = iommu_num_pages(iova, size, PAGE_SIZE); + + iova &= PAGE_MASK; + + for (i = 0; i < npages; ++i) { + iommu_unmap_page(domain, iova); + iova += PAGE_SIZE; + } + + iommu_flush_domain(domain->id); +} + +static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom, + unsigned long iova) +{ + struct protection_domain *domain = dom->priv; + unsigned long offset = iova & ~PAGE_MASK; + phys_addr_t paddr; + u64 *pte; + + pte = &domain->pt_root[IOMMU_PTE_L2_INDEX(iova)]; + + if (!IOMMU_PTE_PRESENT(*pte)) + return 0; + + pte = IOMMU_PTE_PAGE(*pte); + pte = &pte[IOMMU_PTE_L1_INDEX(iova)]; + + if (!IOMMU_PTE_PRESENT(*pte)) + return 0; + + pte = IOMMU_PTE_PAGE(*pte); + pte = &pte[IOMMU_PTE_L0_INDEX(iova)]; + + if (!IOMMU_PTE_PRESENT(*pte)) + return 0; + + paddr = *pte & IOMMU_PAGE_MASK; + paddr |= offset; + + return paddr; +} + +static struct iommu_ops amd_iommu_ops = { + .domain_init = amd_iommu_domain_init, + .domain_destroy = amd_iommu_domain_destroy, + .attach_dev = amd_iommu_attach_device, + .detach_dev = amd_iommu_detach_device, + .map = amd_iommu_map_range, + .unmap = amd_iommu_unmap_range, + .iova_to_phys = amd_iommu_iova_to_phys, +}; + diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index fb85e8d466cc..42c33cebf00f 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -122,7 +122,8 @@ u16 amd_iommu_last_bdf; /* largest PCI device id we have LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings we find in ACPI */ unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */ -int amd_iommu_isolate = 1; /* if 1, device isolation is enabled */ +bool amd_iommu_isolate = true; /* if true, device isolation is + enabled */ bool amd_iommu_unmap_flush; /* if true, flush on every unmap */ LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the @@ -245,12 +246,8 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit) /* Function to enable the hardware */ static void __init iommu_enable(struct amd_iommu *iommu) { - printk(KERN_INFO "AMD IOMMU: Enabling IOMMU " - "at %02x:%02x.%x cap 0x%hx\n", - iommu->dev->bus->number, - PCI_SLOT(iommu->dev->devfn), - PCI_FUNC(iommu->dev->devfn), - iommu->cap_ptr); + printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at %s cap 0x%hx\n", + dev_name(&iommu->dev->dev), iommu->cap_ptr); iommu_feature_enable(iommu, CONTROL_IOMMU_EN); } @@ -1218,9 +1215,9 @@ static int __init parse_amd_iommu_options(char *str) { for (; *str; ++str) { if (strncmp(str, "isolate", 7) == 0) - amd_iommu_isolate = 1; + amd_iommu_isolate = true; if (strncmp(str, "share", 5) == 0) - amd_iommu_isolate = 0; + amd_iommu_isolate = false; if (strncmp(str, "fullflush", 9) == 0) amd_iommu_unmap_flush = true; } diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index c02343594b4d..d3ec292f00f2 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -7,8 +7,8 @@ common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ ifeq ($(CONFIG_KVM_TRACE),y) common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o) endif -ifeq ($(CONFIG_DMAR),y) -common-objs += $(addprefix ../../../virt/kvm/, vtd.o) +ifeq ($(CONFIG_IOMMU_API),y) +common-objs += $(addprefix ../../../virt/kvm/, iommu.o) endif EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0e6aa8141dcd..cc17546a2406 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -34,6 +34,7 @@ #include <linux/module.h> #include <linux/mman.h> #include <linux/highmem.h> +#include <linux/iommu.h> #include <linux/intel-iommu.h> #include <asm/uaccess.h> @@ -989,7 +990,7 @@ int kvm_dev_ioctl_check_extension(long ext) r = !tdp_enabled; break; case KVM_CAP_IOMMU: - r = intel_iommu_found(); + r = iommu_found(); break; default: r = 0; |