Diffstat (limited to 'drivers')
-rw-r--r-- | drivers/vfio/Kconfig                  |   1
-rw-r--r-- | drivers/vfio/vfio.c                   |   6
-rw-r--r-- | drivers/vfio/vfio_iommu_type1.c       | 656
-rw-r--r-- | drivers/xen/events/events_base.c      |  83
-rw-r--r-- | drivers/xen/events/events_internal.h  |   1
-rw-r--r-- | drivers/xen/grant-table.c             |  73
-rw-r--r-- | drivers/xen/manage.c                  |  16
-rw-r--r-- | drivers/xen/pcpu.c                    |   1
-rw-r--r-- | drivers/xen/platform-pci.c            |   2
-rw-r--r-- | drivers/xen/xen-acpi-processor.c      |  15
-rw-r--r-- | drivers/xen/xen-pciback/pciback_ops.c |   3
-rw-r--r-- | drivers/xen/xen-selfballoon.c         |   1
-rw-r--r-- | drivers/xen/xenbus/xenbus_client.c    |  27
13 files changed, 438 insertions, 447 deletions
diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig index 26b3d9d1409f..af7b204b9215 100644 --- a/drivers/vfio/Kconfig +++ b/drivers/vfio/Kconfig @@ -13,6 +13,7 @@ menuconfig VFIO depends on IOMMU_API select VFIO_IOMMU_TYPE1 if X86 select VFIO_IOMMU_SPAPR_TCE if (PPC_POWERNV || PPC_PSERIES) + select ANON_INODES help VFIO provides a framework for secure userspace device drivers. See Documentation/vfio.txt for more details. diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 21271d8df023..512f479d8a50 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -1413,6 +1413,12 @@ int vfio_external_user_iommu_id(struct vfio_group *group) } EXPORT_SYMBOL_GPL(vfio_external_user_iommu_id); +long vfio_external_check_extension(struct vfio_group *group, unsigned long arg) +{ + return vfio_ioctl_check_extension(group->container, arg); +} +EXPORT_SYMBOL_GPL(vfio_external_check_extension); + /** * Module/class support */ diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 54af4e933695..6673e7be507f 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -30,7 +30,6 @@ #include <linux/iommu.h> #include <linux/module.h> #include <linux/mm.h> -#include <linux/pci.h> /* pci_bus_type */ #include <linux/rbtree.h> #include <linux/sched.h> #include <linux/slab.h> @@ -55,11 +54,17 @@ MODULE_PARM_DESC(disable_hugepages, "Disable VFIO IOMMU support for IOMMU hugepages."); struct vfio_iommu { - struct iommu_domain *domain; + struct list_head domain_list; struct mutex lock; struct rb_root dma_list; + bool v2; +}; + +struct vfio_domain { + struct iommu_domain *domain; + struct list_head next; struct list_head group_list; - bool cache; + int prot; /* IOMMU_CACHE */ }; struct vfio_dma { @@ -99,7 +104,7 @@ static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu, return NULL; } -static void vfio_insert_dma(struct vfio_iommu *iommu, struct vfio_dma *new) +static void vfio_link_dma(struct vfio_iommu *iommu, struct vfio_dma *new) { struct rb_node **link = &iommu->dma_list.rb_node, *parent = NULL; struct vfio_dma *dma; @@ -118,7 +123,7 @@ static void vfio_insert_dma(struct vfio_iommu *iommu, struct vfio_dma *new) rb_insert_color(&new->node, &iommu->dma_list); } -static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *old) +static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old) { rb_erase(&old->node, &iommu->dma_list); } @@ -322,32 +327,39 @@ static long vfio_unpin_pages(unsigned long pfn, long npage, return unlocked; } -static int vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma, - dma_addr_t iova, size_t *size) +static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma) { - dma_addr_t start = iova, end = iova + *size; + dma_addr_t iova = dma->iova, end = dma->iova + dma->size; + struct vfio_domain *domain, *d; long unlocked = 0; + if (!dma->size) + return; + /* + * We use the IOMMU to track the physical addresses, otherwise we'd + * need a much more complicated tracking system. Unfortunately that + * means we need to use one of the iommu domains to figure out the + * pfns to unpin. The rest need to be unmapped in advance so we have + * no iommu translations remaining when the pages are unpinned. 
+ */ + domain = d = list_first_entry(&iommu->domain_list, + struct vfio_domain, next); + + list_for_each_entry_continue(d, &iommu->domain_list, next) + iommu_unmap(d->domain, dma->iova, dma->size); + while (iova < end) { size_t unmapped; phys_addr_t phys; - /* - * We use the IOMMU to track the physical address. This - * saves us from having a lot more entries in our mapping - * tree. The downside is that we don't track the size - * used to do the mapping. We request unmap of a single - * page, but expect IOMMUs that support large pages to - * unmap a larger chunk. - */ - phys = iommu_iova_to_phys(iommu->domain, iova); + phys = iommu_iova_to_phys(domain->domain, iova); if (WARN_ON(!phys)) { iova += PAGE_SIZE; continue; } - unmapped = iommu_unmap(iommu->domain, iova, PAGE_SIZE); - if (!unmapped) + unmapped = iommu_unmap(domain->domain, iova, PAGE_SIZE); + if (WARN_ON(!unmapped)) break; unlocked += vfio_unpin_pages(phys >> PAGE_SHIFT, @@ -357,119 +369,26 @@ static int vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma, } vfio_lock_acct(-unlocked); - - *size = iova - start; - - return 0; } -static int vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start, - size_t *size, struct vfio_dma *dma) +static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma) { - size_t offset, overlap, tmp; - struct vfio_dma *split; - int ret; - - if (!*size) - return 0; - - /* - * Existing dma region is completely covered, unmap all. This is - * the likely case since userspace tends to map and unmap buffers - * in one shot rather than multiple mappings within a buffer. - */ - if (likely(start <= dma->iova && - start + *size >= dma->iova + dma->size)) { - *size = dma->size; - ret = vfio_unmap_unpin(iommu, dma, dma->iova, size); - if (ret) - return ret; - - /* - * Did we remove more than we have? Should never happen - * since a vfio_dma is contiguous in iova and vaddr. - */ - WARN_ON(*size != dma->size); - - vfio_remove_dma(iommu, dma); - kfree(dma); - return 0; - } - - /* Overlap low address of existing range */ - if (start <= dma->iova) { - overlap = start + *size - dma->iova; - ret = vfio_unmap_unpin(iommu, dma, dma->iova, &overlap); - if (ret) - return ret; - - vfio_remove_dma(iommu, dma); - - /* - * Check, we may have removed to whole vfio_dma. If not - * fixup and re-insert. - */ - if (overlap < dma->size) { - dma->iova += overlap; - dma->vaddr += overlap; - dma->size -= overlap; - vfio_insert_dma(iommu, dma); - } else - kfree(dma); - - *size = overlap; - return 0; - } - - /* Overlap high address of existing range */ - if (start + *size >= dma->iova + dma->size) { - offset = start - dma->iova; - overlap = dma->size - offset; - - ret = vfio_unmap_unpin(iommu, dma, start, &overlap); - if (ret) - return ret; - - dma->size -= overlap; - *size = overlap; - return 0; - } - - /* Split existing */ - - /* - * Allocate our tracking structure early even though it may not - * be used. An Allocation failure later loses track of pages and - * is more difficult to unwind. 
- */ - split = kzalloc(sizeof(*split), GFP_KERNEL); - if (!split) - return -ENOMEM; - - offset = start - dma->iova; - - ret = vfio_unmap_unpin(iommu, dma, start, size); - if (ret || !*size) { - kfree(split); - return ret; - } - - tmp = dma->size; + vfio_unmap_unpin(iommu, dma); + vfio_unlink_dma(iommu, dma); + kfree(dma); +} - /* Resize the lower vfio_dma in place, before the below insert */ - dma->size = offset; +static unsigned long vfio_pgsize_bitmap(struct vfio_iommu *iommu) +{ + struct vfio_domain *domain; + unsigned long bitmap = PAGE_MASK; - /* Insert new for remainder, assuming it didn't all get unmapped */ - if (likely(offset + *size < tmp)) { - split->size = tmp - offset - *size; - split->iova = dma->iova + offset + *size; - split->vaddr = dma->vaddr + offset + *size; - split->prot = dma->prot; - vfio_insert_dma(iommu, split); - } else - kfree(split); + mutex_lock(&iommu->lock); + list_for_each_entry(domain, &iommu->domain_list, next) + bitmap &= domain->domain->ops->pgsize_bitmap; + mutex_unlock(&iommu->lock); - return 0; + return bitmap; } static int vfio_dma_do_unmap(struct vfio_iommu *iommu, @@ -477,10 +396,10 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu, { uint64_t mask; struct vfio_dma *dma; - size_t unmapped = 0, size; + size_t unmapped = 0; int ret = 0; - mask = ((uint64_t)1 << __ffs(iommu->domain->ops->pgsize_bitmap)) - 1; + mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1; if (unmap->iova & mask) return -EINVAL; @@ -491,20 +410,61 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu, mutex_lock(&iommu->lock); + /* + * vfio-iommu-type1 (v1) - User mappings were coalesced together to + * avoid tracking individual mappings. This means that the granularity + * of the original mapping was lost and the user was allowed to attempt + * to unmap any range. Depending on the contiguousness of physical + * memory and page sizes supported by the IOMMU, arbitrary unmaps may + * or may not have worked. We only guaranteed unmap granularity + * matching the original mapping; even though it was untracked here, + * the original mappings are reflected in IOMMU mappings. This + * resulted in a couple unusual behaviors. First, if a range is not + * able to be unmapped, ex. a set of 4k pages that was mapped as a + * 2M hugepage into the IOMMU, the unmap ioctl returns success but with + * a zero sized unmap. Also, if an unmap request overlaps the first + * address of a hugepage, the IOMMU will unmap the entire hugepage. + * This also returns success and the returned unmap size reflects the + * actual size unmapped. + * + * We attempt to maintain compatibility with this "v1" interface, but + * we take control out of the hands of the IOMMU. Therefore, an unmap + * request offset from the beginning of the original mapping will + * return success with zero sized unmap. And an unmap request covering + * the first iova of mapping will unmap the entire range. + * + * The v2 version of this interface intends to be more deterministic. + * Unmap requests must fully cover previous mappings. Multiple + * mappings may still be unmaped by specifying large ranges, but there + * must not be any previous mappings bisected by the range. An error + * will be returned if these conditions are not met. The v2 interface + * will only return success and a size of zero if there were no + * mappings within the range. 
+ */ + if (iommu->v2) { + dma = vfio_find_dma(iommu, unmap->iova, 0); + if (dma && dma->iova != unmap->iova) { + ret = -EINVAL; + goto unlock; + } + dma = vfio_find_dma(iommu, unmap->iova + unmap->size - 1, 0); + if (dma && dma->iova + dma->size != unmap->iova + unmap->size) { + ret = -EINVAL; + goto unlock; + } + } + while ((dma = vfio_find_dma(iommu, unmap->iova, unmap->size))) { - size = unmap->size; - ret = vfio_remove_dma_overlap(iommu, unmap->iova, &size, dma); - if (ret || !size) + if (!iommu->v2 && unmap->iova > dma->iova) break; - unmapped += size; + unmapped += dma->size; + vfio_remove_dma(iommu, dma); } +unlock: mutex_unlock(&iommu->lock); - /* - * We may unmap more than requested, update the unmap struct so - * userspace can know. - */ + /* Report how much was unmapped */ unmap->size = unmapped; return ret; @@ -516,22 +476,47 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu, * soon, so this is just a temporary workaround to break mappings down into * PAGE_SIZE. Better to map smaller pages than nothing. */ -static int map_try_harder(struct vfio_iommu *iommu, dma_addr_t iova, +static int map_try_harder(struct vfio_domain *domain, dma_addr_t iova, unsigned long pfn, long npage, int prot) { long i; int ret; for (i = 0; i < npage; i++, pfn++, iova += PAGE_SIZE) { - ret = iommu_map(iommu->domain, iova, + ret = iommu_map(domain->domain, iova, (phys_addr_t)pfn << PAGE_SHIFT, - PAGE_SIZE, prot); + PAGE_SIZE, prot | domain->prot); if (ret) break; } for (; i < npage && i > 0; i--, iova -= PAGE_SIZE) - iommu_unmap(iommu->domain, iova, PAGE_SIZE); + iommu_unmap(domain->domain, iova, PAGE_SIZE); + + return ret; +} + +static int vfio_iommu_map(struct vfio_iommu *iommu, dma_addr_t iova, + unsigned long pfn, long npage, int prot) +{ + struct vfio_domain *d; + int ret; + + list_for_each_entry(d, &iommu->domain_list, next) { + ret = iommu_map(d->domain, iova, (phys_addr_t)pfn << PAGE_SHIFT, + npage << PAGE_SHIFT, prot | d->prot); + if (ret) { + if (ret != -EBUSY || + map_try_harder(d, iova, pfn, npage, prot)) + goto unwind; + } + } + + return 0; + +unwind: + list_for_each_entry_continue_reverse(d, &iommu->domain_list, next) + iommu_unmap(d->domain, iova, npage << PAGE_SHIFT); return ret; } @@ -545,12 +530,12 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu, long npage; int ret = 0, prot = 0; uint64_t mask; - struct vfio_dma *dma = NULL; + struct vfio_dma *dma; unsigned long pfn; end = map->iova + map->size; - mask = ((uint64_t)1 << __ffs(iommu->domain->ops->pgsize_bitmap)) - 1; + mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1; /* READ/WRITE from device perspective */ if (map->flags & VFIO_DMA_MAP_FLAG_WRITE) @@ -561,9 +546,6 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu, if (!prot) return -EINVAL; /* No READ/WRITE? 
*/ - if (iommu->cache) - prot |= IOMMU_CACHE; - if (vaddr & mask) return -EINVAL; if (map->iova & mask) @@ -588,180 +570,257 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu, return -EEXIST; } - for (iova = map->iova; iova < end; iova += size, vaddr += size) { - long i; + dma = kzalloc(sizeof(*dma), GFP_KERNEL); + if (!dma) { + mutex_unlock(&iommu->lock); + return -ENOMEM; + } + + dma->iova = map->iova; + dma->vaddr = map->vaddr; + dma->prot = prot; + + /* Insert zero-sized and grow as we map chunks of it */ + vfio_link_dma(iommu, dma); + for (iova = map->iova; iova < end; iova += size, vaddr += size) { /* Pin a contiguous chunk of memory */ npage = vfio_pin_pages(vaddr, (end - iova) >> PAGE_SHIFT, prot, &pfn); if (npage <= 0) { WARN_ON(!npage); ret = (int)npage; - goto out; - } - - /* Verify pages are not already mapped */ - for (i = 0; i < npage; i++) { - if (iommu_iova_to_phys(iommu->domain, - iova + (i << PAGE_SHIFT))) { - ret = -EBUSY; - goto out_unpin; - } + break; } - ret = iommu_map(iommu->domain, iova, - (phys_addr_t)pfn << PAGE_SHIFT, - npage << PAGE_SHIFT, prot); + /* Map it! */ + ret = vfio_iommu_map(iommu, iova, pfn, npage, prot); if (ret) { - if (ret != -EBUSY || - map_try_harder(iommu, iova, pfn, npage, prot)) { - goto out_unpin; - } + vfio_unpin_pages(pfn, npage, prot, true); + break; } size = npage << PAGE_SHIFT; + dma->size += size; + } - /* - * Check if we abut a region below - nothing below 0. - * This is the most likely case when mapping chunks of - * physically contiguous regions within a virtual address - * range. Update the abutting entry in place since iova - * doesn't change. - */ - if (likely(iova)) { - struct vfio_dma *tmp; - tmp = vfio_find_dma(iommu, iova - 1, 1); - if (tmp && tmp->prot == prot && - tmp->vaddr + tmp->size == vaddr) { - tmp->size += size; - iova = tmp->iova; - size = tmp->size; - vaddr = tmp->vaddr; - dma = tmp; - } - } + if (ret) + vfio_remove_dma(iommu, dma); - /* - * Check if we abut a region above - nothing above ~0 + 1. - * If we abut above and below, remove and free. If only - * abut above, remove, modify, reinsert. 
- */ - if (likely(iova + size)) { - struct vfio_dma *tmp; - tmp = vfio_find_dma(iommu, iova + size, 1); - if (tmp && tmp->prot == prot && - tmp->vaddr == vaddr + size) { - vfio_remove_dma(iommu, tmp); - if (dma) { - dma->size += tmp->size; - kfree(tmp); - } else { - size += tmp->size; - tmp->size = size; - tmp->iova = iova; - tmp->vaddr = vaddr; - vfio_insert_dma(iommu, tmp); - dma = tmp; - } - } - } + mutex_unlock(&iommu->lock); + return ret; +} + +static int vfio_bus_type(struct device *dev, void *data) +{ + struct bus_type **bus = data; + + if (*bus && *bus != dev->bus) + return -EINVAL; + + *bus = dev->bus; + + return 0; +} + +static int vfio_iommu_replay(struct vfio_iommu *iommu, + struct vfio_domain *domain) +{ + struct vfio_domain *d; + struct rb_node *n; + int ret; + + /* Arbitrarily pick the first domain in the list for lookups */ + d = list_first_entry(&iommu->domain_list, struct vfio_domain, next); + n = rb_first(&iommu->dma_list); + + /* If there's not a domain, there better not be any mappings */ + if (WARN_ON(n && !d)) + return -EINVAL; + + for (; n; n = rb_next(n)) { + struct vfio_dma *dma; + dma_addr_t iova; + + dma = rb_entry(n, struct vfio_dma, node); + iova = dma->iova; + + while (iova < dma->iova + dma->size) { + phys_addr_t phys = iommu_iova_to_phys(d->domain, iova); + size_t size; - if (!dma) { - dma = kzalloc(sizeof(*dma), GFP_KERNEL); - if (!dma) { - iommu_unmap(iommu->domain, iova, size); - ret = -ENOMEM; - goto out_unpin; + if (WARN_ON(!phys)) { + iova += PAGE_SIZE; + continue; } - dma->size = size; - dma->iova = iova; - dma->vaddr = vaddr; - dma->prot = prot; - vfio_insert_dma(iommu, dma); - } - } + size = PAGE_SIZE; - WARN_ON(ret); - mutex_unlock(&iommu->lock); - return ret; + while (iova + size < dma->iova + dma->size && + phys + size == iommu_iova_to_phys(d->domain, + iova + size)) + size += PAGE_SIZE; -out_unpin: - vfio_unpin_pages(pfn, npage, prot, true); + ret = iommu_map(domain->domain, iova, phys, + size, dma->prot | domain->prot); + if (ret) + return ret; -out: - iova = map->iova; - size = map->size; - while ((dma = vfio_find_dma(iommu, iova, size))) { - int r = vfio_remove_dma_overlap(iommu, iova, - &size, dma); - if (WARN_ON(r || !size)) - break; + iova += size; + } } - mutex_unlock(&iommu->lock); - return ret; + return 0; } static int vfio_iommu_type1_attach_group(void *iommu_data, struct iommu_group *iommu_group) { struct vfio_iommu *iommu = iommu_data; - struct vfio_group *group, *tmp; + struct vfio_group *group, *g; + struct vfio_domain *domain, *d; + struct bus_type *bus = NULL; int ret; - group = kzalloc(sizeof(*group), GFP_KERNEL); - if (!group) - return -ENOMEM; - mutex_lock(&iommu->lock); - list_for_each_entry(tmp, &iommu->group_list, next) { - if (tmp->iommu_group == iommu_group) { + list_for_each_entry(d, &iommu->domain_list, next) { + list_for_each_entry(g, &d->group_list, next) { + if (g->iommu_group != iommu_group) + continue; + mutex_unlock(&iommu->lock); - kfree(group); return -EINVAL; } } + group = kzalloc(sizeof(*group), GFP_KERNEL); + domain = kzalloc(sizeof(*domain), GFP_KERNEL); + if (!group || !domain) { + ret = -ENOMEM; + goto out_free; + } + + group->iommu_group = iommu_group; + + /* Determine bus_type in order to allocate a domain */ + ret = iommu_group_for_each_dev(iommu_group, &bus, vfio_bus_type); + if (ret) + goto out_free; + + domain->domain = iommu_domain_alloc(bus); + if (!domain->domain) { + ret = -EIO; + goto out_free; + } + + ret = iommu_attach_group(domain->domain, iommu_group); + if (ret) + goto out_domain; + + 
INIT_LIST_HEAD(&domain->group_list); + list_add(&group->next, &domain->group_list); + + if (!allow_unsafe_interrupts && + !iommu_domain_has_cap(domain->domain, IOMMU_CAP_INTR_REMAP)) { + pr_warn("%s: No interrupt remapping support. Use the module param \"allow_unsafe_interrupts\" to enable VFIO IOMMU support on this platform\n", + __func__); + ret = -EPERM; + goto out_detach; + } + + if (iommu_domain_has_cap(domain->domain, IOMMU_CAP_CACHE_COHERENCY)) + domain->prot |= IOMMU_CACHE; + /* - * TODO: Domain have capabilities that might change as we add - * groups (see iommu->cache, currently never set). Check for - * them and potentially disallow groups to be attached when it - * would change capabilities (ugh). + * Try to match an existing compatible domain. We don't want to + * preclude an IOMMU driver supporting multiple bus_types and being + * able to include different bus_types in the same IOMMU domain, so + * we test whether the domains use the same iommu_ops rather than + * testing if they're on the same bus_type. */ - ret = iommu_attach_group(iommu->domain, iommu_group); - if (ret) { - mutex_unlock(&iommu->lock); - kfree(group); - return ret; + list_for_each_entry(d, &iommu->domain_list, next) { + if (d->domain->ops == domain->domain->ops && + d->prot == domain->prot) { + iommu_detach_group(domain->domain, iommu_group); + if (!iommu_attach_group(d->domain, iommu_group)) { + list_add(&group->next, &d->group_list); + iommu_domain_free(domain->domain); + kfree(domain); + mutex_unlock(&iommu->lock); + return 0; + } + + ret = iommu_attach_group(domain->domain, iommu_group); + if (ret) + goto out_domain; + } } - group->iommu_group = iommu_group; - list_add(&group->next, &iommu->group_list); + /* replay mappings on new domains */ + ret = vfio_iommu_replay(iommu, domain); + if (ret) + goto out_detach; + + list_add(&domain->next, &iommu->domain_list); mutex_unlock(&iommu->lock); return 0; + +out_detach: + iommu_detach_group(domain->domain, iommu_group); +out_domain: + iommu_domain_free(domain->domain); +out_free: + kfree(domain); + kfree(group); + mutex_unlock(&iommu->lock); + return ret; +} + +static void vfio_iommu_unmap_unpin_all(struct vfio_iommu *iommu) +{ + struct rb_node *node; + + while ((node = rb_first(&iommu->dma_list))) + vfio_remove_dma(iommu, rb_entry(node, struct vfio_dma, node)); } static void vfio_iommu_type1_detach_group(void *iommu_data, struct iommu_group *iommu_group) { struct vfio_iommu *iommu = iommu_data; + struct vfio_domain *domain; struct vfio_group *group; mutex_lock(&iommu->lock); - list_for_each_entry(group, &iommu->group_list, next) { - if (group->iommu_group == iommu_group) { - iommu_detach_group(iommu->domain, iommu_group); + list_for_each_entry(domain, &iommu->domain_list, next) { + list_for_each_entry(group, &domain->group_list, next) { + if (group->iommu_group != iommu_group) + continue; + + iommu_detach_group(domain->domain, iommu_group); list_del(&group->next); kfree(group); - break; + /* + * Group ownership provides privilege, if the group + * list is empty, the domain goes away. If it's the + * last domain, then all the mappings go away too. 
+ */ + if (list_empty(&domain->group_list)) { + if (list_is_singular(&iommu->domain_list)) + vfio_iommu_unmap_unpin_all(iommu); + iommu_domain_free(domain->domain); + list_del(&domain->next); + kfree(domain); + } + goto done; } } +done: mutex_unlock(&iommu->lock); } @@ -769,40 +828,17 @@ static void *vfio_iommu_type1_open(unsigned long arg) { struct vfio_iommu *iommu; - if (arg != VFIO_TYPE1_IOMMU) + if (arg != VFIO_TYPE1_IOMMU && arg != VFIO_TYPE1v2_IOMMU) return ERR_PTR(-EINVAL); iommu = kzalloc(sizeof(*iommu), GFP_KERNEL); if (!iommu) return ERR_PTR(-ENOMEM); - INIT_LIST_HEAD(&iommu->group_list); + INIT_LIST_HEAD(&iommu->domain_list); iommu->dma_list = RB_ROOT; mutex_init(&iommu->lock); - - /* - * Wish we didn't have to know about bus_type here. - */ - iommu->domain = iommu_domain_alloc(&pci_bus_type); - if (!iommu->domain) { - kfree(iommu); - return ERR_PTR(-EIO); - } - - /* - * Wish we could specify required capabilities rather than create - * a domain, see what comes out and hope it doesn't change along - * the way. Fortunately we know interrupt remapping is global for - * our iommus. - */ - if (!allow_unsafe_interrupts && - !iommu_domain_has_cap(iommu->domain, IOMMU_CAP_INTR_REMAP)) { - pr_warn("%s: No interrupt remapping support. Use the module param \"allow_unsafe_interrupts\" to enable VFIO IOMMU support on this platform\n", - __func__); - iommu_domain_free(iommu->domain); - kfree(iommu); - return ERR_PTR(-EPERM); - } + iommu->v2 = (arg == VFIO_TYPE1v2_IOMMU); return iommu; } @@ -810,26 +846,42 @@ static void *vfio_iommu_type1_open(unsigned long arg) static void vfio_iommu_type1_release(void *iommu_data) { struct vfio_iommu *iommu = iommu_data; + struct vfio_domain *domain, *domain_tmp; struct vfio_group *group, *group_tmp; - struct rb_node *node; - list_for_each_entry_safe(group, group_tmp, &iommu->group_list, next) { - iommu_detach_group(iommu->domain, group->iommu_group); - list_del(&group->next); - kfree(group); + vfio_iommu_unmap_unpin_all(iommu); + + list_for_each_entry_safe(domain, domain_tmp, + &iommu->domain_list, next) { + list_for_each_entry_safe(group, group_tmp, + &domain->group_list, next) { + iommu_detach_group(domain->domain, group->iommu_group); + list_del(&group->next); + kfree(group); + } + iommu_domain_free(domain->domain); + list_del(&domain->next); + kfree(domain); } - while ((node = rb_first(&iommu->dma_list))) { - struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node); - size_t size = dma->size; - vfio_remove_dma_overlap(iommu, dma->iova, &size, dma); - if (WARN_ON(!size)) + kfree(iommu); +} + +static int vfio_domains_have_iommu_cache(struct vfio_iommu *iommu) +{ + struct vfio_domain *domain; + int ret = 1; + + mutex_lock(&iommu->lock); + list_for_each_entry(domain, &iommu->domain_list, next) { + if (!(domain->prot & IOMMU_CACHE)) { + ret = 0; break; + } } + mutex_unlock(&iommu->lock); - iommu_domain_free(iommu->domain); - iommu->domain = NULL; - kfree(iommu); + return ret; } static long vfio_iommu_type1_ioctl(void *iommu_data, @@ -841,7 +893,12 @@ static long vfio_iommu_type1_ioctl(void *iommu_data, if (cmd == VFIO_CHECK_EXTENSION) { switch (arg) { case VFIO_TYPE1_IOMMU: + case VFIO_TYPE1v2_IOMMU: return 1; + case VFIO_DMA_CC_IOMMU: + if (!iommu) + return 0; + return vfio_domains_have_iommu_cache(iommu); default: return 0; } @@ -858,7 +915,7 @@ static long vfio_iommu_type1_ioctl(void *iommu_data, info.flags = 0; - info.iova_pgsizes = iommu->domain->ops->pgsize_bitmap; + info.iova_pgsizes = vfio_pgsize_bitmap(iommu); return copy_to_user((void __user 
*)arg, &info, minsz); @@ -911,9 +968,6 @@ static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = { static int __init vfio_iommu_type1_init(void) { - if (!iommu_present(&pci_bus_type)) - return -ENODEV; - return vfio_register_iommu_driver(&vfio_iommu_driver_ops_type1); } diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index c3458f58de90..d5a3de88ac59 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -388,10 +388,10 @@ static void xen_irq_init(unsigned irq) list_add_tail(&info->list, &xen_irq_list_head); } -static int __must_check xen_allocate_irq_dynamic(void) +static int __must_check xen_allocate_irqs_dynamic(int nvec) { int first = 0; - int irq; + int i, irq; #ifdef CONFIG_X86_IO_APIC /* @@ -405,14 +405,22 @@ static int __must_check xen_allocate_irq_dynamic(void) first = get_nr_irqs_gsi(); #endif - irq = irq_alloc_desc_from(first, -1); + irq = irq_alloc_descs_from(first, nvec, -1); - if (irq >= 0) - xen_irq_init(irq); + if (irq >= 0) { + for (i = 0; i < nvec; i++) + xen_irq_init(irq + i); + } return irq; } +static inline int __must_check xen_allocate_irq_dynamic(void) +{ + + return xen_allocate_irqs_dynamic(1); +} + static int __must_check xen_allocate_irq_gsi(unsigned gsi) { int irq; @@ -466,9 +474,6 @@ static void xen_evtchn_close(unsigned int port) close.port = port; if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) BUG(); - - /* Closed ports are implicitly re-bound to VCPU0. */ - bind_evtchn_to_cpu(port, 0); } static void pirq_query_unmask(int irq) @@ -730,22 +735,25 @@ int xen_allocate_pirq_msi(struct pci_dev *dev, struct msi_desc *msidesc) } int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc, - int pirq, const char *name, domid_t domid) + int pirq, int nvec, const char *name, domid_t domid) { - int irq, ret; + int i, irq, ret; mutex_lock(&irq_mapping_update_lock); - irq = xen_allocate_irq_dynamic(); + irq = xen_allocate_irqs_dynamic(nvec); if (irq < 0) goto out; - irq_set_chip_and_handler_name(irq, &xen_pirq_chip, handle_edge_irq, - name); + for (i = 0; i < nvec; i++) { + irq_set_chip_and_handler_name(irq + i, &xen_pirq_chip, handle_edge_irq, name); + + ret = xen_irq_info_pirq_setup(irq + i, 0, pirq + i, 0, domid, + i == 0 ? 0 : PIRQ_MSI_GROUP); + if (ret < 0) + goto error_irq; + } - ret = xen_irq_info_pirq_setup(irq, 0, pirq, 0, domid, 0); - if (ret < 0) - goto error_irq; ret = irq_set_msi_desc(irq, msidesc); if (ret < 0) goto error_irq; @@ -753,7 +761,8 @@ out: mutex_unlock(&irq_mapping_update_lock); return irq; error_irq: - __unbind_from_irq(irq); + for (; i >= 0; i--) + __unbind_from_irq(irq + i); mutex_unlock(&irq_mapping_update_lock); return ret; } @@ -767,7 +776,12 @@ int xen_destroy_irq(int irq) mutex_lock(&irq_mapping_update_lock); - if (xen_initial_domain()) { + /* + * If trying to remove a vector in a MSI group different + * than the first one skip the PIRQ unmap unless this vector + * is the first one in the group. 
+ */ + if (xen_initial_domain() && !(info->u.pirq.flags & PIRQ_MSI_GROUP)) { unmap_irq.pirq = info->u.pirq.pirq; unmap_irq.domid = info->u.pirq.domid; rc = HYPERVISOR_physdev_op(PHYSDEVOP_unmap_pirq, &unmap_irq); @@ -1329,26 +1343,6 @@ static int set_affinity_irq(struct irq_data *data, const struct cpumask *dest, return rebind_irq_to_cpu(data->irq, tcpu); } -static int retrigger_evtchn(int evtchn) -{ - int masked; - - if (!VALID_EVTCHN(evtchn)) - return 0; - - masked = test_and_set_mask(evtchn); - set_evtchn(evtchn); - if (!masked) - unmask_evtchn(evtchn); - - return 1; -} - -int resend_irq_on_evtchn(unsigned int irq) -{ - return retrigger_evtchn(evtchn_from_irq(irq)); -} - static void enable_dynirq(struct irq_data *data) { int evtchn = evtchn_from_irq(data->irq); @@ -1383,7 +1377,18 @@ static void mask_ack_dynirq(struct irq_data *data) static int retrigger_dynirq(struct irq_data *data) { - return retrigger_evtchn(evtchn_from_irq(data->irq)); + unsigned int evtchn = evtchn_from_irq(data->irq); + int masked; + + if (!VALID_EVTCHN(evtchn)) + return 0; + + masked = test_and_set_mask(evtchn); + set_evtchn(evtchn); + if (!masked) + unmask_evtchn(evtchn); + + return 1; } static void restore_pirqs(void) diff --git a/drivers/xen/events/events_internal.h b/drivers/xen/events/events_internal.h index 677f41a0fff9..50c2050a1e32 100644 --- a/drivers/xen/events/events_internal.h +++ b/drivers/xen/events/events_internal.h @@ -53,6 +53,7 @@ struct irq_info { #define PIRQ_NEEDS_EOI (1 << 0) #define PIRQ_SHAREABLE (1 << 1) +#define PIRQ_MSI_GROUP (1 << 2) struct evtchn_ops { unsigned (*max_channels)(void); diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index b84e3ab839aa..6d325bda76da 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -933,9 +933,6 @@ int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops, struct page **pages, unsigned int count) { int i, ret; - bool lazy = false; - pte_t *pte; - unsigned long mfn; ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map_ops, count); if (ret) @@ -947,45 +944,7 @@ int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops, gnttab_retry_eagain_gop(GNTTABOP_map_grant_ref, map_ops + i, &map_ops[i].status, __func__); - /* this is basically a nop on x86 */ - if (xen_feature(XENFEAT_auto_translated_physmap)) { - for (i = 0; i < count; i++) { - if (map_ops[i].status) - continue; - set_phys_to_machine(map_ops[i].host_addr >> PAGE_SHIFT, - map_ops[i].dev_bus_addr >> PAGE_SHIFT); - } - return ret; - } - - if (!in_interrupt() && paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) { - arch_enter_lazy_mmu_mode(); - lazy = true; - } - - for (i = 0; i < count; i++) { - /* Do not add to override if the map failed. */ - if (map_ops[i].status) - continue; - - if (map_ops[i].flags & GNTMAP_contains_pte) { - pte = (pte_t *) (mfn_to_virt(PFN_DOWN(map_ops[i].host_addr)) + - (map_ops[i].host_addr & ~PAGE_MASK)); - mfn = pte_mfn(*pte); - } else { - mfn = PFN_DOWN(map_ops[i].dev_bus_addr); - } - ret = m2p_add_override(mfn, pages[i], kmap_ops ? 
- &kmap_ops[i] : NULL); - if (ret) - goto out; - } - - out: - if (lazy) - arch_leave_lazy_mmu_mode(); - - return ret; + return set_foreign_p2m_mapping(map_ops, kmap_ops, pages, count); } EXPORT_SYMBOL_GPL(gnttab_map_refs); @@ -993,39 +952,13 @@ int gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops, struct gnttab_map_grant_ref *kmap_ops, struct page **pages, unsigned int count) { - int i, ret; - bool lazy = false; + int ret; ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, unmap_ops, count); if (ret) return ret; - /* this is basically a nop on x86 */ - if (xen_feature(XENFEAT_auto_translated_physmap)) { - for (i = 0; i < count; i++) { - set_phys_to_machine(unmap_ops[i].host_addr >> PAGE_SHIFT, - INVALID_P2M_ENTRY); - } - return ret; - } - - if (!in_interrupt() && paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) { - arch_enter_lazy_mmu_mode(); - lazy = true; - } - - for (i = 0; i < count; i++) { - ret = m2p_remove_override(pages[i], kmap_ops ? - &kmap_ops[i] : NULL); - if (ret) - goto out; - } - - out: - if (lazy) - arch_leave_lazy_mmu_mode(); - - return ret; + return clear_foreign_p2m_mapping(unmap_ops, kmap_ops, pages, count); } EXPORT_SYMBOL_GPL(gnttab_unmap_refs); diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index 624e8dc24532..fc6c94c0b436 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -46,6 +46,20 @@ struct suspend_info { void (*post)(int cancelled); }; +static RAW_NOTIFIER_HEAD(xen_resume_notifier); + +void xen_resume_notifier_register(struct notifier_block *nb) +{ + raw_notifier_chain_register(&xen_resume_notifier, nb); +} +EXPORT_SYMBOL_GPL(xen_resume_notifier_register); + +void xen_resume_notifier_unregister(struct notifier_block *nb) +{ + raw_notifier_chain_unregister(&xen_resume_notifier, nb); +} +EXPORT_SYMBOL_GPL(xen_resume_notifier_unregister); + #ifdef CONFIG_HIBERNATE_CALLBACKS static void xen_hvm_post_suspend(int cancelled) { @@ -152,6 +166,8 @@ static void do_suspend(void) err = stop_machine(xen_suspend, &si, cpumask_of(0)); + raw_notifier_call_chain(&xen_resume_notifier, 0, NULL); + dpm_resume_start(si.cancelled ? 
PMSG_THAW : PMSG_RESTORE); if (err) { diff --git a/drivers/xen/pcpu.c b/drivers/xen/pcpu.c index 79e1dff7ed4f..0aac403d53fd 100644 --- a/drivers/xen/pcpu.c +++ b/drivers/xen/pcpu.c @@ -40,6 +40,7 @@ #include <linux/capability.h> #include <xen/xen.h> +#include <xen/acpi.h> #include <xen/xenbus.h> #include <xen/events.h> #include <xen/interface/platform.h> diff --git a/drivers/xen/platform-pci.c b/drivers/xen/platform-pci.c index a1361c312c06..3454973dc3bb 100644 --- a/drivers/xen/platform-pci.c +++ b/drivers/xen/platform-pci.c @@ -45,7 +45,7 @@ static unsigned long platform_mmio_alloc; static unsigned long platform_mmiolen; static uint64_t callback_via; -unsigned long alloc_xen_mmio(unsigned long len) +static unsigned long alloc_xen_mmio(unsigned long len) { unsigned long addr; diff --git a/drivers/xen/xen-acpi-processor.c b/drivers/xen/xen-acpi-processor.c index 7231859119f1..82358d14ecf1 100644 --- a/drivers/xen/xen-acpi-processor.c +++ b/drivers/xen/xen-acpi-processor.c @@ -27,10 +27,10 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/types.h> -#include <linux/syscore_ops.h> #include <linux/acpi.h> #include <acpi/processor.h> #include <xen/xen.h> +#include <xen/xen-ops.h> #include <xen/interface/platform.h> #include <asm/xen/hypercall.h> @@ -495,14 +495,15 @@ static int xen_upload_processor_pm_data(void) return rc; } -static void xen_acpi_processor_resume(void) +static int xen_acpi_processor_resume(struct notifier_block *nb, + unsigned long action, void *data) { bitmap_zero(acpi_ids_done, nr_acpi_bits); - xen_upload_processor_pm_data(); + return xen_upload_processor_pm_data(); } -static struct syscore_ops xap_syscore_ops = { - .resume = xen_acpi_processor_resume, +struct notifier_block xen_acpi_processor_resume_nb = { + .notifier_call = xen_acpi_processor_resume, }; static int __init xen_acpi_processor_init(void) @@ -555,7 +556,7 @@ static int __init xen_acpi_processor_init(void) if (rc) goto err_unregister; - register_syscore_ops(&xap_syscore_ops); + xen_resume_notifier_register(&xen_acpi_processor_resume_nb); return 0; err_unregister: @@ -574,7 +575,7 @@ static void __exit xen_acpi_processor_exit(void) { int i; - unregister_syscore_ops(&xap_syscore_ops); + xen_resume_notifier_unregister(&xen_acpi_processor_resume_nb); kfree(acpi_ids_done); kfree(acpi_id_present); kfree(acpi_id_cst_present); diff --git a/drivers/xen/xen-pciback/pciback_ops.c b/drivers/xen/xen-pciback/pciback_ops.c index 64eb0cd8b8af..929dd46bb40c 100644 --- a/drivers/xen/xen-pciback/pciback_ops.c +++ b/drivers/xen/xen-pciback/pciback_ops.c @@ -213,8 +213,7 @@ int xen_pcibk_enable_msix(struct xen_pcibk_device *pdev, entries[i].vector = op->msix_entries[i].vector; } - result = pci_enable_msix(dev, entries, op->value); - + result = pci_enable_msix_exact(dev, entries, op->value); if (result == 0) { for (i = 0; i < op->value; i++) { op->msix_entries[i].entry = entries[i].entry; diff --git a/drivers/xen/xen-selfballoon.c b/drivers/xen/xen-selfballoon.c index 745ad79c1d8e..3b2bffde534f 100644 --- a/drivers/xen/xen-selfballoon.c +++ b/drivers/xen/xen-selfballoon.c @@ -170,6 +170,7 @@ static void frontswap_selfshrink(void) tgt_frontswap_pages = cur_frontswap_pages - (cur_frontswap_pages / frontswap_hysteresis); frontswap_shrink(tgt_frontswap_pages); + frontswap_inertia_counter = frontswap_inertia; } #endif /* CONFIG_FRONTSWAP */ diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c index 01d59e66565d..439c9dca9eee 100644 --- a/drivers/xen/xenbus/xenbus_client.c +++ 
b/drivers/xen/xenbus/xenbus_client.c @@ -401,33 +401,6 @@ EXPORT_SYMBOL_GPL(xenbus_alloc_evtchn); /** - * Bind to an existing interdomain event channel in another domain. Returns 0 - * on success and stores the local port in *port. On error, returns -errno, - * switches the device to XenbusStateClosing, and saves the error in XenStore. - */ -int xenbus_bind_evtchn(struct xenbus_device *dev, int remote_port, int *port) -{ - struct evtchn_bind_interdomain bind_interdomain; - int err; - - bind_interdomain.remote_dom = dev->otherend_id; - bind_interdomain.remote_port = remote_port; - - err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain, - &bind_interdomain); - if (err) - xenbus_dev_fatal(dev, err, - "binding to event channel %d from domain %d", - remote_port, dev->otherend_id); - else - *port = bind_interdomain.local_port; - - return err; -} -EXPORT_SYMBOL_GPL(xenbus_bind_evtchn); - - -/** * Free an existing event channel. Returns 0 on success or -errno on error. */ int xenbus_free_evtchn(struct xenbus_device *dev, int port) |
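
Note: the vfio_iommu_type1 portion of this diff introduces the VFIO_TYPE1v2_IOMMU type, with stricter unmap semantics, and the VFIO_DMA_CC_IOMMU extension for querying whether every IOMMU domain in a container is cache coherent. As an illustrative sketch only (not taken from this commit), a userspace container might exercise the new interface roughly as follows; the device path, the fixed IOVA, and the buffer handling are placeholder assumptions, and error handling is omitted:

/* Illustrative sketch only -- not part of this commit. Assumes buf is a
 * page-aligned buffer of len bytes and that a group has already been
 * added to the container; error handling is omitted. */
#include <stddef.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static void type1_v2_example(void *buf, size_t len)
{
	int container = open("/dev/vfio/vfio", O_RDWR);

	/* Prefer the new v2 type1 backend when the kernel offers it. */
	int type = ioctl(container, VFIO_CHECK_EXTENSION, VFIO_TYPE1v2_IOMMU) ?
		   VFIO_TYPE1v2_IOMMU : VFIO_TYPE1_IOMMU;

	/* ...groups are attached beforehand via VFIO_GROUP_SET_CONTAINER... */
	ioctl(container, VFIO_SET_IOMMU, type);

	/* New extension: non-zero only if every IOMMU domain in the
	 * container was set up with IOMMU_CACHE (coherent DMA). */
	int coherent = ioctl(container, VFIO_CHECK_EXTENSION, VFIO_DMA_CC_IOMMU);

	struct vfio_iommu_type1_dma_map map = {
		.argsz = sizeof(map),
		.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
		.vaddr = (__u64)(unsigned long)buf,
		.iova  = 0x100000,		/* arbitrary example IOVA */
		.size  = len,
	};
	ioctl(container, VFIO_IOMMU_MAP_DMA, &map);

	/* Under v2, an unmap must cover whole prior mappings: unmapping only
	 * part of the range above fails with EINVAL, instead of the v1
	 * behaviour of a zero-sized or hugepage-rounded unmap. */
	struct vfio_iommu_type1_dma_unmap unmap = {
		.argsz = sizeof(unmap),
		.iova  = 0x100000,
		.size  = len,
	};
	ioctl(container, VFIO_IOMMU_UNMAP_DMA, &unmap);

	(void)coherent;
}

The stricter v2 behaviour in the sketch corresponds to the checks added to vfio_dma_do_unmap() above, which reject any unmap range that bisects an existing mapping.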