From cd9b22685e4ccd728550d51fbe108c473f89df4f Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Fri, 21 Jun 2013 09:37:50 -0600 Subject: vfio: Convert type1 iommu to use rbtree We need to keep track of all the DMA mappings of an iommu container so that it can be automatically unmapped when the user releases the file descriptor. We currently do this using a simple list, where we merge entries with contiguous iovas and virtual addresses. Using a tree for this is a bit more efficient and allows us to use common code instead of inventing our own. Signed-off-by: Alex Williamson --- drivers/vfio/vfio_iommu_type1.c | 190 ++++++++++++++++++++-------------------- 1 file changed, 96 insertions(+), 94 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 6f3fbc48a6c7..0e863b3ddcab 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -31,6 +31,7 @@ #include #include #include /* pci_bus_type */ +#include #include #include #include @@ -50,13 +51,13 @@ MODULE_PARM_DESC(allow_unsafe_interrupts, struct vfio_iommu { struct iommu_domain *domain; struct mutex lock; - struct list_head dma_list; + struct rb_root dma_list; struct list_head group_list; bool cache; }; struct vfio_dma { - struct list_head next; + struct rb_node node; dma_addr_t iova; /* Device address */ unsigned long vaddr; /* Process virtual addr */ long npage; /* Number of pages */ @@ -75,6 +76,49 @@ struct vfio_group { #define NPAGE_TO_SIZE(npage) ((size_t)(npage) << PAGE_SHIFT) +static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu, + dma_addr_t start, size_t size) +{ + struct rb_node *node = iommu->dma_list.rb_node; + + while (node) { + struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node); + + if (start + size <= dma->iova) + node = node->rb_left; + else if (start >= dma->iova + NPAGE_TO_SIZE(dma->npage)) + node = node->rb_right; + else + return dma; + } + + return NULL; +} + +static void vfio_insert_dma(struct vfio_iommu *iommu, struct vfio_dma *new) +{ + struct rb_node **link = &iommu->dma_list.rb_node, *parent = NULL; + struct vfio_dma *dma; + + while (*link) { + parent = *link; + dma = rb_entry(parent, struct vfio_dma, node); + + if (new->iova + NPAGE_TO_SIZE(new->npage) <= dma->iova) + link = &(*link)->rb_left; + else + link = &(*link)->rb_right; + } + + rb_link_node(&new->node, parent, link); + rb_insert_color(&new->node, &iommu->dma_list); +} + +static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *old) +{ + rb_erase(&old->node, &iommu->dma_list); +} + struct vwork { struct mm_struct *mm; long npage; @@ -289,31 +333,8 @@ static int __vfio_dma_map(struct vfio_iommu *iommu, dma_addr_t iova, return 0; } -static inline bool ranges_overlap(dma_addr_t start1, size_t size1, - dma_addr_t start2, size_t size2) -{ - if (start1 < start2) - return (start2 - start1 < size1); - else if (start2 < start1) - return (start1 - start2 < size2); - return (size1 > 0 && size2 > 0); -} - -static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu, - dma_addr_t start, size_t size) -{ - struct vfio_dma *dma; - - list_for_each_entry(dma, &iommu->dma_list, next) { - if (ranges_overlap(dma->iova, NPAGE_TO_SIZE(dma->npage), - start, size)) - return dma; - } - return NULL; -} - -static long vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start, - size_t size, struct vfio_dma *dma) +static int vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start, + size_t size, struct vfio_dma *dma) { struct vfio_dma *split; long npage_lo, npage_hi; @@ -322,10 +343,9 @@ static long vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start, if (start <= dma->iova && start + size >= dma->iova + NPAGE_TO_SIZE(dma->npage)) { vfio_dma_unmap(iommu, dma->iova, dma->npage, dma->prot); - list_del(&dma->next); - npage_lo = dma->npage; + vfio_remove_dma(iommu, dma); kfree(dma); - return npage_lo; + return 0; } /* Overlap low address of existing range */ @@ -339,7 +359,7 @@ static long vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start, dma->iova += overlap; dma->vaddr += overlap; dma->npage -= npage_lo; - return npage_lo; + return 0; } /* Overlap high address of existing range */ @@ -351,7 +371,7 @@ static long vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start, vfio_dma_unmap(iommu, start, npage_hi, dma->prot); dma->npage -= npage_hi; - return npage_hi; + return 0; } /* Split existing */ @@ -370,16 +390,16 @@ static long vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start, split->iova = start + size; split->vaddr = dma->vaddr + NPAGE_TO_SIZE(npage_lo) + size; split->prot = dma->prot; - list_add(&split->next, &iommu->dma_list); - return size >> PAGE_SHIFT; + vfio_insert_dma(iommu, split); + return 0; } static int vfio_dma_do_unmap(struct vfio_iommu *iommu, struct vfio_iommu_type1_dma_unmap *unmap) { - long ret = 0, npage = unmap->size >> PAGE_SHIFT; - struct vfio_dma *dma, *tmp; uint64_t mask; + struct vfio_dma *dma; + int ret = 0; mask = ((uint64_t)1 << __ffs(iommu->domain->ops->pgsize_bitmap)) - 1; @@ -393,25 +413,19 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu, mutex_lock(&iommu->lock); - list_for_each_entry_safe(dma, tmp, &iommu->dma_list, next) { - if (ranges_overlap(dma->iova, NPAGE_TO_SIZE(dma->npage), - unmap->iova, unmap->size)) { - ret = vfio_remove_dma_overlap(iommu, unmap->iova, - unmap->size, dma); - if (ret > 0) - npage -= ret; - if (ret < 0 || npage == 0) - break; - } - } + while (!ret && (dma = vfio_find_dma(iommu, + unmap->iova, unmap->size))) + ret = vfio_remove_dma_overlap(iommu, unmap->iova, + unmap->size, dma); + mutex_unlock(&iommu->lock); - return ret > 0 ? 0 : (int)ret; + return ret; } static int vfio_dma_do_map(struct vfio_iommu *iommu, struct vfio_iommu_type1_dma_map *map) { - struct vfio_dma *dma, *pdma = NULL; + struct vfio_dma *dma; dma_addr_t iova = map->iova; unsigned long locked, lock_limit, vaddr = map->vaddr; size_t size = map->size; @@ -452,6 +466,10 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu, if (!npage) return -EINVAL; + dma = kzalloc(sizeof *dma, GFP_KERNEL); + if (!dma) + return -ENOMEM; + mutex_lock(&iommu->lock); if (vfio_find_dma(iommu, iova, size)) { @@ -473,62 +491,45 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu, if (ret) goto out_lock; + dma->npage = npage; + dma->iova = iova; + dma->vaddr = vaddr; + dma->prot = prot; + /* Check if we abut a region below - nothing below 0 */ if (iova) { - dma = vfio_find_dma(iommu, iova - 1, 1); - if (dma && dma->prot == prot && - dma->vaddr + NPAGE_TO_SIZE(dma->npage) == vaddr) { - - dma->npage += npage; - iova = dma->iova; - vaddr = dma->vaddr; + struct vfio_dma *tmp = vfio_find_dma(iommu, iova - 1, 1); + if (tmp && tmp->prot == prot && + tmp->vaddr + NPAGE_TO_SIZE(tmp->npage) == vaddr) { + vfio_remove_dma(iommu, tmp); + dma->npage += tmp->npage; + dma->iova = iova = tmp->iova; + dma->vaddr = vaddr = tmp->vaddr; + kfree(tmp); npage = dma->npage; size = NPAGE_TO_SIZE(npage); - - pdma = dma; } } /* Check if we abut a region above - nothing above ~0 + 1 */ if (iova + size) { - dma = vfio_find_dma(iommu, iova + size, 1); - if (dma && dma->prot == prot && - dma->vaddr == vaddr + size) { - - dma->npage += npage; - dma->iova = iova; - dma->vaddr = vaddr; - - /* - * If merged above and below, remove previously - * merged entry. New entry covers it. - */ - if (pdma) { - list_del(&pdma->next); - kfree(pdma); - } - pdma = dma; + struct vfio_dma *tmp = vfio_find_dma(iommu, iova + size, 1); + if (tmp && tmp->prot == prot && + tmp->vaddr == vaddr + size) { + vfio_remove_dma(iommu, tmp); + dma->npage += tmp->npage; + kfree(tmp); + npage = dma->npage; + size = NPAGE_TO_SIZE(npage); } } - /* Isolated, new region */ - if (!pdma) { - dma = kzalloc(sizeof *dma, GFP_KERNEL); - if (!dma) { - ret = -ENOMEM; - vfio_dma_unmap(iommu, iova, npage, prot); - goto out_lock; - } - - dma->npage = npage; - dma->iova = iova; - dma->vaddr = vaddr; - dma->prot = prot; - list_add(&dma->next, &iommu->dma_list); - } + vfio_insert_dma(iommu, dma); out_lock: mutex_unlock(&iommu->lock); + if (ret) + kfree(dma); return ret; } @@ -606,7 +607,7 @@ static void *vfio_iommu_type1_open(unsigned long arg) return ERR_PTR(-ENOMEM); INIT_LIST_HEAD(&iommu->group_list); - INIT_LIST_HEAD(&iommu->dma_list); + iommu->dma_list = RB_ROOT; mutex_init(&iommu->lock); /* @@ -640,7 +641,7 @@ static void vfio_iommu_type1_release(void *iommu_data) { struct vfio_iommu *iommu = iommu_data; struct vfio_group *group, *group_tmp; - struct vfio_dma *dma, *dma_tmp; + struct rb_node *node; list_for_each_entry_safe(group, group_tmp, &iommu->group_list, next) { iommu_detach_group(iommu->domain, group->iommu_group); @@ -648,9 +649,10 @@ static void vfio_iommu_type1_release(void *iommu_data) kfree(group); } - list_for_each_entry_safe(dma, dma_tmp, &iommu->dma_list, next) { + while ((node = rb_first(&iommu->dma_list))) { + struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node); vfio_dma_unmap(iommu, dma->iova, dma->npage, dma->prot); - list_del(&dma->next); + vfio_remove_dma(iommu, dma); kfree(dma); } -- cgit v1.2.3 From 166fd7d94afdac040b28c473e45241820ca522a2 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Fri, 21 Jun 2013 09:38:02 -0600 Subject: vfio: hugepage support for vfio_iommu_type1 We currently send all mappings to the iommu in PAGE_SIZE chunks, which prevents the iommu from enabling support for larger page sizes. We still need to pin pages, which means we step through them in PAGE_SIZE chunks, but we can batch up contiguous physical memory chunks to allow the iommu the opportunity to use larger pages. The approach here is a bit different that the one currently used for legacy KVM device assignment. Rather than looking at the vma page size and using that as the maximum size to pass to the iommu, we instead simply look at whether the next page is physically contiguous. This means we might ask the iommu to map a 4MB region, while legacy KVM might limit itself to a maximum of 2MB. Splitting our mapping path also allows us to be smarter about locked memory because we can more easily unwind if the user attempts to exceed the limit. Therefore, rather than assuming that a mapping will result in locked memory, we test each page as it is pinned to determine whether it locks RAM vs an mmap'd MMIO region. This should result in better locking granularity and less locked page fudge factors in userspace. The unmap path uses the same algorithm as legacy KVM. We don't want to track the pfn for each mapping ourselves, but we need the pfn in order to unpin pages. We therefore ask the iommu for the iova to physical address translation, ask it to unpin a page, and see how many pages were actually unpinned. iommus supporting large pages will often return something bigger than a page here, which we know will be physically contiguous and we can unpin a batch of pfns. iommus not supporting large mappings won't see an improvement in batching here as they only unmap a page at a time. With this change, we also make a clarification to the API for mapping and unmapping DMA. We can only guarantee unmaps at the same granularity as used for the original mapping. In other words, unmapping a subregion of a previous mapping is not guaranteed and may result in a larger or smaller unmapping than requested. The size field in the unmapping structure is updated to reflect this. Previously this was unmodified on mapping, always returning the the requested unmap size. This is now updated to return the actual unmap size on success, allowing userspace to appropriately track mappings. Signed-off-by: Alex Williamson --- drivers/vfio/vfio_iommu_type1.c | 523 ++++++++++++++++++++++++++-------------- include/uapi/linux/vfio.h | 8 +- 2 files changed, 344 insertions(+), 187 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 0e863b3ddcab..6654a7eb42d3 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -60,7 +60,7 @@ struct vfio_dma { struct rb_node node; dma_addr_t iova; /* Device address */ unsigned long vaddr; /* Process virtual addr */ - long npage; /* Number of pages */ + size_t size; /* Map size (bytes) */ int prot; /* IOMMU_READ/WRITE */ }; @@ -74,8 +74,6 @@ struct vfio_group { * into DMA'ble space using the IOMMU */ -#define NPAGE_TO_SIZE(npage) ((size_t)(npage) << PAGE_SHIFT) - static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu, dma_addr_t start, size_t size) { @@ -86,7 +84,7 @@ static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu, if (start + size <= dma->iova) node = node->rb_left; - else if (start >= dma->iova + NPAGE_TO_SIZE(dma->npage)) + else if (start >= dma->iova + dma->size) node = node->rb_right; else return dma; @@ -104,7 +102,7 @@ static void vfio_insert_dma(struct vfio_iommu *iommu, struct vfio_dma *new) parent = *link; dma = rb_entry(parent, struct vfio_dma, node); - if (new->iova + NPAGE_TO_SIZE(new->npage) <= dma->iova) + if (new->iova + new->size <= dma->iova) link = &(*link)->rb_left; else link = &(*link)->rb_right; @@ -144,8 +142,8 @@ static void vfio_lock_acct(long npage) struct vwork *vwork; struct mm_struct *mm; - if (!current->mm) - return; /* process exited */ + if (!current->mm || !npage) + return; /* process exited or nothing to do */ if (down_write_trylock(¤t->mm->mmap_sem)) { current->mm->locked_vm += npage; @@ -217,33 +215,6 @@ static int put_pfn(unsigned long pfn, int prot) return 0; } -/* Unmap DMA region */ -static long __vfio_dma_do_unmap(struct vfio_iommu *iommu, dma_addr_t iova, - long npage, int prot) -{ - long i, unlocked = 0; - - for (i = 0; i < npage; i++, iova += PAGE_SIZE) { - unsigned long pfn; - - pfn = iommu_iova_to_phys(iommu->domain, iova) >> PAGE_SHIFT; - if (pfn) { - iommu_unmap(iommu->domain, iova, PAGE_SIZE); - unlocked += put_pfn(pfn, prot); - } - } - return unlocked; -} - -static void vfio_dma_unmap(struct vfio_iommu *iommu, dma_addr_t iova, - long npage, int prot) -{ - long unlocked; - - unlocked = __vfio_dma_do_unmap(iommu, iova, npage, prot); - vfio_lock_acct(-unlocked); -} - static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn) { struct page *page[1]; @@ -270,79 +241,142 @@ static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn) return ret; } -/* Map DMA region */ -static int __vfio_dma_map(struct vfio_iommu *iommu, dma_addr_t iova, - unsigned long vaddr, long npage, int prot) +/* + * Attempt to pin pages. We really don't want to track all the pfns and + * the iommu can only map chunks of consecutive pfns anyway, so get the + * first page and all consecutive pages with the same locking. + */ +static long vfio_pin_pages(unsigned long vaddr, long npage, + int prot, unsigned long *pfn_base) { - dma_addr_t start = iova; - long i, locked = 0; - int ret; + unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + bool lock_cap = capable(CAP_IPC_LOCK); + long ret, i; - /* Verify that pages are not already mapped */ - for (i = 0; i < npage; i++, iova += PAGE_SIZE) - if (iommu_iova_to_phys(iommu->domain, iova)) - return -EBUSY; + if (!current->mm) + return -ENODEV; - iova = start; + ret = vaddr_get_pfn(vaddr, prot, pfn_base); + if (ret) + return ret; - if (iommu->cache) - prot |= IOMMU_CACHE; + if (is_invalid_reserved_pfn(*pfn_base)) + return 1; - /* - * XXX We break mappings into pages and use get_user_pages_fast to - * pin the pages in memory. It's been suggested that mlock might - * provide a more efficient mechanism, but nothing prevents the - * user from munlocking the pages, which could then allow the user - * access to random host memory. We also have no guarantee from the - * IOMMU API that the iommu driver can unmap sub-pages of previous - * mappings. This means we might lose an entire range if a single - * page within it is unmapped. Single page mappings are inefficient, - * but provide the most flexibility for now. - */ - for (i = 0; i < npage; i++, iova += PAGE_SIZE, vaddr += PAGE_SIZE) { + if (!lock_cap && current->mm->locked_vm + 1 > limit) { + put_pfn(*pfn_base, prot); + pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__, + limit << PAGE_SHIFT); + return -ENOMEM; + } + + /* Lock all the consecutive pages from pfn_base */ + for (i = 1, vaddr += PAGE_SIZE; i < npage; i++, vaddr += PAGE_SIZE) { unsigned long pfn = 0; ret = vaddr_get_pfn(vaddr, prot, &pfn); - if (ret) { - __vfio_dma_do_unmap(iommu, start, i, prot); - return ret; + if (ret) + break; + + if (pfn != *pfn_base + i || is_invalid_reserved_pfn(pfn)) { + put_pfn(pfn, prot); + break; } + if (!lock_cap && current->mm->locked_vm + i + 1 > limit) { + put_pfn(pfn, prot); + pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", + __func__, limit << PAGE_SHIFT); + break; + } + } + + vfio_lock_acct(i); + + return i; +} + +static long vfio_unpin_pages(unsigned long pfn, long npage, + int prot, bool do_accounting) +{ + unsigned long unlocked = 0; + long i; + + for (i = 0; i < npage; i++) + unlocked += put_pfn(pfn++, prot); + + if (do_accounting) + vfio_lock_acct(-unlocked); + + return unlocked; +} + +static int vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma, + dma_addr_t iova, size_t *size) +{ + dma_addr_t start = iova, end = iova + *size; + long unlocked = 0; + + while (iova < end) { + size_t unmapped; + phys_addr_t phys; + /* - * Only add actual locked pages to accounting - * XXX We're effectively marking a page locked for every - * IOVA page even though it's possible the user could be - * backing multiple IOVAs with the same vaddr. This over- - * penalizes the user process, but we currently have no - * easy way to do this properly. + * We use the IOMMU to track the physical address. This + * saves us from having a lot more entries in our mapping + * tree. The downside is that we don't track the size + * used to do the mapping. We request unmap of a single + * page, but expect IOMMUs that support large pages to + * unmap a larger chunk. */ - if (!is_invalid_reserved_pfn(pfn)) - locked++; - - ret = iommu_map(iommu->domain, iova, - (phys_addr_t)pfn << PAGE_SHIFT, - PAGE_SIZE, prot); - if (ret) { - /* Back out mappings on error */ - put_pfn(pfn, prot); - __vfio_dma_do_unmap(iommu, start, i, prot); - return ret; + phys = iommu_iova_to_phys(iommu->domain, iova); + if (WARN_ON(!phys)) { + iova += PAGE_SIZE; + continue; } + + unmapped = iommu_unmap(iommu->domain, iova, PAGE_SIZE); + if (!unmapped) + break; + + unlocked += vfio_unpin_pages(phys >> PAGE_SHIFT, + unmapped >> PAGE_SHIFT, + dma->prot, false); + iova += unmapped; } - vfio_lock_acct(locked); + + vfio_lock_acct(-unlocked); + + *size = iova - start; + return 0; } static int vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start, - size_t size, struct vfio_dma *dma) + size_t *size, struct vfio_dma *dma) { + size_t offset, overlap, tmp; struct vfio_dma *split; - long npage_lo, npage_hi; + int ret; + + /* + * Existing dma region is completely covered, unmap all. This is + * the likely case since userspace tends to map and unmap buffers + * in one shot rather than multiple mappings within a buffer. + */ + if (likely(start <= dma->iova && + start + *size >= dma->iova + dma->size)) { + *size = dma->size; + ret = vfio_unmap_unpin(iommu, dma, dma->iova, size); + if (ret) + return ret; + + /* + * Did we remove more than we have? Should never happen + * since a vfio_dma is contiguous in iova and vaddr. + */ + WARN_ON(*size != dma->size); - /* Existing dma region is completely covered, unmap all */ - if (start <= dma->iova && - start + size >= dma->iova + NPAGE_TO_SIZE(dma->npage)) { - vfio_dma_unmap(iommu, dma->iova, dma->npage, dma->prot); vfio_remove_dma(iommu, dma); kfree(dma); return 0; @@ -350,47 +384,79 @@ static int vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start, /* Overlap low address of existing range */ if (start <= dma->iova) { - size_t overlap; + overlap = start + *size - dma->iova; + ret = vfio_unmap_unpin(iommu, dma, dma->iova, &overlap); + if (ret) + return ret; - overlap = start + size - dma->iova; - npage_lo = overlap >> PAGE_SHIFT; + vfio_remove_dma(iommu, dma); - vfio_dma_unmap(iommu, dma->iova, npage_lo, dma->prot); - dma->iova += overlap; - dma->vaddr += overlap; - dma->npage -= npage_lo; + /* + * Check, we may have removed to whole vfio_dma. If not + * fixup and re-insert. + */ + if (overlap < dma->size) { + dma->iova += overlap; + dma->vaddr += overlap; + dma->size -= overlap; + vfio_insert_dma(iommu, dma); + } + *size = overlap; return 0; } /* Overlap high address of existing range */ - if (start + size >= dma->iova + NPAGE_TO_SIZE(dma->npage)) { - size_t overlap; + if (start + *size >= dma->iova + dma->size) { + offset = start - dma->iova; + overlap = dma->size - offset; - overlap = dma->iova + NPAGE_TO_SIZE(dma->npage) - start; - npage_hi = overlap >> PAGE_SHIFT; + ret = vfio_unmap_unpin(iommu, dma, start, &overlap); + if (ret) + return ret; + + /* + * We may have unmapped the entire vfio_dma if the user is + * trying to unmap a sub-region of what was originally + * mapped. If anything left, we can resize in place since + * iova is unchanged. + */ + if (overlap < dma->size) + dma->size -= overlap; + else + vfio_remove_dma(iommu, dma); - vfio_dma_unmap(iommu, start, npage_hi, dma->prot); - dma->npage -= npage_hi; + *size = overlap; return 0; } /* Split existing */ - npage_lo = (start - dma->iova) >> PAGE_SHIFT; - npage_hi = dma->npage - (size >> PAGE_SHIFT) - npage_lo; + offset = start - dma->iova; - split = kzalloc(sizeof *split, GFP_KERNEL); - if (!split) - return -ENOMEM; + ret = vfio_unmap_unpin(iommu, dma, start, size); + if (ret) + return ret; - vfio_dma_unmap(iommu, start, size >> PAGE_SHIFT, dma->prot); + WARN_ON(!*size); + tmp = dma->size; - dma->npage = npage_lo; + /* + * Resize the lower vfio_dma in place, insert new for remaining + * upper segment. + */ + dma->size = offset; + + if (offset + *size < tmp) { + split = kzalloc(sizeof(*split), GFP_KERNEL); + if (!split) + return -ENOMEM; + + split->size = tmp - offset - *size; + split->iova = dma->iova + offset + *size; + split->vaddr = dma->vaddr + offset + *size; + split->prot = dma->prot; + vfio_insert_dma(iommu, split); + } - split->npage = npage_hi; - split->iova = start + size; - split->vaddr = dma->vaddr + NPAGE_TO_SIZE(npage_lo) + size; - split->prot = dma->prot; - vfio_insert_dma(iommu, split); return 0; } @@ -399,6 +465,7 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu, { uint64_t mask; struct vfio_dma *dma; + size_t unmapped = 0, size; int ret = 0; mask = ((uint64_t)1 << __ffs(iommu->domain->ops->pgsize_bitmap)) - 1; @@ -408,30 +475,66 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu, if (unmap->size & mask) return -EINVAL; - /* XXX We still break these down into PAGE_SIZE */ WARN_ON(mask & PAGE_MASK); mutex_lock(&iommu->lock); - while (!ret && (dma = vfio_find_dma(iommu, - unmap->iova, unmap->size))) - ret = vfio_remove_dma_overlap(iommu, unmap->iova, - unmap->size, dma); + while ((dma = vfio_find_dma(iommu, unmap->iova, unmap->size))) { + size = unmap->size; + ret = vfio_remove_dma_overlap(iommu, unmap->iova, &size, dma); + if (ret) + break; + unmapped += size; + } mutex_unlock(&iommu->lock); + + /* + * We may unmap more than requested, update the unmap struct so + * userspace can know. + */ + unmap->size = unmapped; + + return ret; +} + +/* + * Turns out AMD IOMMU has a page table bug where it won't map large pages + * to a region that previously mapped smaller pages. This should be fixed + * soon, so this is just a temporary workaround to break mappings down into + * PAGE_SIZE. Better to map smaller pages than nothing. + */ +static int map_try_harder(struct vfio_iommu *iommu, dma_addr_t iova, + unsigned long pfn, long npage, int prot) +{ + long i; + int ret; + + for (i = 0; i < npage; i++, pfn++, iova += PAGE_SIZE) { + ret = iommu_map(iommu->domain, iova, + (phys_addr_t)pfn << PAGE_SHIFT, + PAGE_SIZE, prot); + if (ret) + break; + } + + for (; i < npage && i > 0; i--, iova -= PAGE_SIZE) + iommu_unmap(iommu->domain, iova, PAGE_SIZE); + return ret; } static int vfio_dma_do_map(struct vfio_iommu *iommu, struct vfio_iommu_type1_dma_map *map) { - struct vfio_dma *dma; - dma_addr_t iova = map->iova; - unsigned long locked, lock_limit, vaddr = map->vaddr; + dma_addr_t end, iova; + unsigned long vaddr = map->vaddr; size_t size = map->size; + long npage; int ret = 0, prot = 0; uint64_t mask; - long npage; + + end = map->iova + map->size; mask = ((uint64_t)1 << __ffs(iommu->domain->ops->pgsize_bitmap)) - 1; @@ -444,92 +547,138 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu, if (!prot) return -EINVAL; /* No READ/WRITE? */ + if (iommu->cache) + prot |= IOMMU_CACHE; + if (vaddr & mask) return -EINVAL; - if (iova & mask) + if (map->iova & mask) return -EINVAL; - if (size & mask) + if (!map->size || map->size & mask) return -EINVAL; - /* XXX We still break these down into PAGE_SIZE */ WARN_ON(mask & PAGE_MASK); /* Don't allow IOVA wrap */ - if (iova + size && iova + size < iova) + if (end && end < map->iova) return -EINVAL; /* Don't allow virtual address wrap */ - if (vaddr + size && vaddr + size < vaddr) - return -EINVAL; - - npage = size >> PAGE_SHIFT; - if (!npage) + if (vaddr + map->size && vaddr + map->size < vaddr) return -EINVAL; - dma = kzalloc(sizeof *dma, GFP_KERNEL); - if (!dma) - return -ENOMEM; - mutex_lock(&iommu->lock); - if (vfio_find_dma(iommu, iova, size)) { - ret = -EBUSY; - goto out_lock; + if (vfio_find_dma(iommu, map->iova, map->size)) { + mutex_unlock(&iommu->lock); + return -EEXIST; } - /* account for locked pages */ - locked = current->mm->locked_vm + npage; - lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - if (locked > lock_limit && !capable(CAP_IPC_LOCK)) { - pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", - __func__, rlimit(RLIMIT_MEMLOCK)); - ret = -ENOMEM; - goto out_lock; - } + for (iova = map->iova; iova < end; iova += size, vaddr += size) { + struct vfio_dma *dma = NULL; + unsigned long pfn; + long i; + + /* Pin a contiguous chunk of memory */ + npage = vfio_pin_pages(vaddr, (end - iova) >> PAGE_SHIFT, + prot, &pfn); + if (npage <= 0) { + WARN_ON(!npage); + ret = (int)npage; + break; + } - ret = __vfio_dma_map(iommu, iova, vaddr, npage, prot); - if (ret) - goto out_lock; - - dma->npage = npage; - dma->iova = iova; - dma->vaddr = vaddr; - dma->prot = prot; - - /* Check if we abut a region below - nothing below 0 */ - if (iova) { - struct vfio_dma *tmp = vfio_find_dma(iommu, iova - 1, 1); - if (tmp && tmp->prot == prot && - tmp->vaddr + NPAGE_TO_SIZE(tmp->npage) == vaddr) { - vfio_remove_dma(iommu, tmp); - dma->npage += tmp->npage; - dma->iova = iova = tmp->iova; - dma->vaddr = vaddr = tmp->vaddr; - kfree(tmp); - npage = dma->npage; - size = NPAGE_TO_SIZE(npage); + /* Verify pages are not already mapped */ + for (i = 0; i < npage; i++) { + if (iommu_iova_to_phys(iommu->domain, + iova + (i << PAGE_SHIFT))) { + vfio_unpin_pages(pfn, npage, prot, true); + ret = -EBUSY; + break; + } + } + + ret = iommu_map(iommu->domain, iova, + (phys_addr_t)pfn << PAGE_SHIFT, + npage << PAGE_SHIFT, prot); + if (ret) { + if (ret != -EBUSY || + map_try_harder(iommu, iova, pfn, npage, prot)) { + vfio_unpin_pages(pfn, npage, prot, true); + break; + } + } + + size = npage << PAGE_SHIFT; + + /* + * Check if we abut a region below - nothing below 0. + * This is the most likely case when mapping chunks of + * physically contiguous regions within a virtual address + * range. Update the abutting entry in place since iova + * doesn't change. + */ + if (likely(iova)) { + struct vfio_dma *tmp; + tmp = vfio_find_dma(iommu, iova - 1, 1); + if (tmp && tmp->prot == prot && + tmp->vaddr + tmp->size == vaddr) { + tmp->size += size; + + iova = tmp->iova; + size = tmp->size; + vaddr = tmp->vaddr; + dma = tmp; + } + } + + /* Check if we abut a region above - nothing above ~0 + 1 */ + if (likely(iova + size)) { + struct vfio_dma *tmp; + + tmp = vfio_find_dma(iommu, iova + size, 1); + if (tmp && tmp->prot == prot && + tmp->vaddr == vaddr + size) { + vfio_remove_dma(iommu, tmp); + if (dma) + dma->size += tmp->size; + else + size += tmp->size; + kfree(tmp); + } } - } - /* Check if we abut a region above - nothing above ~0 + 1 */ - if (iova + size) { - struct vfio_dma *tmp = vfio_find_dma(iommu, iova + size, 1); - if (tmp && tmp->prot == prot && - tmp->vaddr == vaddr + size) { - vfio_remove_dma(iommu, tmp); - dma->npage += tmp->npage; - kfree(tmp); - npage = dma->npage; - size = NPAGE_TO_SIZE(npage); + if (!dma) { + dma = kzalloc(sizeof(*dma), GFP_KERNEL); + if (!dma) { + iommu_unmap(iommu->domain, iova, size); + vfio_unpin_pages(pfn, npage, prot, true); + ret = -ENOMEM; + break; + } + + dma->size = size; + dma->iova = iova; + dma->vaddr = vaddr; + dma->prot = prot; + vfio_insert_dma(iommu, dma); } } - vfio_insert_dma(iommu, dma); + if (ret) { + struct vfio_dma *tmp; + iova = map->iova; + size = map->size; + while ((tmp = vfio_find_dma(iommu, iova, size))) { + if (vfio_remove_dma_overlap(iommu, iova, &size, tmp)) { + pr_warn("%s: Error rolling back failed map\n", + __func__); + break; + } + } + } -out_lock: mutex_unlock(&iommu->lock); - if (ret) - kfree(dma); return ret; } @@ -651,9 +800,8 @@ static void vfio_iommu_type1_release(void *iommu_data) while ((node = rb_first(&iommu->dma_list))) { struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node); - vfio_dma_unmap(iommu, dma->iova, dma->npage, dma->prot); - vfio_remove_dma(iommu, dma); - kfree(dma); + size_t size = dma->size; + vfio_remove_dma_overlap(iommu, dma->iova, &size, dma); } iommu_domain_free(iommu->domain); @@ -708,6 +856,7 @@ static long vfio_iommu_type1_ioctl(void *iommu_data, } else if (cmd == VFIO_IOMMU_UNMAP_DMA) { struct vfio_iommu_type1_dma_unmap unmap; + long ret; minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size); @@ -717,7 +866,11 @@ static long vfio_iommu_type1_ioctl(void *iommu_data, if (unmap.argsz < minsz || unmap.flags) return -EINVAL; - return vfio_dma_do_unmap(iommu, &unmap); + ret = vfio_dma_do_unmap(iommu, &unmap); + if (ret) + return ret; + + return copy_to_user((void __user *)arg, &unmap, minsz); } return -ENOTTY; diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 284ff2436829..513600612995 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -361,10 +361,14 @@ struct vfio_iommu_type1_dma_map { #define VFIO_IOMMU_MAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 13) /** - * VFIO_IOMMU_UNMAP_DMA - _IOW(VFIO_TYPE, VFIO_BASE + 14, struct vfio_dma_unmap) + * VFIO_IOMMU_UNMAP_DMA - _IOWR(VFIO_TYPE, VFIO_BASE + 14, + * struct vfio_dma_unmap) * * Unmap IO virtual addresses using the provided struct vfio_dma_unmap. - * Caller sets argsz. + * Caller sets argsz. The actual unmapped size is returned in the size + * field. No guarantee is made to the user that arbitrary unmaps of iova + * or size different from those used in the original mapping call will + * succeed. */ struct vfio_iommu_type1_dma_unmap { __u32 argsz; -- cgit v1.2.3 From 5c6c2b21ecc9adfc47cfaf93404937b6ecd8395d Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Fri, 21 Jun 2013 09:38:11 -0600 Subject: vfio: Provide module option to disable vfio_iommu_type1 hugepage support Add a module option to vfio_iommu_type1 to disable IOMMU hugepage support. This causes iommu_map to only be called with single page mappings, disabling the IOMMU driver's ability to use hugepages. This option can be enabled by loading vfio_iommu_type1 with disable_hugepages=1 or dynamically through sysfs. If enabled dynamically, only new mappings are restricted. Signed-off-by: Alex Williamson --- drivers/vfio/vfio_iommu_type1.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'drivers/vfio') diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 6654a7eb42d3..8a2be4e40f22 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -48,6 +48,12 @@ module_param_named(allow_unsafe_interrupts, MODULE_PARM_DESC(allow_unsafe_interrupts, "Enable VFIO IOMMU support for on platforms without interrupt remapping support."); +static bool disable_hugepages; +module_param_named(disable_hugepages, + disable_hugepages, bool, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(disable_hugepages, + "Disable VFIO IOMMU support for IOMMU hugepages."); + struct vfio_iommu { struct iommu_domain *domain; struct mutex lock; @@ -270,6 +276,11 @@ static long vfio_pin_pages(unsigned long vaddr, long npage, return -ENOMEM; } + if (unlikely(disable_hugepages)) { + vfio_lock_acct(1); + return 1; + } + /* Lock all the consecutive pages from pfn_base */ for (i = 1, vaddr += PAGE_SIZE; i < npage; i++, vaddr += PAGE_SIZE) { unsigned long pfn = 0; -- cgit v1.2.3 From f5bfdbf252ad54aa4b6f765cbe89750087fcc155 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Tue, 25 Jun 2013 16:01:44 -0600 Subject: vfio/type1: Fix missed frees and zero sized removes With hugepage support we can only properly aligned and sized ranges. We only guarantee that we can unmap the same ranges mapped and not arbitrary sub-ranges. This means we might not free anything or might free more than requested. The vfio unmap interface started storing the unmapped size to return to userspace to handle this. This patch fixes a few places where we don't properly handle those cases, moves a memory allocation to a place where failure is an option and checks our loops to make sure we don't get into an infinite loop trying to remove an overlap. Signed-off-by: Alex Williamson --- drivers/vfio/vfio_iommu_type1.c | 77 ++++++++++++++++++++++------------------- 1 file changed, 42 insertions(+), 35 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 8a2be4e40f22..98231d10890c 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -370,6 +370,9 @@ static int vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start, struct vfio_dma *split; int ret; + if (!*size) + return 0; + /* * Existing dma region is completely covered, unmap all. This is * the likely case since userspace tends to map and unmap buffers @@ -411,7 +414,9 @@ static int vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start, dma->vaddr += overlap; dma->size -= overlap; vfio_insert_dma(iommu, dma); - } + } else + kfree(dma); + *size = overlap; return 0; } @@ -425,48 +430,41 @@ static int vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start, if (ret) return ret; - /* - * We may have unmapped the entire vfio_dma if the user is - * trying to unmap a sub-region of what was originally - * mapped. If anything left, we can resize in place since - * iova is unchanged. - */ - if (overlap < dma->size) - dma->size -= overlap; - else - vfio_remove_dma(iommu, dma); - + dma->size -= overlap; *size = overlap; return 0; } /* Split existing */ + split = kzalloc(sizeof(*split), GFP_KERNEL); + if (!split) + return -ENOMEM; + offset = start - dma->iova; ret = vfio_unmap_unpin(iommu, dma, start, size); if (ret) return ret; - WARN_ON(!*size); + if (!*size) { + kfree(split); + return -EINVAL; + } + tmp = dma->size; - /* - * Resize the lower vfio_dma in place, insert new for remaining - * upper segment. - */ + /* Resize the lower vfio_dma in place, before the below insert */ dma->size = offset; - if (offset + *size < tmp) { - split = kzalloc(sizeof(*split), GFP_KERNEL); - if (!split) - return -ENOMEM; - + /* Insert new for remainder, assuming it didn't all get unmapped */ + if (likely(offset + *size < tmp)) { split->size = tmp - offset - *size; split->iova = dma->iova + offset + *size; split->vaddr = dma->vaddr + offset + *size; split->prot = dma->prot; vfio_insert_dma(iommu, split); - } + } else + kfree(split); return 0; } @@ -483,7 +481,7 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu, if (unmap->iova & mask) return -EINVAL; - if (unmap->size & mask) + if (!unmap->size || unmap->size & mask) return -EINVAL; WARN_ON(mask & PAGE_MASK); @@ -493,7 +491,7 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu, while ((dma = vfio_find_dma(iommu, unmap->iova, unmap->size))) { size = unmap->size; ret = vfio_remove_dma_overlap(iommu, unmap->iova, &size, dma); - if (ret) + if (ret || !size) break; unmapped += size; } @@ -635,7 +633,6 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu, if (tmp && tmp->prot == prot && tmp->vaddr + tmp->size == vaddr) { tmp->size += size; - iova = tmp->iova; size = tmp->size; vaddr = tmp->vaddr; @@ -643,19 +640,28 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu, } } - /* Check if we abut a region above - nothing above ~0 + 1 */ + /* + * Check if we abut a region above - nothing above ~0 + 1. + * If we abut above and below, remove and free. If only + * abut above, remove, modify, reinsert. + */ if (likely(iova + size)) { struct vfio_dma *tmp; - tmp = vfio_find_dma(iommu, iova + size, 1); if (tmp && tmp->prot == prot && tmp->vaddr == vaddr + size) { vfio_remove_dma(iommu, tmp); - if (dma) + if (dma) { dma->size += tmp->size; - else + kfree(tmp); + } else { size += tmp->size; - kfree(tmp); + tmp->size = size; + tmp->iova = iova; + tmp->vaddr = vaddr; + vfio_insert_dma(iommu, tmp); + dma = tmp; + } } } @@ -681,11 +687,10 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu, iova = map->iova; size = map->size; while ((tmp = vfio_find_dma(iommu, iova, size))) { - if (vfio_remove_dma_overlap(iommu, iova, &size, tmp)) { - pr_warn("%s: Error rolling back failed map\n", - __func__); + int r = vfio_remove_dma_overlap(iommu, iova, + &size, tmp); + if (WARN_ON(r || !size)) break; - } } } @@ -813,6 +818,8 @@ static void vfio_iommu_type1_release(void *iommu_data) struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node); size_t size = dma->size; vfio_remove_dma_overlap(iommu, dma->iova, &size, dma); + if (WARN_ON(!size)) + break; } iommu_domain_free(iommu->domain); -- cgit v1.2.3 From 6d6768c61b39a2184bc11bb0342e4f7f35642bcc Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Tue, 25 Jun 2013 16:06:54 -0600 Subject: vfio: Limit group opens vfio_group_fops_open attempts to limit concurrent sessions by disallowing opens once group->container is set. This really doesn't do what we want and allow for inconsistent behavior, for instance a group can be opened twice, then a container set giving the user two file descriptors to the group. But then it won't allow more to be opened. There's not much reason to have the group opened multiple times since most access is through devices or the container, so complete what the original code intended and only allow a single instance. Signed-off-by: Alex Williamson --- drivers/vfio/vfio.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'drivers/vfio') diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index 6d78736563de..d30f44da8762 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -76,6 +76,7 @@ struct vfio_group { struct notifier_block nb; struct list_head vfio_next; struct list_head container_next; + atomic_t opened; }; struct vfio_device { @@ -206,6 +207,7 @@ static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group) INIT_LIST_HEAD(&group->device_list); mutex_init(&group->device_lock); atomic_set(&group->container_users, 0); + atomic_set(&group->opened, 0); group->iommu_group = iommu_group; group->nb.notifier_call = vfio_iommu_group_notifier; @@ -1236,12 +1238,22 @@ static long vfio_group_fops_compat_ioctl(struct file *filep, static int vfio_group_fops_open(struct inode *inode, struct file *filep) { struct vfio_group *group; + int opened; group = vfio_group_get_from_minor(iminor(inode)); if (!group) return -ENODEV; + /* Do we need multiple instances of the group open? Seems not. */ + opened = atomic_cmpxchg(&group->opened, 0, 1); + if (opened) { + vfio_group_put(group); + return -EBUSY; + } + + /* Is something still in use from a previous open? */ if (group->container) { + atomic_dec(&group->opened); vfio_group_put(group); return -EBUSY; } @@ -1259,6 +1271,8 @@ static int vfio_group_fops_release(struct inode *inode, struct file *filep) vfio_group_try_dissolve_container(group); + atomic_dec(&group->opened); + vfio_group_put(group); return 0; -- cgit v1.2.3 From 8d38ef1948bd415a5cb653a5c0ec16f3402aaca1 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Mon, 1 Jul 2013 08:28:58 -0600 Subject: vfio/type1: Fix leak on error path We also don't handle unpinning zero pages as an error on other exits so we can fix that inconsistency by rolling in the next conditional return. Reported-by: Dan Carpenter Signed-off-by: Alex Williamson --- drivers/vfio/vfio_iommu_type1.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 98231d10890c..a9807dea3887 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -436,6 +436,12 @@ static int vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start, } /* Split existing */ + + /* + * Allocate our tracking structure early even though it may not + * be used. An Allocation failure later loses track of pages and + * is more difficult to unwind. + */ split = kzalloc(sizeof(*split), GFP_KERNEL); if (!split) return -ENOMEM; @@ -443,12 +449,9 @@ static int vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start, offset = start - dma->iova; ret = vfio_unmap_unpin(iommu, dma, start, size); - if (ret) - return ret; - - if (!*size) { + if (ret || !*size) { kfree(split); - return -EINVAL; + return ret; } tmp = dma->size; -- cgit v1.2.3