diff options
-rw-r--r-- | drivers/vfio/vfio_iommu_type1.c | 523 | ||||
-rw-r--r-- | include/uapi/linux/vfio.h | 8 |
2 files changed, 344 insertions, 187 deletions
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 0e863b3ddcab..6654a7eb42d3 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -60,7 +60,7 @@ struct vfio_dma { struct rb_node node; dma_addr_t iova; /* Device address */ unsigned long vaddr; /* Process virtual addr */ - long npage; /* Number of pages */ + size_t size; /* Map size (bytes) */ int prot; /* IOMMU_READ/WRITE */ }; @@ -74,8 +74,6 @@ struct vfio_group { * into DMA'ble space using the IOMMU */ -#define NPAGE_TO_SIZE(npage) ((size_t)(npage) << PAGE_SHIFT) - static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu, dma_addr_t start, size_t size) { @@ -86,7 +84,7 @@ static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu, if (start + size <= dma->iova) node = node->rb_left; - else if (start >= dma->iova + NPAGE_TO_SIZE(dma->npage)) + else if (start >= dma->iova + dma->size) node = node->rb_right; else return dma; @@ -104,7 +102,7 @@ static void vfio_insert_dma(struct vfio_iommu *iommu, struct vfio_dma *new) parent = *link; dma = rb_entry(parent, struct vfio_dma, node); - if (new->iova + NPAGE_TO_SIZE(new->npage) <= dma->iova) + if (new->iova + new->size <= dma->iova) link = &(*link)->rb_left; else link = &(*link)->rb_right; @@ -144,8 +142,8 @@ static void vfio_lock_acct(long npage) struct vwork *vwork; struct mm_struct *mm; - if (!current->mm) - return; /* process exited */ + if (!current->mm || !npage) + return; /* process exited or nothing to do */ if (down_write_trylock(¤t->mm->mmap_sem)) { current->mm->locked_vm += npage; @@ -217,33 +215,6 @@ static int put_pfn(unsigned long pfn, int prot) return 0; } -/* Unmap DMA region */ -static long __vfio_dma_do_unmap(struct vfio_iommu *iommu, dma_addr_t iova, - long npage, int prot) -{ - long i, unlocked = 0; - - for (i = 0; i < npage; i++, iova += PAGE_SIZE) { - unsigned long pfn; - - pfn = iommu_iova_to_phys(iommu->domain, iova) >> PAGE_SHIFT; - if (pfn) { - iommu_unmap(iommu->domain, iova, PAGE_SIZE); - unlocked += put_pfn(pfn, prot); - } - } - return unlocked; -} - -static void vfio_dma_unmap(struct vfio_iommu *iommu, dma_addr_t iova, - long npage, int prot) -{ - long unlocked; - - unlocked = __vfio_dma_do_unmap(iommu, iova, npage, prot); - vfio_lock_acct(-unlocked); -} - static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn) { struct page *page[1]; @@ -270,79 +241,142 @@ static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn) return ret; } -/* Map DMA region */ -static int __vfio_dma_map(struct vfio_iommu *iommu, dma_addr_t iova, - unsigned long vaddr, long npage, int prot) +/* + * Attempt to pin pages. We really don't want to track all the pfns and + * the iommu can only map chunks of consecutive pfns anyway, so get the + * first page and all consecutive pages with the same locking. + */ +static long vfio_pin_pages(unsigned long vaddr, long npage, + int prot, unsigned long *pfn_base) { - dma_addr_t start = iova; - long i, locked = 0; - int ret; + unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + bool lock_cap = capable(CAP_IPC_LOCK); + long ret, i; - /* Verify that pages are not already mapped */ - for (i = 0; i < npage; i++, iova += PAGE_SIZE) - if (iommu_iova_to_phys(iommu->domain, iova)) - return -EBUSY; + if (!current->mm) + return -ENODEV; - iova = start; + ret = vaddr_get_pfn(vaddr, prot, pfn_base); + if (ret) + return ret; - if (iommu->cache) - prot |= IOMMU_CACHE; + if (is_invalid_reserved_pfn(*pfn_base)) + return 1; - /* - * XXX We break mappings into pages and use get_user_pages_fast to - * pin the pages in memory. It's been suggested that mlock might - * provide a more efficient mechanism, but nothing prevents the - * user from munlocking the pages, which could then allow the user - * access to random host memory. We also have no guarantee from the - * IOMMU API that the iommu driver can unmap sub-pages of previous - * mappings. This means we might lose an entire range if a single - * page within it is unmapped. Single page mappings are inefficient, - * but provide the most flexibility for now. - */ - for (i = 0; i < npage; i++, iova += PAGE_SIZE, vaddr += PAGE_SIZE) { + if (!lock_cap && current->mm->locked_vm + 1 > limit) { + put_pfn(*pfn_base, prot); + pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__, + limit << PAGE_SHIFT); + return -ENOMEM; + } + + /* Lock all the consecutive pages from pfn_base */ + for (i = 1, vaddr += PAGE_SIZE; i < npage; i++, vaddr += PAGE_SIZE) { unsigned long pfn = 0; ret = vaddr_get_pfn(vaddr, prot, &pfn); - if (ret) { - __vfio_dma_do_unmap(iommu, start, i, prot); - return ret; + if (ret) + break; + + if (pfn != *pfn_base + i || is_invalid_reserved_pfn(pfn)) { + put_pfn(pfn, prot); + break; } + if (!lock_cap && current->mm->locked_vm + i + 1 > limit) { + put_pfn(pfn, prot); + pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", + __func__, limit << PAGE_SHIFT); + break; + } + } + + vfio_lock_acct(i); + + return i; +} + +static long vfio_unpin_pages(unsigned long pfn, long npage, + int prot, bool do_accounting) +{ + unsigned long unlocked = 0; + long i; + + for (i = 0; i < npage; i++) + unlocked += put_pfn(pfn++, prot); + + if (do_accounting) + vfio_lock_acct(-unlocked); + + return unlocked; +} + +static int vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma, + dma_addr_t iova, size_t *size) +{ + dma_addr_t start = iova, end = iova + *size; + long unlocked = 0; + + while (iova < end) { + size_t unmapped; + phys_addr_t phys; + /* - * Only add actual locked pages to accounting - * XXX We're effectively marking a page locked for every - * IOVA page even though it's possible the user could be - * backing multiple IOVAs with the same vaddr. This over- - * penalizes the user process, but we currently have no - * easy way to do this properly. + * We use the IOMMU to track the physical address. This + * saves us from having a lot more entries in our mapping + * tree. The downside is that we don't track the size + * used to do the mapping. We request unmap of a single + * page, but expect IOMMUs that support large pages to + * unmap a larger chunk. */ - if (!is_invalid_reserved_pfn(pfn)) - locked++; - - ret = iommu_map(iommu->domain, iova, - (phys_addr_t)pfn << PAGE_SHIFT, - PAGE_SIZE, prot); - if (ret) { - /* Back out mappings on error */ - put_pfn(pfn, prot); - __vfio_dma_do_unmap(iommu, start, i, prot); - return ret; + phys = iommu_iova_to_phys(iommu->domain, iova); + if (WARN_ON(!phys)) { + iova += PAGE_SIZE; + continue; } + + unmapped = iommu_unmap(iommu->domain, iova, PAGE_SIZE); + if (!unmapped) + break; + + unlocked += vfio_unpin_pages(phys >> PAGE_SHIFT, + unmapped >> PAGE_SHIFT, + dma->prot, false); + iova += unmapped; } - vfio_lock_acct(locked); + + vfio_lock_acct(-unlocked); + + *size = iova - start; + return 0; } static int vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start, - size_t size, struct vfio_dma *dma) + size_t *size, struct vfio_dma *dma) { + size_t offset, overlap, tmp; struct vfio_dma *split; - long npage_lo, npage_hi; + int ret; + + /* + * Existing dma region is completely covered, unmap all. This is + * the likely case since userspace tends to map and unmap buffers + * in one shot rather than multiple mappings within a buffer. + */ + if (likely(start <= dma->iova && + start + *size >= dma->iova + dma->size)) { + *size = dma->size; + ret = vfio_unmap_unpin(iommu, dma, dma->iova, size); + if (ret) + return ret; + + /* + * Did we remove more than we have? Should never happen + * since a vfio_dma is contiguous in iova and vaddr. + */ + WARN_ON(*size != dma->size); - /* Existing dma region is completely covered, unmap all */ - if (start <= dma->iova && - start + size >= dma->iova + NPAGE_TO_SIZE(dma->npage)) { - vfio_dma_unmap(iommu, dma->iova, dma->npage, dma->prot); vfio_remove_dma(iommu, dma); kfree(dma); return 0; @@ -350,47 +384,79 @@ static int vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start, /* Overlap low address of existing range */ if (start <= dma->iova) { - size_t overlap; + overlap = start + *size - dma->iova; + ret = vfio_unmap_unpin(iommu, dma, dma->iova, &overlap); + if (ret) + return ret; - overlap = start + size - dma->iova; - npage_lo = overlap >> PAGE_SHIFT; + vfio_remove_dma(iommu, dma); - vfio_dma_unmap(iommu, dma->iova, npage_lo, dma->prot); - dma->iova += overlap; - dma->vaddr += overlap; - dma->npage -= npage_lo; + /* + * Check, we may have removed to whole vfio_dma. If not + * fixup and re-insert. + */ + if (overlap < dma->size) { + dma->iova += overlap; + dma->vaddr += overlap; + dma->size -= overlap; + vfio_insert_dma(iommu, dma); + } + *size = overlap; return 0; } /* Overlap high address of existing range */ - if (start + size >= dma->iova + NPAGE_TO_SIZE(dma->npage)) { - size_t overlap; + if (start + *size >= dma->iova + dma->size) { + offset = start - dma->iova; + overlap = dma->size - offset; - overlap = dma->iova + NPAGE_TO_SIZE(dma->npage) - start; - npage_hi = overlap >> PAGE_SHIFT; + ret = vfio_unmap_unpin(iommu, dma, start, &overlap); + if (ret) + return ret; + + /* + * We may have unmapped the entire vfio_dma if the user is + * trying to unmap a sub-region of what was originally + * mapped. If anything left, we can resize in place since + * iova is unchanged. + */ + if (overlap < dma->size) + dma->size -= overlap; + else + vfio_remove_dma(iommu, dma); - vfio_dma_unmap(iommu, start, npage_hi, dma->prot); - dma->npage -= npage_hi; + *size = overlap; return 0; } /* Split existing */ - npage_lo = (start - dma->iova) >> PAGE_SHIFT; - npage_hi = dma->npage - (size >> PAGE_SHIFT) - npage_lo; + offset = start - dma->iova; - split = kzalloc(sizeof *split, GFP_KERNEL); - if (!split) - return -ENOMEM; + ret = vfio_unmap_unpin(iommu, dma, start, size); + if (ret) + return ret; - vfio_dma_unmap(iommu, start, size >> PAGE_SHIFT, dma->prot); + WARN_ON(!*size); + tmp = dma->size; - dma->npage = npage_lo; + /* + * Resize the lower vfio_dma in place, insert new for remaining + * upper segment. + */ + dma->size = offset; + + if (offset + *size < tmp) { + split = kzalloc(sizeof(*split), GFP_KERNEL); + if (!split) + return -ENOMEM; + + split->size = tmp - offset - *size; + split->iova = dma->iova + offset + *size; + split->vaddr = dma->vaddr + offset + *size; + split->prot = dma->prot; + vfio_insert_dma(iommu, split); + } - split->npage = npage_hi; - split->iova = start + size; - split->vaddr = dma->vaddr + NPAGE_TO_SIZE(npage_lo) + size; - split->prot = dma->prot; - vfio_insert_dma(iommu, split); return 0; } @@ -399,6 +465,7 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu, { uint64_t mask; struct vfio_dma *dma; + size_t unmapped = 0, size; int ret = 0; mask = ((uint64_t)1 << __ffs(iommu->domain->ops->pgsize_bitmap)) - 1; @@ -408,30 +475,66 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu, if (unmap->size & mask) return -EINVAL; - /* XXX We still break these down into PAGE_SIZE */ WARN_ON(mask & PAGE_MASK); mutex_lock(&iommu->lock); - while (!ret && (dma = vfio_find_dma(iommu, - unmap->iova, unmap->size))) - ret = vfio_remove_dma_overlap(iommu, unmap->iova, - unmap->size, dma); + while ((dma = vfio_find_dma(iommu, unmap->iova, unmap->size))) { + size = unmap->size; + ret = vfio_remove_dma_overlap(iommu, unmap->iova, &size, dma); + if (ret) + break; + unmapped += size; + } mutex_unlock(&iommu->lock); + + /* + * We may unmap more than requested, update the unmap struct so + * userspace can know. + */ + unmap->size = unmapped; + + return ret; +} + +/* + * Turns out AMD IOMMU has a page table bug where it won't map large pages + * to a region that previously mapped smaller pages. This should be fixed + * soon, so this is just a temporary workaround to break mappings down into + * PAGE_SIZE. Better to map smaller pages than nothing. + */ +static int map_try_harder(struct vfio_iommu *iommu, dma_addr_t iova, + unsigned long pfn, long npage, int prot) +{ + long i; + int ret; + + for (i = 0; i < npage; i++, pfn++, iova += PAGE_SIZE) { + ret = iommu_map(iommu->domain, iova, + (phys_addr_t)pfn << PAGE_SHIFT, + PAGE_SIZE, prot); + if (ret) + break; + } + + for (; i < npage && i > 0; i--, iova -= PAGE_SIZE) + iommu_unmap(iommu->domain, iova, PAGE_SIZE); + return ret; } static int vfio_dma_do_map(struct vfio_iommu *iommu, struct vfio_iommu_type1_dma_map *map) { - struct vfio_dma *dma; - dma_addr_t iova = map->iova; - unsigned long locked, lock_limit, vaddr = map->vaddr; + dma_addr_t end, iova; + unsigned long vaddr = map->vaddr; size_t size = map->size; + long npage; int ret = 0, prot = 0; uint64_t mask; - long npage; + + end = map->iova + map->size; mask = ((uint64_t)1 << __ffs(iommu->domain->ops->pgsize_bitmap)) - 1; @@ -444,92 +547,138 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu, if (!prot) return -EINVAL; /* No READ/WRITE? */ + if (iommu->cache) + prot |= IOMMU_CACHE; + if (vaddr & mask) return -EINVAL; - if (iova & mask) + if (map->iova & mask) return -EINVAL; - if (size & mask) + if (!map->size || map->size & mask) return -EINVAL; - /* XXX We still break these down into PAGE_SIZE */ WARN_ON(mask & PAGE_MASK); /* Don't allow IOVA wrap */ - if (iova + size && iova + size < iova) + if (end && end < map->iova) return -EINVAL; /* Don't allow virtual address wrap */ - if (vaddr + size && vaddr + size < vaddr) - return -EINVAL; - - npage = size >> PAGE_SHIFT; - if (!npage) + if (vaddr + map->size && vaddr + map->size < vaddr) return -EINVAL; - dma = kzalloc(sizeof *dma, GFP_KERNEL); - if (!dma) - return -ENOMEM; - mutex_lock(&iommu->lock); - if (vfio_find_dma(iommu, iova, size)) { - ret = -EBUSY; - goto out_lock; + if (vfio_find_dma(iommu, map->iova, map->size)) { + mutex_unlock(&iommu->lock); + return -EEXIST; } - /* account for locked pages */ - locked = current->mm->locked_vm + npage; - lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - if (locked > lock_limit && !capable(CAP_IPC_LOCK)) { - pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", - __func__, rlimit(RLIMIT_MEMLOCK)); - ret = -ENOMEM; - goto out_lock; - } + for (iova = map->iova; iova < end; iova += size, vaddr += size) { + struct vfio_dma *dma = NULL; + unsigned long pfn; + long i; + + /* Pin a contiguous chunk of memory */ + npage = vfio_pin_pages(vaddr, (end - iova) >> PAGE_SHIFT, + prot, &pfn); + if (npage <= 0) { + WARN_ON(!npage); + ret = (int)npage; + break; + } - ret = __vfio_dma_map(iommu, iova, vaddr, npage, prot); - if (ret) - goto out_lock; - - dma->npage = npage; - dma->iova = iova; - dma->vaddr = vaddr; - dma->prot = prot; - - /* Check if we abut a region below - nothing below 0 */ - if (iova) { - struct vfio_dma *tmp = vfio_find_dma(iommu, iova - 1, 1); - if (tmp && tmp->prot == prot && - tmp->vaddr + NPAGE_TO_SIZE(tmp->npage) == vaddr) { - vfio_remove_dma(iommu, tmp); - dma->npage += tmp->npage; - dma->iova = iova = tmp->iova; - dma->vaddr = vaddr = tmp->vaddr; - kfree(tmp); - npage = dma->npage; - size = NPAGE_TO_SIZE(npage); + /* Verify pages are not already mapped */ + for (i = 0; i < npage; i++) { + if (iommu_iova_to_phys(iommu->domain, + iova + (i << PAGE_SHIFT))) { + vfio_unpin_pages(pfn, npage, prot, true); + ret = -EBUSY; + break; + } + } + + ret = iommu_map(iommu->domain, iova, + (phys_addr_t)pfn << PAGE_SHIFT, + npage << PAGE_SHIFT, prot); + if (ret) { + if (ret != -EBUSY || + map_try_harder(iommu, iova, pfn, npage, prot)) { + vfio_unpin_pages(pfn, npage, prot, true); + break; + } + } + + size = npage << PAGE_SHIFT; + + /* + * Check if we abut a region below - nothing below 0. + * This is the most likely case when mapping chunks of + * physically contiguous regions within a virtual address + * range. Update the abutting entry in place since iova + * doesn't change. + */ + if (likely(iova)) { + struct vfio_dma *tmp; + tmp = vfio_find_dma(iommu, iova - 1, 1); + if (tmp && tmp->prot == prot && + tmp->vaddr + tmp->size == vaddr) { + tmp->size += size; + + iova = tmp->iova; + size = tmp->size; + vaddr = tmp->vaddr; + dma = tmp; + } + } + + /* Check if we abut a region above - nothing above ~0 + 1 */ + if (likely(iova + size)) { + struct vfio_dma *tmp; + + tmp = vfio_find_dma(iommu, iova + size, 1); + if (tmp && tmp->prot == prot && + tmp->vaddr == vaddr + size) { + vfio_remove_dma(iommu, tmp); + if (dma) + dma->size += tmp->size; + else + size += tmp->size; + kfree(tmp); + } } - } - /* Check if we abut a region above - nothing above ~0 + 1 */ - if (iova + size) { - struct vfio_dma *tmp = vfio_find_dma(iommu, iova + size, 1); - if (tmp && tmp->prot == prot && - tmp->vaddr == vaddr + size) { - vfio_remove_dma(iommu, tmp); - dma->npage += tmp->npage; - kfree(tmp); - npage = dma->npage; - size = NPAGE_TO_SIZE(npage); + if (!dma) { + dma = kzalloc(sizeof(*dma), GFP_KERNEL); + if (!dma) { + iommu_unmap(iommu->domain, iova, size); + vfio_unpin_pages(pfn, npage, prot, true); + ret = -ENOMEM; + break; + } + + dma->size = size; + dma->iova = iova; + dma->vaddr = vaddr; + dma->prot = prot; + vfio_insert_dma(iommu, dma); } } - vfio_insert_dma(iommu, dma); + if (ret) { + struct vfio_dma *tmp; + iova = map->iova; + size = map->size; + while ((tmp = vfio_find_dma(iommu, iova, size))) { + if (vfio_remove_dma_overlap(iommu, iova, &size, tmp)) { + pr_warn("%s: Error rolling back failed map\n", + __func__); + break; + } + } + } -out_lock: mutex_unlock(&iommu->lock); - if (ret) - kfree(dma); return ret; } @@ -651,9 +800,8 @@ static void vfio_iommu_type1_release(void *iommu_data) while ((node = rb_first(&iommu->dma_list))) { struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node); - vfio_dma_unmap(iommu, dma->iova, dma->npage, dma->prot); - vfio_remove_dma(iommu, dma); - kfree(dma); + size_t size = dma->size; + vfio_remove_dma_overlap(iommu, dma->iova, &size, dma); } iommu_domain_free(iommu->domain); @@ -708,6 +856,7 @@ static long vfio_iommu_type1_ioctl(void *iommu_data, } else if (cmd == VFIO_IOMMU_UNMAP_DMA) { struct vfio_iommu_type1_dma_unmap unmap; + long ret; minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size); @@ -717,7 +866,11 @@ static long vfio_iommu_type1_ioctl(void *iommu_data, if (unmap.argsz < minsz || unmap.flags) return -EINVAL; - return vfio_dma_do_unmap(iommu, &unmap); + ret = vfio_dma_do_unmap(iommu, &unmap); + if (ret) + return ret; + + return copy_to_user((void __user *)arg, &unmap, minsz); } return -ENOTTY; diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 284ff2436829..513600612995 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -361,10 +361,14 @@ struct vfio_iommu_type1_dma_map { #define VFIO_IOMMU_MAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 13) /** - * VFIO_IOMMU_UNMAP_DMA - _IOW(VFIO_TYPE, VFIO_BASE + 14, struct vfio_dma_unmap) + * VFIO_IOMMU_UNMAP_DMA - _IOWR(VFIO_TYPE, VFIO_BASE + 14, + * struct vfio_dma_unmap) * * Unmap IO virtual addresses using the provided struct vfio_dma_unmap. - * Caller sets argsz. + * Caller sets argsz. The actual unmapped size is returned in the size + * field. No guarantee is made to the user that arbitrary unmaps of iova + * or size different from those used in the original mapping call will + * succeed. */ struct vfio_iommu_type1_dma_unmap { __u32 argsz; |