diff options
author | Alexey Kardashevskiy <aik@ozlabs.ru> | 2017-12-13 05:31:31 +0300 |
---|---|---|
committer | Alex Williamson <alex.williamson@redhat.com> | 2017-12-20 20:05:06 +0300 |
commit | a32295c612c57990d17fb0f41e7134394b2f35f6 (patch) | |
tree | b6e59851062efc6a57040c3a8e5312826d9c5a3e | |
parent | dda01f787df9f9e46f1c0bf8aa11f246e300750d (diff) | |
download | linux-a32295c612c57990d17fb0f41e7134394b2f35f6.tar.xz |
vfio-pci: Allow mapping MSIX BAR
By default VFIO disables mapping of MSIX BAR to the userspace as
the userspace may program it in a way allowing spurious interrupts;
instead the userspace uses the VFIO_DEVICE_SET_IRQS ioctl.
In order to eliminate guessing from the userspace about what is
mmapable, VFIO also advertises a sparse list of regions allowed to mmap.
This works fine as long as the system page size equals to the MSIX
alignment requirement which is 4KB. However with a bigger page size
the existing code prohibits mapping non-MSIX parts of a page with MSIX
structures so these parts have to be emulated via slow reads/writes on
a VFIO device fd. If these emulated bits are accessed often, this has
serious impact on performance.
This allows mmap of the entire BAR containing MSIX vector table.
This removes the sparse capability for PCI devices as it becomes useless.
As the userspace needs to know for sure whether mmapping of the MSIX
vector containing data can succeed, this adds a new capability -
VFIO_REGION_INFO_CAP_MSIX_MAPPABLE - which explicitly tells the userspace
that the entire BAR can be mmapped.
This does not touch the MSIX mangling in the BAR read/write handlers as
we are doing this just to enable direct access to non MSIX registers.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
[aw - fixup whitespace, trim function name]
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
-rw-r--r-- | drivers/vfio/pci/vfio_pci.c | 64 | ||||
-rw-r--r-- | include/uapi/linux/vfio.h | 10 |
2 files changed, 18 insertions, 56 deletions
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index de48acd29a84..b0f759476900 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -565,47 +565,15 @@ static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev, return walk.ret; } -static int msix_sparse_mmap_cap(struct vfio_pci_device *vdev, - struct vfio_info_cap *caps) +static int msix_mmappable_cap(struct vfio_pci_device *vdev, + struct vfio_info_cap *caps) { - struct vfio_region_info_cap_sparse_mmap *sparse; - size_t end, size; - int nr_areas = 2, i = 0, ret; - - end = pci_resource_len(vdev->pdev, vdev->msix_bar); - - /* If MSI-X table is aligned to the start or end, only one area */ - if (((vdev->msix_offset & PAGE_MASK) == 0) || - (PAGE_ALIGN(vdev->msix_offset + vdev->msix_size) >= end)) - nr_areas = 1; - - size = sizeof(*sparse) + (nr_areas * sizeof(*sparse->areas)); - - sparse = kzalloc(size, GFP_KERNEL); - if (!sparse) - return -ENOMEM; - - sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP; - sparse->header.version = 1; - sparse->nr_areas = nr_areas; - - if (vdev->msix_offset & PAGE_MASK) { - sparse->areas[i].offset = 0; - sparse->areas[i].size = vdev->msix_offset & PAGE_MASK; - i++; - } - - if (PAGE_ALIGN(vdev->msix_offset + vdev->msix_size) < end) { - sparse->areas[i].offset = PAGE_ALIGN(vdev->msix_offset + - vdev->msix_size); - sparse->areas[i].size = end - sparse->areas[i].offset; - i++; - } - - ret = vfio_info_add_capability(caps, &sparse->header, size); - kfree(sparse); + struct vfio_info_cap_header header = { + .id = VFIO_REGION_INFO_CAP_MSIX_MAPPABLE, + .version = 1 + }; - return ret; + return vfio_info_add_capability(caps, &header, sizeof(header)); } int vfio_pci_register_dev_region(struct vfio_pci_device *vdev, @@ -696,7 +664,7 @@ static long vfio_pci_ioctl(void *device_data, if (vdev->bar_mmap_supported[info.index]) { info.flags |= VFIO_REGION_INFO_FLAG_MMAP; if (info.index == vdev->msix_bar) { - ret = msix_sparse_mmap_cap(vdev, &caps); + ret = msix_mmappable_cap(vdev, &caps); if (ret) return ret; } @@ -1127,22 +1095,6 @@ static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma) if (req_start + req_len > phys_len) return -EINVAL; - if (index == vdev->msix_bar) { - /* - * Disallow mmaps overlapping the MSI-X table; users don't - * get to touch this directly. We could find somewhere - * else to map the overlap, but page granularity is only - * a recommendation, not a requirement, so the user needs - * to know which bits are real. Requiring them to mmap - * around the table makes that clear. - */ - - /* If neither entirely above nor below, then it overlaps */ - if (!(req_start >= vdev->msix_offset + vdev->msix_size || - req_start + req_len <= vdev->msix_offset)) - return -EINVAL; - } - /* * Even though we don't make use of the barmap for the mmap, * we need to request the region and the barmap tracks that. diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index e3301dbd27d4..0d914350f7bf 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -301,6 +301,16 @@ struct vfio_region_info_cap_type { #define VFIO_REGION_SUBTYPE_INTEL_IGD_HOST_CFG (2) #define VFIO_REGION_SUBTYPE_INTEL_IGD_LPC_CFG (3) +/* + * The MSIX mappable capability informs that MSIX data of a BAR can be mmapped + * which allows direct access to non-MSIX registers which happened to be within + * the same system page. + * + * Even though the userspace gets direct access to the MSIX data, the existing + * VFIO_DEVICE_SET_IRQS interface must still be used for MSIX configuration. + */ +#define VFIO_REGION_INFO_CAP_MSIX_MAPPABLE 3 + /** * VFIO_DEVICE_GET_IRQ_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 9, * struct vfio_irq_info) |