diff options
55 files changed, 4350 insertions, 458 deletions
diff --git a/Documentation/driver-api/vfio.rst b/Documentation/driver-api/vfio.rst index 68abc089d6dd..633d11c7fa71 100644 --- a/Documentation/driver-api/vfio.rst +++ b/Documentation/driver-api/vfio.rst @@ -239,6 +239,137 @@ group and can access them as follows:: /* Gratuitous device reset and go... */ ioctl(device, VFIO_DEVICE_RESET); +IOMMUFD and vfio_iommu_type1 +---------------------------- + +IOMMUFD is the new user API to manage I/O page tables from userspace. +It intends to be the portal of delivering advanced userspace DMA +features (nested translation [5]_, PASID [6]_, etc.) while also providing +a backwards compatibility interface for existing VFIO_TYPE1v2_IOMMU use +cases. Eventually the vfio_iommu_type1 driver, as well as the legacy +vfio container and group model is intended to be deprecated. + +The IOMMUFD backwards compatibility interface can be enabled two ways. +In the first method, the kernel can be configured with +CONFIG_IOMMUFD_VFIO_CONTAINER, in which case the IOMMUFD subsystem +transparently provides the entire infrastructure for the VFIO +container and IOMMU backend interfaces. The compatibility mode can +also be accessed if the VFIO container interface, ie. /dev/vfio/vfio is +simply symlink'd to /dev/iommu. Note that at the time of writing, the +compatibility mode is not entirely feature complete relative to +VFIO_TYPE1v2_IOMMU (ex. DMA mapping MMIO) and does not attempt to +provide compatibility to the VFIO_SPAPR_TCE_IOMMU interface. Therefore +it is not generally advisable at this time to switch from native VFIO +implementations to the IOMMUFD compatibility interfaces. + +Long term, VFIO users should migrate to device access through the cdev +interface described below, and native access through the IOMMUFD +provided interfaces. + +VFIO Device cdev +---------------- + +Traditionally user acquires a device fd via VFIO_GROUP_GET_DEVICE_FD +in a VFIO group. + +With CONFIG_VFIO_DEVICE_CDEV=y the user can now acquire a device fd +by directly opening a character device /dev/vfio/devices/vfioX where +"X" is the number allocated uniquely by VFIO for registered devices. +cdev interface does not support noiommu devices, so user should use +the legacy group interface if noiommu is wanted. + +The cdev only works with IOMMUFD. Both VFIO drivers and applications +must adapt to the new cdev security model which requires using +VFIO_DEVICE_BIND_IOMMUFD to claim DMA ownership before starting to +actually use the device. Once BIND succeeds then a VFIO device can +be fully accessed by the user. + +VFIO device cdev doesn't rely on VFIO group/container/iommu drivers. +Hence those modules can be fully compiled out in an environment +where no legacy VFIO application exists. + +So far SPAPR does not support IOMMUFD yet. So it cannot support device +cdev either. + +vfio device cdev access is still bound by IOMMU group semantics, ie. there +can be only one DMA owner for the group. Devices belonging to the same +group can not be bound to multiple iommufd_ctx or shared between native +kernel and vfio bus driver or other driver supporting the driver_managed_dma +flag. A violation of this ownership requirement will fail at the +VFIO_DEVICE_BIND_IOMMUFD ioctl, which gates full device access. + +Device cdev Example +------------------- + +Assume user wants to access PCI device 0000:6a:01.0:: + + $ ls /sys/bus/pci/devices/0000:6a:01.0/vfio-dev/ + vfio0 + +This device is therefore represented as vfio0. The user can verify +its existence:: + + $ ls -l /dev/vfio/devices/vfio0 + crw------- 1 root root 511, 0 Feb 16 01:22 /dev/vfio/devices/vfio0 + $ cat /sys/bus/pci/devices/0000:6a:01.0/vfio-dev/vfio0/dev + 511:0 + $ ls -l /dev/char/511\:0 + lrwxrwxrwx 1 root root 21 Feb 16 01:22 /dev/char/511:0 -> ../vfio/devices/vfio0 + +Then provide the user with access to the device if unprivileged +operation is desired:: + + $ chown user:user /dev/vfio/devices/vfio0 + +Finally the user could get cdev fd by:: + + cdev_fd = open("/dev/vfio/devices/vfio0", O_RDWR); + +An opened cdev_fd doesn't give the user any permission of accessing +the device except binding the cdev_fd to an iommufd. After that point +then the device is fully accessible including attaching it to an +IOMMUFD IOAS/HWPT to enable userspace DMA:: + + struct vfio_device_bind_iommufd bind = { + .argsz = sizeof(bind), + .flags = 0, + }; + struct iommu_ioas_alloc alloc_data = { + .size = sizeof(alloc_data), + .flags = 0, + }; + struct vfio_device_attach_iommufd_pt attach_data = { + .argsz = sizeof(attach_data), + .flags = 0, + }; + struct iommu_ioas_map map = { + .size = sizeof(map), + .flags = IOMMU_IOAS_MAP_READABLE | + IOMMU_IOAS_MAP_WRITEABLE | + IOMMU_IOAS_MAP_FIXED_IOVA, + .__reserved = 0, + }; + + iommufd = open("/dev/iommu", O_RDWR); + + bind.iommufd = iommufd; + ioctl(cdev_fd, VFIO_DEVICE_BIND_IOMMUFD, &bind); + + ioctl(iommufd, IOMMU_IOAS_ALLOC, &alloc_data); + attach_data.pt_id = alloc_data.out_ioas_id; + ioctl(cdev_fd, VFIO_DEVICE_ATTACH_IOMMUFD_PT, &attach_data); + + /* Allocate some space and setup a DMA mapping */ + map.user_va = (int64_t)mmap(0, 1024 * 1024, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); + map.iova = 0; /* 1MB starting at 0x0 from device view */ + map.length = 1024 * 1024; + map.ioas_id = alloc_data.out_ioas_id;; + + ioctl(iommufd, IOMMU_IOAS_MAP, &map); + + /* Other device operations as stated in "VFIO Usage Example" */ + VFIO User API ------------------------------------------------------------------------------- @@ -279,6 +410,7 @@ similar to a file operations structure:: struct iommufd_ctx *ictx, u32 *out_device_id); void (*unbind_iommufd)(struct vfio_device *vdev); int (*attach_ioas)(struct vfio_device *vdev, u32 *pt_id); + void (*detach_ioas)(struct vfio_device *vdev); int (*open_device)(struct vfio_device *vdev); void (*close_device)(struct vfio_device *vdev); ssize_t (*read)(struct vfio_device *vdev, char __user *buf, @@ -315,9 +447,10 @@ container_of(). - The [un]bind_iommufd callbacks are issued when the device is bound to and unbound from iommufd. - - The attach_ioas callback is issued when the device is attached to an - IOAS managed by the bound iommufd. The attached IOAS is automatically - detached when the device is unbound from iommufd. + - The [de]attach_ioas callback is issued when the device is attached to + and detached from an IOAS managed by the bound iommufd. However, the + attached IOAS can also be automatically detached when the device is + unbound from iommufd. - The read/write/mmap callbacks implement the device region access defined by the device's own VFIO_DEVICE_GET_REGION_INFO ioctl. @@ -564,3 +697,11 @@ This implementation has some specifics: \-0d.1 00:1e.0 PCI bridge: Intel Corporation 82801 PCI Bridge (rev 90) + +.. [5] Nested translation is an IOMMU feature which supports two stage + address translations. This improves the address translation efficiency + in IOMMU virtualization. + +.. [6] PASID stands for Process Address Space ID, introduced by PCI + Express. It is a prerequisite for Shared Virtual Addressing (SVA) + and Scalable I/O Virtualization (Scalable IOV). diff --git a/Documentation/networking/device_drivers/ethernet/amd/pds_vfio_pci.rst b/Documentation/networking/device_drivers/ethernet/amd/pds_vfio_pci.rst new file mode 100644 index 000000000000..7a6bc848a2b2 --- /dev/null +++ b/Documentation/networking/device_drivers/ethernet/amd/pds_vfio_pci.rst @@ -0,0 +1,79 @@ +.. SPDX-License-Identifier: GPL-2.0+ +.. note: can be edited and viewed with /usr/bin/formiko-vim + +========================================================== +PCI VFIO driver for the AMD/Pensando(R) DSC adapter family +========================================================== + +AMD/Pensando Linux VFIO PCI Device Driver +Copyright(c) 2023 Advanced Micro Devices, Inc. + +Overview +======== + +The ``pds-vfio-pci`` module is a PCI driver that supports Live Migration +capable Virtual Function (VF) devices in the DSC hardware. + +Using the device +================ + +The pds-vfio-pci device is enabled via multiple configuration steps and +depends on the ``pds_core`` driver to create and enable SR-IOV Virtual +Function devices. + +Shown below are the steps to bind the driver to a VF and also to the +associated auxiliary device created by the ``pds_core`` driver. This +example assumes the pds_core and pds-vfio-pci modules are already +loaded. + +.. code-block:: bash + :name: example-setup-script + + #!/bin/bash + + PF_BUS="0000:60" + PF_BDF="0000:60:00.0" + VF_BDF="0000:60:00.1" + + # Prevent non-vfio VF driver from probing the VF device + echo 0 > /sys/class/pci_bus/$PF_BUS/device/$PF_BDF/sriov_drivers_autoprobe + + # Create single VF for Live Migration via pds_core + echo 1 > /sys/bus/pci/drivers/pds_core/$PF_BDF/sriov_numvfs + + # Allow the VF to be bound to the pds-vfio-pci driver + echo "pds-vfio-pci" > /sys/class/pci_bus/$PF_BUS/device/$VF_BDF/driver_override + + # Bind the VF to the pds-vfio-pci driver + echo "$VF_BDF" > /sys/bus/pci/drivers/pds-vfio-pci/bind + +After performing the steps above, a file in /dev/vfio/<iommu_group> +should have been created. + + +Enabling the driver +=================== + +The driver is enabled via the standard kernel configuration system, +using the make command:: + + make oldconfig/menuconfig/etc. + +The driver is located in the menu structure at: + + -> Device Drivers + -> VFIO Non-Privileged userspace driver framework + -> VFIO support for PDS PCI devices + +Support +======= + +For general Linux networking support, please use the netdev mailing +list, which is monitored by Pensando personnel:: + + netdev@vger.kernel.org + +For more specific support needs, please use the Pensando driver support +email:: + + drivers@pensando.io diff --git a/Documentation/networking/device_drivers/ethernet/index.rst b/Documentation/networking/device_drivers/ethernet/index.rst index 94ecb67c0885..9827e816084b 100644 --- a/Documentation/networking/device_drivers/ethernet/index.rst +++ b/Documentation/networking/device_drivers/ethernet/index.rst @@ -16,6 +16,7 @@ Contents: altera/altera_tse amd/pds_core amd/pds_vdpa + amd/pds_vfio_pci aquantia/atlantic chelsio/cxgb cirrus/cs89x0 diff --git a/Documentation/virt/kvm/devices/vfio.rst b/Documentation/virt/kvm/devices/vfio.rst index 08b544212638..c549143bb891 100644 --- a/Documentation/virt/kvm/devices/vfio.rst +++ b/Documentation/virt/kvm/devices/vfio.rst @@ -9,22 +9,34 @@ Device types supported: - KVM_DEV_TYPE_VFIO Only one VFIO instance may be created per VM. The created device -tracks VFIO groups in use by the VM and features of those groups -important to the correctness and acceleration of the VM. As groups -are enabled and disabled for use by the VM, KVM should be updated -about their presence. When registered with KVM, a reference to the -VFIO-group is held by KVM. +tracks VFIO files (group or device) in use by the VM and features +of those groups/devices important to the correctness and acceleration +of the VM. As groups/devices are enabled and disabled for use by the +VM, KVM should be updated about their presence. When registered with +KVM, a reference to the VFIO file is held by KVM. Groups: - KVM_DEV_VFIO_GROUP - -KVM_DEV_VFIO_GROUP attributes: - KVM_DEV_VFIO_GROUP_ADD: Add a VFIO group to VFIO-KVM device tracking - kvm_device_attr.addr points to an int32_t file descriptor - for the VFIO group. - KVM_DEV_VFIO_GROUP_DEL: Remove a VFIO group from VFIO-KVM device tracking - kvm_device_attr.addr points to an int32_t file descriptor - for the VFIO group. + KVM_DEV_VFIO_FILE + alias: KVM_DEV_VFIO_GROUP + +KVM_DEV_VFIO_FILE attributes: + KVM_DEV_VFIO_FILE_ADD: Add a VFIO file (group/device) to VFIO-KVM device + tracking + + kvm_device_attr.addr points to an int32_t file descriptor for the + VFIO file. + + KVM_DEV_VFIO_FILE_DEL: Remove a VFIO file (group/device) from VFIO-KVM + device tracking + + kvm_device_attr.addr points to an int32_t file descriptor for the + VFIO file. + +KVM_DEV_VFIO_GROUP (legacy kvm device group restricted to the handling of VFIO group fd): + KVM_DEV_VFIO_GROUP_ADD: same as KVM_DEV_VFIO_FILE_ADD for group fd only + + KVM_DEV_VFIO_GROUP_DEL: same as KVM_DEV_VFIO_FILE_DEL for group fd only + KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE: attaches a guest visible TCE table allocated by sPAPR KVM. kvm_device_attr.addr points to a struct:: @@ -40,7 +52,10 @@ KVM_DEV_VFIO_GROUP attributes: - @tablefd is a file descriptor for a TCE table allocated via KVM_CREATE_SPAPR_TCE. -The GROUP_ADD operation above should be invoked prior to accessing the +The FILE/GROUP_ADD operation above should be invoked prior to accessing the device file descriptor via VFIO_GROUP_GET_DEVICE_FD in order to support drivers which require a kvm pointer to be set in their .open_device() -callback. +callback. It is the same for device file descriptor via character device +open which gets device access via VFIO_DEVICE_BIND_IOMMUFD. For such file +descriptors, FILE_ADD should be invoked before VFIO_DEVICE_BIND_IOMMUFD +to support the drivers mentioned in prior sentence as well. diff --git a/MAINTAINERS b/MAINTAINERS index 43050d8fdfbc..4e3419c04f27 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -22482,6 +22482,13 @@ S: Maintained P: Documentation/driver-api/vfio-pci-device-specific-driver-acceptance.rst F: drivers/vfio/pci/*/ +VFIO PDS PCI DRIVER +M: Brett Creeley <brett.creeley@amd.com> +L: kvm@vger.kernel.org +S: Maintained +F: Documentation/networking/device_drivers/ethernet/amd/pds_vfio_pci.rst +F: drivers/vfio/pci/pds/ + VFIO PLATFORM DRIVER M: Eric Auger <eric.auger@redhat.com> L: kvm@vger.kernel.org diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c index de675d799c7d..9cd9e9da60dd 100644 --- a/drivers/gpu/drm/i915/gvt/kvmgt.c +++ b/drivers/gpu/drm/i915/gvt/kvmgt.c @@ -1474,6 +1474,7 @@ static const struct vfio_device_ops intel_vgpu_dev_ops = { .bind_iommufd = vfio_iommufd_emulated_bind, .unbind_iommufd = vfio_iommufd_emulated_unbind, .attach_ioas = vfio_iommufd_emulated_attach_ioas, + .detach_ioas = vfio_iommufd_emulated_detach_ioas, }; static int intel_vgpu_probe(struct mdev_device *mdev) diff --git a/drivers/iommu/iommufd/Kconfig b/drivers/iommu/iommufd/Kconfig index ada693ea51a7..99d4b075df49 100644 --- a/drivers/iommu/iommufd/Kconfig +++ b/drivers/iommu/iommufd/Kconfig @@ -14,8 +14,8 @@ config IOMMUFD if IOMMUFD config IOMMUFD_VFIO_CONTAINER bool "IOMMUFD provides the VFIO container /dev/vfio/vfio" - depends on VFIO && !VFIO_CONTAINER - default VFIO && !VFIO_CONTAINER + depends on VFIO_GROUP && !VFIO_CONTAINER + default VFIO_GROUP && !VFIO_CONTAINER help IOMMUFD will provide /dev/vfio/vfio instead of VFIO. This relies on IOMMUFD providing compatibility emulation to give the same ioctls. diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index ed2937a4e196..1d49aad48c67 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -99,6 +99,36 @@ out_group_put: EXPORT_SYMBOL_NS_GPL(iommufd_device_bind, IOMMUFD); /** + * iommufd_ctx_has_group - True if any device within the group is bound + * to the ictx + * @ictx: iommufd file descriptor + * @group: Pointer to a physical iommu_group struct + * + * True if any device within the group has been bound to this ictx, ex. via + * iommufd_device_bind(), therefore implying ictx ownership of the group. + */ +bool iommufd_ctx_has_group(struct iommufd_ctx *ictx, struct iommu_group *group) +{ + struct iommufd_object *obj; + unsigned long index; + + if (!ictx || !group) + return false; + + xa_lock(&ictx->objects); + xa_for_each(&ictx->objects, index, obj) { + if (obj->type == IOMMUFD_OBJ_DEVICE && + container_of(obj, struct iommufd_device, obj)->group == group) { + xa_unlock(&ictx->objects); + return true; + } + } + xa_unlock(&ictx->objects); + return false; +} +EXPORT_SYMBOL_NS_GPL(iommufd_ctx_has_group, IOMMUFD); + +/** * iommufd_device_unbind - Undo iommufd_device_bind() * @idev: Device returned by iommufd_device_bind() * @@ -113,6 +143,18 @@ void iommufd_device_unbind(struct iommufd_device *idev) } EXPORT_SYMBOL_NS_GPL(iommufd_device_unbind, IOMMUFD); +struct iommufd_ctx *iommufd_device_to_ictx(struct iommufd_device *idev) +{ + return idev->ictx; +} +EXPORT_SYMBOL_NS_GPL(iommufd_device_to_ictx, IOMMUFD); + +u32 iommufd_device_to_id(struct iommufd_device *idev) +{ + return idev->obj.id; +} +EXPORT_SYMBOL_NS_GPL(iommufd_device_to_id, IOMMUFD); + static int iommufd_device_setup_msi(struct iommufd_device *idev, struct iommufd_hw_pagetable *hwpt, phys_addr_t sw_msi_start) @@ -441,6 +483,7 @@ iommufd_access_create(struct iommufd_ctx *ictx, iommufd_ctx_get(ictx); iommufd_object_finalize(ictx, &access->obj); *id = access->obj.id; + mutex_init(&access->ioas_lock); return access; } EXPORT_SYMBOL_NS_GPL(iommufd_access_create, IOMMUFD); @@ -457,26 +500,60 @@ void iommufd_access_destroy(struct iommufd_access *access) } EXPORT_SYMBOL_NS_GPL(iommufd_access_destroy, IOMMUFD); +void iommufd_access_detach(struct iommufd_access *access) +{ + struct iommufd_ioas *cur_ioas = access->ioas; + + mutex_lock(&access->ioas_lock); + if (WARN_ON(!access->ioas)) + goto out; + /* + * Set ioas to NULL to block any further iommufd_access_pin_pages(). + * iommufd_access_unpin_pages() can continue using access->ioas_unpin. + */ + access->ioas = NULL; + + if (access->ops->unmap) { + mutex_unlock(&access->ioas_lock); + access->ops->unmap(access->data, 0, ULONG_MAX); + mutex_lock(&access->ioas_lock); + } + iopt_remove_access(&cur_ioas->iopt, access); + refcount_dec(&cur_ioas->obj.users); +out: + access->ioas_unpin = NULL; + mutex_unlock(&access->ioas_lock); +} +EXPORT_SYMBOL_NS_GPL(iommufd_access_detach, IOMMUFD); + int iommufd_access_attach(struct iommufd_access *access, u32 ioas_id) { struct iommufd_ioas *new_ioas; int rc = 0; - if (access->ioas) + mutex_lock(&access->ioas_lock); + if (WARN_ON(access->ioas || access->ioas_unpin)) { + mutex_unlock(&access->ioas_lock); return -EINVAL; + } new_ioas = iommufd_get_ioas(access->ictx, ioas_id); - if (IS_ERR(new_ioas)) + if (IS_ERR(new_ioas)) { + mutex_unlock(&access->ioas_lock); return PTR_ERR(new_ioas); + } rc = iopt_add_access(&new_ioas->iopt, access); if (rc) { + mutex_unlock(&access->ioas_lock); iommufd_put_object(&new_ioas->obj); return rc; } iommufd_ref_to_users(&new_ioas->obj); access->ioas = new_ioas; + access->ioas_unpin = new_ioas; + mutex_unlock(&access->ioas_lock); return 0; } EXPORT_SYMBOL_NS_GPL(iommufd_access_attach, IOMMUFD); @@ -531,8 +608,8 @@ void iommufd_access_notify_unmap(struct io_pagetable *iopt, unsigned long iova, void iommufd_access_unpin_pages(struct iommufd_access *access, unsigned long iova, unsigned long length) { - struct io_pagetable *iopt = &access->ioas->iopt; struct iopt_area_contig_iter iter; + struct io_pagetable *iopt; unsigned long last_iova; struct iopt_area *area; @@ -540,6 +617,17 @@ void iommufd_access_unpin_pages(struct iommufd_access *access, WARN_ON(check_add_overflow(iova, length - 1, &last_iova))) return; + mutex_lock(&access->ioas_lock); + /* + * The driver must be doing something wrong if it calls this before an + * iommufd_access_attach() or after an iommufd_access_detach(). + */ + if (WARN_ON(!access->ioas_unpin)) { + mutex_unlock(&access->ioas_lock); + return; + } + iopt = &access->ioas_unpin->iopt; + down_read(&iopt->iova_rwsem); iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) iopt_area_remove_access( @@ -549,6 +637,7 @@ void iommufd_access_unpin_pages(struct iommufd_access *access, min(last_iova, iopt_area_last_iova(area)))); WARN_ON(!iopt_area_contig_done(&iter)); up_read(&iopt->iova_rwsem); + mutex_unlock(&access->ioas_lock); } EXPORT_SYMBOL_NS_GPL(iommufd_access_unpin_pages, IOMMUFD); @@ -594,8 +683,8 @@ int iommufd_access_pin_pages(struct iommufd_access *access, unsigned long iova, unsigned long length, struct page **out_pages, unsigned int flags) { - struct io_pagetable *iopt = &access->ioas->iopt; struct iopt_area_contig_iter iter; + struct io_pagetable *iopt; unsigned long last_iova; struct iopt_area *area; int rc; @@ -610,6 +699,13 @@ int iommufd_access_pin_pages(struct iommufd_access *access, unsigned long iova, if (check_add_overflow(iova, length - 1, &last_iova)) return -EOVERFLOW; + mutex_lock(&access->ioas_lock); + if (!access->ioas) { + mutex_unlock(&access->ioas_lock); + return -ENOENT; + } + iopt = &access->ioas->iopt; + down_read(&iopt->iova_rwsem); iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) { unsigned long last = min(last_iova, iopt_area_last_iova(area)); @@ -640,6 +736,7 @@ int iommufd_access_pin_pages(struct iommufd_access *access, unsigned long iova, } up_read(&iopt->iova_rwsem); + mutex_unlock(&access->ioas_lock); return 0; err_remove: @@ -654,6 +751,7 @@ err_remove: iopt_area_last_iova(area)))); } up_read(&iopt->iova_rwsem); + mutex_unlock(&access->ioas_lock); return rc; } EXPORT_SYMBOL_NS_GPL(iommufd_access_pin_pages, IOMMUFD); @@ -673,8 +771,8 @@ EXPORT_SYMBOL_NS_GPL(iommufd_access_pin_pages, IOMMUFD); int iommufd_access_rw(struct iommufd_access *access, unsigned long iova, void *data, size_t length, unsigned int flags) { - struct io_pagetable *iopt = &access->ioas->iopt; struct iopt_area_contig_iter iter; + struct io_pagetable *iopt; struct iopt_area *area; unsigned long last_iova; int rc; @@ -684,6 +782,13 @@ int iommufd_access_rw(struct iommufd_access *access, unsigned long iova, if (check_add_overflow(iova, length - 1, &last_iova)) return -EOVERFLOW; + mutex_lock(&access->ioas_lock); + if (!access->ioas) { + mutex_unlock(&access->ioas_lock); + return -ENOENT; + } + iopt = &access->ioas->iopt; + down_read(&iopt->iova_rwsem); iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) { unsigned long last = min(last_iova, iopt_area_last_iova(area)); @@ -710,6 +815,7 @@ int iommufd_access_rw(struct iommufd_access *access, unsigned long iova, rc = -ENOENT; err_out: up_read(&iopt->iova_rwsem); + mutex_unlock(&access->ioas_lock); return rc; } EXPORT_SYMBOL_NS_GPL(iommufd_access_rw, IOMMUFD); diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index f9790983699c..0349d7aea59a 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -296,6 +296,8 @@ struct iommufd_access { struct iommufd_object obj; struct iommufd_ctx *ictx; struct iommufd_ioas *ioas; + struct iommufd_ioas *ioas_unpin; + struct mutex ioas_lock; const struct iommufd_access_ops *ops; void *data; unsigned long iova_alignment; diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index 4cf5f73f2708..5f34be2a1f7f 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -50,7 +50,7 @@ struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx, * before calling iommufd_object_finalize(). */ rc = xa_alloc(&ictx->objects, &obj->id, XA_ZERO_ENTRY, - xa_limit_32b, GFP_KERNEL_ACCOUNT); + xa_limit_31b, GFP_KERNEL_ACCOUNT); if (rc) goto out_free; return obj; @@ -418,6 +418,30 @@ struct iommufd_ctx *iommufd_ctx_from_file(struct file *file) EXPORT_SYMBOL_NS_GPL(iommufd_ctx_from_file, IOMMUFD); /** + * iommufd_ctx_from_fd - Acquires a reference to the iommufd context + * @fd: File descriptor to obtain the reference from + * + * Returns a pointer to the iommufd_ctx, otherwise ERR_PTR. On success + * the caller is responsible to call iommufd_ctx_put(). + */ +struct iommufd_ctx *iommufd_ctx_from_fd(int fd) +{ + struct file *file; + + file = fget(fd); + if (!file) + return ERR_PTR(-EBADF); + + if (file->f_op != &iommufd_fops) { + fput(file); + return ERR_PTR(-EBADFD); + } + /* fget is the same as iommufd_ctx_get() */ + return file->private_data; +} +EXPORT_SYMBOL_NS_GPL(iommufd_ctx_from_fd, IOMMUFD); + +/** * iommufd_ctx_put - Put back a reference * @ictx: Context to put back */ diff --git a/drivers/iommu/iommufd/vfio_compat.c b/drivers/iommu/iommufd/vfio_compat.c index fe02517c73cc..6c810bf80f99 100644 --- a/drivers/iommu/iommufd/vfio_compat.c +++ b/drivers/iommu/iommufd/vfio_compat.c @@ -483,6 +483,8 @@ static int iommufd_vfio_iommu_get_info(struct iommufd_ctx *ictx, rc = cap_size; goto out_put; } + cap_size = ALIGN(cap_size, sizeof(u64)); + if (last_cap && info.argsz >= total_cap_size && put_user(total_cap_size, &last_cap->next)) { rc = -EFAULT; diff --git a/drivers/net/ethernet/amd/pds_core/auxbus.c b/drivers/net/ethernet/amd/pds_core/auxbus.c index 6787a5fae908..11c23a7f3172 100644 --- a/drivers/net/ethernet/amd/pds_core/auxbus.c +++ b/drivers/net/ethernet/amd/pds_core/auxbus.c @@ -8,24 +8,19 @@ /** * pds_client_register - Link the client to the firmware - * @pf_pdev: ptr to the PF driver struct + * @pf: ptr to the PF driver's private data struct * @devname: name that includes service into, e.g. pds_core.vDPA * * Return: positive client ID (ci) on success, or * negative for error */ -int pds_client_register(struct pci_dev *pf_pdev, char *devname) +int pds_client_register(struct pdsc *pf, char *devname) { union pds_core_adminq_comp comp = {}; union pds_core_adminq_cmd cmd = {}; - struct pdsc *pf; int err; u16 ci; - pf = pci_get_drvdata(pf_pdev); - if (pf->state) - return -ENXIO; - cmd.client_reg.opcode = PDS_AQ_CMD_CLIENT_REG; strscpy(cmd.client_reg.devname, devname, sizeof(cmd.client_reg.devname)); @@ -53,23 +48,18 @@ EXPORT_SYMBOL_GPL(pds_client_register); /** * pds_client_unregister - Unlink the client from the firmware - * @pf_pdev: ptr to the PF driver struct + * @pf: ptr to the PF driver's private data struct * @client_id: id returned from pds_client_register() * * Return: 0 on success, or * negative for error */ -int pds_client_unregister(struct pci_dev *pf_pdev, u16 client_id) +int pds_client_unregister(struct pdsc *pf, u16 client_id) { union pds_core_adminq_comp comp = {}; union pds_core_adminq_cmd cmd = {}; - struct pdsc *pf; int err; - pf = pci_get_drvdata(pf_pdev); - if (pf->state) - return -ENXIO; - cmd.client_unreg.opcode = PDS_AQ_CMD_CLIENT_UNREG; cmd.client_unreg.client_id = cpu_to_le16(client_id); @@ -198,7 +188,7 @@ int pdsc_auxbus_dev_del(struct pdsc *cf, struct pdsc *pf) padev = pf->vfs[cf->vf_id].padev; if (padev) { - pds_client_unregister(pf->pdev, padev->client_id); + pds_client_unregister(pf, padev->client_id); auxiliary_device_delete(&padev->aux_dev); auxiliary_device_uninit(&padev->aux_dev); padev->client_id = 0; @@ -243,7 +233,7 @@ int pdsc_auxbus_dev_add(struct pdsc *cf, struct pdsc *pf) */ snprintf(devname, sizeof(devname), "%s.%s.%d", PDS_CORE_DRV_NAME, pf->viftype_status[vt].name, cf->uid); - client_id = pds_client_register(pf->pdev, devname); + client_id = pds_client_register(pf, devname); if (client_id < 0) { err = client_id; goto out_unlock; @@ -252,7 +242,7 @@ int pdsc_auxbus_dev_add(struct pdsc *cf, struct pdsc *pf) padev = pdsc_auxbus_dev_register(cf, pf, client_id, pf->viftype_status[vt].name); if (IS_ERR(padev)) { - pds_client_unregister(pf->pdev, client_id); + pds_client_unregister(pf, client_id); err = PTR_ERR(padev); goto out_unlock; } diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c index 5b53b94f13c7..cba4971618ff 100644 --- a/drivers/s390/cio/vfio_ccw_ops.c +++ b/drivers/s390/cio/vfio_ccw_ops.c @@ -632,6 +632,7 @@ static const struct vfio_device_ops vfio_ccw_dev_ops = { .bind_iommufd = vfio_iommufd_emulated_bind, .unbind_iommufd = vfio_iommufd_emulated_unbind, .attach_ioas = vfio_iommufd_emulated_attach_ioas, + .detach_ioas = vfio_iommufd_emulated_detach_ioas, }; struct mdev_driver vfio_ccw_mdev_driver = { diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c index 0509f80622cd..4db538a55192 100644 --- a/drivers/s390/crypto/vfio_ap_ops.c +++ b/drivers/s390/crypto/vfio_ap_ops.c @@ -2020,6 +2020,7 @@ static const struct vfio_device_ops vfio_ap_matrix_dev_ops = { .bind_iommufd = vfio_iommufd_emulated_bind, .unbind_iommufd = vfio_iommufd_emulated_unbind, .attach_ioas = vfio_iommufd_emulated_attach_ioas, + .detach_ioas = vfio_iommufd_emulated_detach_ioas, .request = vfio_ap_mdev_request }; diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig index aba36f5be4ec..6bda6dbb4878 100644 --- a/drivers/vfio/Kconfig +++ b/drivers/vfio/Kconfig @@ -4,6 +4,8 @@ menuconfig VFIO select IOMMU_API depends on IOMMUFD || !IOMMUFD select INTERVAL_TREE + select VFIO_GROUP if SPAPR_TCE_IOMMU || IOMMUFD=n + select VFIO_DEVICE_CDEV if !VFIO_GROUP select VFIO_CONTAINER if IOMMUFD=n help VFIO provides a framework for secure userspace device drivers. @@ -12,9 +14,33 @@ menuconfig VFIO If you don't know what to do here, say N. if VFIO +config VFIO_DEVICE_CDEV + bool "Support for the VFIO cdev /dev/vfio/devices/vfioX" + depends on IOMMUFD && !SPAPR_TCE_IOMMU + default !VFIO_GROUP + help + The VFIO device cdev is another way for userspace to get device + access. Userspace gets device fd by opening device cdev under + /dev/vfio/devices/vfioX, and then bind the device fd with an iommufd + to set up secure DMA context for device access. This interface does + not support noiommu. + + If you don't know what to do here, say N. + +config VFIO_GROUP + bool "Support for the VFIO group /dev/vfio/$group_id" + default y + help + VFIO group support provides the traditional model for accessing + devices through VFIO and is used by the majority of userspace + applications and drivers making use of VFIO. + + If you don't know what to do here, say Y. + config VFIO_CONTAINER bool "Support for the VFIO container /dev/vfio/vfio" select VFIO_IOMMU_TYPE1 if MMU && (X86 || S390 || ARM || ARM64) + depends on VFIO_GROUP default y help The VFIO container is the classic interface to VFIO for establishing @@ -36,6 +62,7 @@ endif config VFIO_NOIOMMU bool "VFIO No-IOMMU support" + depends on VFIO_GROUP help VFIO is built on the ability to isolate devices using the IOMMU. Only with an IOMMU can userspace access to DMA capable devices be diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile index 66f418aef5a9..c82ea032d352 100644 --- a/drivers/vfio/Makefile +++ b/drivers/vfio/Makefile @@ -2,8 +2,9 @@ obj-$(CONFIG_VFIO) += vfio.o vfio-y += vfio_main.o \ - group.o \ iova_bitmap.o +vfio-$(CONFIG_VFIO_DEVICE_CDEV) += device_cdev.o +vfio-$(CONFIG_VFIO_GROUP) += group.o vfio-$(CONFIG_IOMMUFD) += iommufd.o vfio-$(CONFIG_VFIO_CONTAINER) += container.o vfio-$(CONFIG_VFIO_VIRQFD) += virqfd.o diff --git a/drivers/vfio/cdx/main.c b/drivers/vfio/cdx/main.c index c376a69d2db2..de56686581ae 100644 --- a/drivers/vfio/cdx/main.c +++ b/drivers/vfio/cdx/main.c @@ -223,7 +223,6 @@ static struct cdx_driver vfio_cdx_driver = { .match_id_table = vfio_cdx_table, .driver = { .name = "vfio-cdx", - .owner = THIS_MODULE, }, .driver_managed_dma = true, }; diff --git a/drivers/vfio/device_cdev.c b/drivers/vfio/device_cdev.c new file mode 100644 index 000000000000..e75da0a70d1f --- /dev/null +++ b/drivers/vfio/device_cdev.c @@ -0,0 +1,228 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2023 Intel Corporation. + */ +#include <linux/vfio.h> +#include <linux/iommufd.h> + +#include "vfio.h" + +static dev_t device_devt; + +void vfio_init_device_cdev(struct vfio_device *device) +{ + device->device.devt = MKDEV(MAJOR(device_devt), device->index); + cdev_init(&device->cdev, &vfio_device_fops); + device->cdev.owner = THIS_MODULE; +} + +/* + * device access via the fd opened by this function is blocked until + * .open_device() is called successfully during BIND_IOMMUFD. + */ +int vfio_device_fops_cdev_open(struct inode *inode, struct file *filep) +{ + struct vfio_device *device = container_of(inode->i_cdev, + struct vfio_device, cdev); + struct vfio_device_file *df; + int ret; + + /* Paired with the put in vfio_device_fops_release() */ + if (!vfio_device_try_get_registration(device)) + return -ENODEV; + + df = vfio_allocate_device_file(device); + if (IS_ERR(df)) { + ret = PTR_ERR(df); + goto err_put_registration; + } + + filep->private_data = df; + + return 0; + +err_put_registration: + vfio_device_put_registration(device); + return ret; +} + +static void vfio_df_get_kvm_safe(struct vfio_device_file *df) +{ + spin_lock(&df->kvm_ref_lock); + vfio_device_get_kvm_safe(df->device, df->kvm); + spin_unlock(&df->kvm_ref_lock); +} + +long vfio_df_ioctl_bind_iommufd(struct vfio_device_file *df, + struct vfio_device_bind_iommufd __user *arg) +{ + struct vfio_device *device = df->device; + struct vfio_device_bind_iommufd bind; + unsigned long minsz; + int ret; + + static_assert(__same_type(arg->out_devid, df->devid)); + + minsz = offsetofend(struct vfio_device_bind_iommufd, out_devid); + + if (copy_from_user(&bind, arg, minsz)) + return -EFAULT; + + if (bind.argsz < minsz || bind.flags || bind.iommufd < 0) + return -EINVAL; + + /* BIND_IOMMUFD only allowed for cdev fds */ + if (df->group) + return -EINVAL; + + ret = vfio_device_block_group(device); + if (ret) + return ret; + + mutex_lock(&device->dev_set->lock); + /* one device cannot be bound twice */ + if (df->access_granted) { + ret = -EINVAL; + goto out_unlock; + } + + df->iommufd = iommufd_ctx_from_fd(bind.iommufd); + if (IS_ERR(df->iommufd)) { + ret = PTR_ERR(df->iommufd); + df->iommufd = NULL; + goto out_unlock; + } + + /* + * Before the device open, get the KVM pointer currently + * associated with the device file (if there is) and obtain + * a reference. This reference is held until device closed. + * Save the pointer in the device for use by drivers. + */ + vfio_df_get_kvm_safe(df); + + ret = vfio_df_open(df); + if (ret) + goto out_put_kvm; + + ret = copy_to_user(&arg->out_devid, &df->devid, + sizeof(df->devid)) ? -EFAULT : 0; + if (ret) + goto out_close_device; + + device->cdev_opened = true; + /* + * Paired with smp_load_acquire() in vfio_device_fops::ioctl/ + * read/write/mmap + */ + smp_store_release(&df->access_granted, true); + mutex_unlock(&device->dev_set->lock); + return 0; + +out_close_device: + vfio_df_close(df); +out_put_kvm: + vfio_device_put_kvm(device); + iommufd_ctx_put(df->iommufd); + df->iommufd = NULL; +out_unlock: + mutex_unlock(&device->dev_set->lock); + vfio_device_unblock_group(device); + return ret; +} + +void vfio_df_unbind_iommufd(struct vfio_device_file *df) +{ + struct vfio_device *device = df->device; + + /* + * In the time of close, there is no contention with another one + * changing this flag. So read df->access_granted without lock + * and no smp_load_acquire() is ok. + */ + if (!df->access_granted) + return; + + mutex_lock(&device->dev_set->lock); + vfio_df_close(df); + vfio_device_put_kvm(device); + iommufd_ctx_put(df->iommufd); + device->cdev_opened = false; + mutex_unlock(&device->dev_set->lock); + vfio_device_unblock_group(device); +} + +int vfio_df_ioctl_attach_pt(struct vfio_device_file *df, + struct vfio_device_attach_iommufd_pt __user *arg) +{ + struct vfio_device *device = df->device; + struct vfio_device_attach_iommufd_pt attach; + unsigned long minsz; + int ret; + + minsz = offsetofend(struct vfio_device_attach_iommufd_pt, pt_id); + + if (copy_from_user(&attach, arg, minsz)) + return -EFAULT; + + if (attach.argsz < minsz || attach.flags) + return -EINVAL; + + mutex_lock(&device->dev_set->lock); + ret = device->ops->attach_ioas(device, &attach.pt_id); + if (ret) + goto out_unlock; + + if (copy_to_user(&arg->pt_id, &attach.pt_id, sizeof(attach.pt_id))) { + ret = -EFAULT; + goto out_detach; + } + mutex_unlock(&device->dev_set->lock); + + return 0; + +out_detach: + device->ops->detach_ioas(device); +out_unlock: + mutex_unlock(&device->dev_set->lock); + return ret; +} + +int vfio_df_ioctl_detach_pt(struct vfio_device_file *df, + struct vfio_device_detach_iommufd_pt __user *arg) +{ + struct vfio_device *device = df->device; + struct vfio_device_detach_iommufd_pt detach; + unsigned long minsz; + + minsz = offsetofend(struct vfio_device_detach_iommufd_pt, flags); + + if (copy_from_user(&detach, arg, minsz)) + return -EFAULT; + + if (detach.argsz < minsz || detach.flags) + return -EINVAL; + + mutex_lock(&device->dev_set->lock); + device->ops->detach_ioas(device); + mutex_unlock(&device->dev_set->lock); + + return 0; +} + +static char *vfio_device_devnode(const struct device *dev, umode_t *mode) +{ + return kasprintf(GFP_KERNEL, "vfio/devices/%s", dev_name(dev)); +} + +int vfio_cdev_init(struct class *device_class) +{ + device_class->devnode = vfio_device_devnode; + return alloc_chrdev_region(&device_devt, 0, + MINORMASK + 1, "vfio-dev"); +} + +void vfio_cdev_cleanup(void) +{ + unregister_chrdev_region(device_devt, MINORMASK + 1); +} diff --git a/drivers/vfio/fsl-mc/vfio_fsl_mc.c b/drivers/vfio/fsl-mc/vfio_fsl_mc.c index f2140e94d41e..f65d91c01f2e 100644 --- a/drivers/vfio/fsl-mc/vfio_fsl_mc.c +++ b/drivers/vfio/fsl-mc/vfio_fsl_mc.c @@ -593,6 +593,7 @@ static const struct vfio_device_ops vfio_fsl_mc_ops = { .bind_iommufd = vfio_iommufd_physical_bind, .unbind_iommufd = vfio_iommufd_physical_unbind, .attach_ioas = vfio_iommufd_physical_attach_ioas, + .detach_ioas = vfio_iommufd_physical_detach_ioas, }; static struct fsl_mc_driver vfio_fsl_mc_driver = { @@ -600,23 +601,11 @@ static struct fsl_mc_driver vfio_fsl_mc_driver = { .remove = vfio_fsl_mc_remove, .driver = { .name = "vfio-fsl-mc", - .owner = THIS_MODULE, }, .driver_managed_dma = true, }; -static int __init vfio_fsl_mc_driver_init(void) -{ - return fsl_mc_driver_register(&vfio_fsl_mc_driver); -} - -static void __exit vfio_fsl_mc_driver_exit(void) -{ - fsl_mc_driver_unregister(&vfio_fsl_mc_driver); -} - -module_init(vfio_fsl_mc_driver_init); -module_exit(vfio_fsl_mc_driver_exit); +module_fsl_mc_driver(vfio_fsl_mc_driver); MODULE_LICENSE("Dual BSD/GPL"); MODULE_DESCRIPTION("VFIO for FSL-MC devices - User Level meta-driver"); diff --git a/drivers/vfio/group.c b/drivers/vfio/group.c index fc75c1000d74..610a429c6191 100644 --- a/drivers/vfio/group.c +++ b/drivers/vfio/group.c @@ -160,17 +160,13 @@ out_unlock: static void vfio_device_group_get_kvm_safe(struct vfio_device *device) { spin_lock(&device->group->kvm_ref_lock); - if (!device->group->kvm) - goto unlock; - - _vfio_device_get_kvm_safe(device, device->group->kvm); - -unlock: + vfio_device_get_kvm_safe(device, device->group->kvm); spin_unlock(&device->group->kvm_ref_lock); } -static int vfio_device_group_open(struct vfio_device *device) +static int vfio_df_group_open(struct vfio_device_file *df) { + struct vfio_device *device = df->device; int ret; mutex_lock(&device->group->group_lock); @@ -190,24 +186,62 @@ static int vfio_device_group_open(struct vfio_device *device) if (device->open_count == 0) vfio_device_group_get_kvm_safe(device); - ret = vfio_device_open(device, device->group->iommufd); + df->iommufd = device->group->iommufd; + if (df->iommufd && vfio_device_is_noiommu(device) && device->open_count == 0) { + /* + * Require no compat ioas to be assigned to proceed. The basic + * statement is that the user cannot have done something that + * implies they expected translation to exist + */ + if (!capable(CAP_SYS_RAWIO) || + vfio_iommufd_device_has_compat_ioas(device, df->iommufd)) + ret = -EPERM; + else + ret = 0; + goto out_put_kvm; + } - if (device->open_count == 0) - vfio_device_put_kvm(device); + ret = vfio_df_open(df); + if (ret) + goto out_put_kvm; + + if (df->iommufd && device->open_count == 1) { + ret = vfio_iommufd_compat_attach_ioas(device, df->iommufd); + if (ret) + goto out_close_device; + } + + /* + * Paired with smp_load_acquire() in vfio_device_fops::ioctl/ + * read/write/mmap and vfio_file_has_device_access() + */ + smp_store_release(&df->access_granted, true); mutex_unlock(&device->dev_set->lock); + mutex_unlock(&device->group->group_lock); + return 0; +out_close_device: + vfio_df_close(df); +out_put_kvm: + df->iommufd = NULL; + if (device->open_count == 0) + vfio_device_put_kvm(device); + mutex_unlock(&device->dev_set->lock); out_unlock: mutex_unlock(&device->group->group_lock); return ret; } -void vfio_device_group_close(struct vfio_device *device) +void vfio_df_group_close(struct vfio_device_file *df) { + struct vfio_device *device = df->device; + mutex_lock(&device->group->group_lock); mutex_lock(&device->dev_set->lock); - vfio_device_close(device, device->group->iommufd); + vfio_df_close(df); + df->iommufd = NULL; if (device->open_count == 0) vfio_device_put_kvm(device); @@ -218,19 +252,28 @@ void vfio_device_group_close(struct vfio_device *device) static struct file *vfio_device_open_file(struct vfio_device *device) { + struct vfio_device_file *df; struct file *filep; int ret; - ret = vfio_device_group_open(device); - if (ret) + df = vfio_allocate_device_file(device); + if (IS_ERR(df)) { + ret = PTR_ERR(df); goto err_out; + } + + df->group = device->group; + + ret = vfio_df_group_open(df); + if (ret) + goto err_free; /* * We can't use anon_inode_getfd() because we need to modify * the f_mode flags directly to allow more than just ioctls */ filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops, - device, O_RDWR); + df, O_RDWR); if (IS_ERR(filep)) { ret = PTR_ERR(filep); goto err_close_device; @@ -253,7 +296,9 @@ static struct file *vfio_device_open_file(struct vfio_device *device) return filep; err_close_device: - vfio_device_group_close(device); + vfio_df_group_close(df); +err_free: + kfree(df); err_out: return ERR_PTR(ret); } @@ -357,6 +402,33 @@ static long vfio_group_fops_unl_ioctl(struct file *filep, } } +int vfio_device_block_group(struct vfio_device *device) +{ + struct vfio_group *group = device->group; + int ret = 0; + + mutex_lock(&group->group_lock); + if (group->opened_file) { + ret = -EBUSY; + goto out_unlock; + } + + group->cdev_device_open_cnt++; + +out_unlock: + mutex_unlock(&group->group_lock); + return ret; +} + +void vfio_device_unblock_group(struct vfio_device *device) +{ + struct vfio_group *group = device->group; + + mutex_lock(&group->group_lock); + group->cdev_device_open_cnt--; + mutex_unlock(&group->group_lock); +} + static int vfio_group_fops_open(struct inode *inode, struct file *filep) { struct vfio_group *group = @@ -379,6 +451,11 @@ static int vfio_group_fops_open(struct inode *inode, struct file *filep) goto out_unlock; } + if (group->cdev_device_open_cnt) { + ret = -EBUSY; + goto out_unlock; + } + /* * Do we need multiple instances of the group open? Seems not. */ @@ -453,6 +530,7 @@ static void vfio_group_release(struct device *dev) mutex_destroy(&group->device_lock); mutex_destroy(&group->group_lock); WARN_ON(group->iommu_group); + WARN_ON(group->cdev_device_open_cnt); ida_free(&vfio.group_ida, MINOR(group->dev.devt)); kfree(group); } @@ -604,16 +682,6 @@ static struct vfio_group *vfio_group_find_or_alloc(struct device *dev) if (!iommu_group) return ERR_PTR(-EINVAL); - /* - * VFIO always sets IOMMU_CACHE because we offer no way for userspace to - * restore cache coherency. It has to be checked here because it is only - * valid for cases where we are using iommu groups. - */ - if (!device_iommu_capable(dev, IOMMU_CAP_CACHE_COHERENCY)) { - iommu_group_put(iommu_group); - return ERR_PTR(-EINVAL); - } - mutex_lock(&vfio.group_lock); group = vfio_group_find_from_iommu(iommu_group); if (group) { @@ -745,6 +813,15 @@ bool vfio_device_has_container(struct vfio_device *device) return device->group->container; } +struct vfio_group *vfio_group_from_file(struct file *file) +{ + struct vfio_group *group = file->private_data; + + if (file->f_op != &vfio_group_fops) + return NULL; + return group; +} + /** * vfio_file_iommu_group - Return the struct iommu_group for the vfio group file * @file: VFIO group file @@ -755,13 +832,13 @@ bool vfio_device_has_container(struct vfio_device *device) */ struct iommu_group *vfio_file_iommu_group(struct file *file) { - struct vfio_group *group = file->private_data; + struct vfio_group *group = vfio_group_from_file(file); struct iommu_group *iommu_group = NULL; if (!IS_ENABLED(CONFIG_SPAPR_TCE_IOMMU)) return NULL; - if (!vfio_file_is_group(file)) + if (!group) return NULL; mutex_lock(&group->group_lock); @@ -775,33 +852,20 @@ struct iommu_group *vfio_file_iommu_group(struct file *file) EXPORT_SYMBOL_GPL(vfio_file_iommu_group); /** - * vfio_file_is_group - True if the file is usable with VFIO aPIS + * vfio_file_is_group - True if the file is a vfio group file * @file: VFIO group file */ bool vfio_file_is_group(struct file *file) { - return file->f_op == &vfio_group_fops; + return vfio_group_from_file(file); } EXPORT_SYMBOL_GPL(vfio_file_is_group); -/** - * vfio_file_enforced_coherent - True if the DMA associated with the VFIO file - * is always CPU cache coherent - * @file: VFIO group file - * - * Enforced coherency means that the IOMMU ignores things like the PCIe no-snoop - * bit in DMA transactions. A return of false indicates that the user has - * rights to access additional instructions such as wbinvd on x86. - */ -bool vfio_file_enforced_coherent(struct file *file) +bool vfio_group_enforced_coherent(struct vfio_group *group) { - struct vfio_group *group = file->private_data; struct vfio_device *device; bool ret = true; - if (!vfio_file_is_group(file)) - return true; - /* * If the device does not have IOMMU_CAP_ENFORCE_CACHE_COHERENCY then * any domain later attached to it will also not support it. If the cap @@ -819,28 +883,13 @@ bool vfio_file_enforced_coherent(struct file *file) mutex_unlock(&group->device_lock); return ret; } -EXPORT_SYMBOL_GPL(vfio_file_enforced_coherent); -/** - * vfio_file_set_kvm - Link a kvm with VFIO drivers - * @file: VFIO group file - * @kvm: KVM to link - * - * When a VFIO device is first opened the KVM will be available in - * device->kvm if one was associated with the group. - */ -void vfio_file_set_kvm(struct file *file, struct kvm *kvm) +void vfio_group_set_kvm(struct vfio_group *group, struct kvm *kvm) { - struct vfio_group *group = file->private_data; - - if (!vfio_file_is_group(file)) - return; - spin_lock(&group->kvm_ref_lock); group->kvm = kvm; spin_unlock(&group->kvm_ref_lock); } -EXPORT_SYMBOL_GPL(vfio_file_set_kvm); /** * vfio_file_has_dev - True if the VFIO file is a handle for device @@ -851,9 +900,9 @@ EXPORT_SYMBOL_GPL(vfio_file_set_kvm); */ bool vfio_file_has_dev(struct file *file, struct vfio_device *device) { - struct vfio_group *group = file->private_data; + struct vfio_group *group = vfio_group_from_file(file); - if (!vfio_file_is_group(file)) + if (!group) return false; return group == device->group; diff --git a/drivers/vfio/iommufd.c b/drivers/vfio/iommufd.c index 88b00c501015..4d84904fd927 100644 --- a/drivers/vfio/iommufd.c +++ b/drivers/vfio/iommufd.c @@ -10,53 +10,48 @@ MODULE_IMPORT_NS(IOMMUFD); MODULE_IMPORT_NS(IOMMUFD_VFIO); -int vfio_iommufd_bind(struct vfio_device *vdev, struct iommufd_ctx *ictx) +bool vfio_iommufd_device_has_compat_ioas(struct vfio_device *vdev, + struct iommufd_ctx *ictx) +{ + u32 ioas_id; + + return !iommufd_vfio_compat_ioas_get_id(ictx, &ioas_id); +} + +int vfio_df_iommufd_bind(struct vfio_device_file *df) +{ + struct vfio_device *vdev = df->device; + struct iommufd_ctx *ictx = df->iommufd; + + lockdep_assert_held(&vdev->dev_set->lock); + + return vdev->ops->bind_iommufd(vdev, ictx, &df->devid); +} + +int vfio_iommufd_compat_attach_ioas(struct vfio_device *vdev, + struct iommufd_ctx *ictx) { u32 ioas_id; - u32 device_id; int ret; lockdep_assert_held(&vdev->dev_set->lock); - if (vfio_device_is_noiommu(vdev)) { - if (!capable(CAP_SYS_RAWIO)) - return -EPERM; - - /* - * Require no compat ioas to be assigned to proceed. The basic - * statement is that the user cannot have done something that - * implies they expected translation to exist - */ - if (!iommufd_vfio_compat_ioas_get_id(ictx, &ioas_id)) - return -EPERM; + /* compat noiommu does not need to do ioas attach */ + if (vfio_device_is_noiommu(vdev)) return 0; - } - - ret = vdev->ops->bind_iommufd(vdev, ictx, &device_id); - if (ret) - return ret; ret = iommufd_vfio_compat_ioas_get_id(ictx, &ioas_id); if (ret) - goto err_unbind; - ret = vdev->ops->attach_ioas(vdev, &ioas_id); - if (ret) - goto err_unbind; - - /* - * The legacy path has no way to return the device id or the selected - * pt_id - */ - return 0; + return ret; -err_unbind: - if (vdev->ops->unbind_iommufd) - vdev->ops->unbind_iommufd(vdev); - return ret; + /* The legacy path has no way to return the selected pt_id */ + return vdev->ops->attach_ioas(vdev, &ioas_id); } -void vfio_iommufd_unbind(struct vfio_device *vdev) +void vfio_df_iommufd_unbind(struct vfio_device_file *df) { + struct vfio_device *vdev = df->device; + lockdep_assert_held(&vdev->dev_set->lock); if (vfio_device_is_noiommu(vdev)) @@ -66,6 +61,50 @@ void vfio_iommufd_unbind(struct vfio_device *vdev) vdev->ops->unbind_iommufd(vdev); } +struct iommufd_ctx *vfio_iommufd_device_ictx(struct vfio_device *vdev) +{ + if (vdev->iommufd_device) + return iommufd_device_to_ictx(vdev->iommufd_device); + return NULL; +} +EXPORT_SYMBOL_GPL(vfio_iommufd_device_ictx); + +static int vfio_iommufd_device_id(struct vfio_device *vdev) +{ + if (vdev->iommufd_device) + return iommufd_device_to_id(vdev->iommufd_device); + return -EINVAL; +} + +/* + * Return devid for a device. + * valid ID for the device that is owned by the ictx + * -ENOENT = device is owned but there is no ID + * -ENODEV or other error = device is not owned + */ +int vfio_iommufd_get_dev_id(struct vfio_device *vdev, struct iommufd_ctx *ictx) +{ + struct iommu_group *group; + int devid; + + if (vfio_iommufd_device_ictx(vdev) == ictx) + return vfio_iommufd_device_id(vdev); + + group = iommu_group_get(vdev->dev); + if (!group) + return -ENODEV; + + if (iommufd_ctx_has_group(ictx, group)) + devid = -ENOENT; + else + devid = -ENODEV; + + iommu_group_put(group); + + return devid; +} +EXPORT_SYMBOL_GPL(vfio_iommufd_get_dev_id); + /* * The physical standard ops mean that the iommufd_device is bound to the * physical device vdev->dev that was provided to vfio_init_group_dev(). Drivers @@ -101,6 +140,14 @@ int vfio_iommufd_physical_attach_ioas(struct vfio_device *vdev, u32 *pt_id) { int rc; + lockdep_assert_held(&vdev->dev_set->lock); + + if (WARN_ON(!vdev->iommufd_device)) + return -EINVAL; + + if (vdev->iommufd_attached) + return -EBUSY; + rc = iommufd_device_attach(vdev->iommufd_device, pt_id); if (rc) return rc; @@ -109,6 +156,18 @@ int vfio_iommufd_physical_attach_ioas(struct vfio_device *vdev, u32 *pt_id) } EXPORT_SYMBOL_GPL(vfio_iommufd_physical_attach_ioas); +void vfio_iommufd_physical_detach_ioas(struct vfio_device *vdev) +{ + lockdep_assert_held(&vdev->dev_set->lock); + + if (WARN_ON(!vdev->iommufd_device) || !vdev->iommufd_attached) + return; + + iommufd_device_detach(vdev->iommufd_device); + vdev->iommufd_attached = false; +} +EXPORT_SYMBOL_GPL(vfio_iommufd_physical_detach_ioas); + /* * The emulated standard ops mean that vfio_device is going to use the * "mdev path" and will call vfio_pin_pages()/vfio_dma_rw(). Drivers using this @@ -172,3 +231,16 @@ int vfio_iommufd_emulated_attach_ioas(struct vfio_device *vdev, u32 *pt_id) return 0; } EXPORT_SYMBOL_GPL(vfio_iommufd_emulated_attach_ioas); + +void vfio_iommufd_emulated_detach_ioas(struct vfio_device *vdev) +{ + lockdep_assert_held(&vdev->dev_set->lock); + + if (WARN_ON(!vdev->iommufd_access) || + !vdev->iommufd_attached) + return; + + iommufd_access_detach(vdev->iommufd_access); + vdev->iommufd_attached = false; +} +EXPORT_SYMBOL_GPL(vfio_iommufd_emulated_detach_ioas); diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig index 86bb7835cf3c..8125e5f37832 100644 --- a/drivers/vfio/pci/Kconfig +++ b/drivers/vfio/pci/Kconfig @@ -63,4 +63,6 @@ source "drivers/vfio/pci/mlx5/Kconfig" source "drivers/vfio/pci/hisilicon/Kconfig" +source "drivers/vfio/pci/pds/Kconfig" + endmenu diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile index 24c524224da5..45167be462d8 100644 --- a/drivers/vfio/pci/Makefile +++ b/drivers/vfio/pci/Makefile @@ -11,3 +11,5 @@ obj-$(CONFIG_VFIO_PCI) += vfio-pci.o obj-$(CONFIG_MLX5_VFIO_PCI) += mlx5/ obj-$(CONFIG_HISI_ACC_VFIO_PCI) += hisilicon/ + +obj-$(CONFIG_PDS_VFIO_PCI) += pds/ diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index a117eaf21c14..b2f9778c8366 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -1373,6 +1373,7 @@ static const struct vfio_device_ops hisi_acc_vfio_pci_migrn_ops = { .bind_iommufd = vfio_iommufd_physical_bind, .unbind_iommufd = vfio_iommufd_physical_unbind, .attach_ioas = vfio_iommufd_physical_attach_ioas, + .detach_ioas = vfio_iommufd_physical_detach_ioas, }; static const struct vfio_device_ops hisi_acc_vfio_pci_ops = { @@ -1391,6 +1392,7 @@ static const struct vfio_device_ops hisi_acc_vfio_pci_ops = { .bind_iommufd = vfio_iommufd_physical_bind, .unbind_iommufd = vfio_iommufd_physical_unbind, .attach_ioas = vfio_iommufd_physical_attach_ioas, + .detach_ioas = vfio_iommufd_physical_detach_ioas, }; static int hisi_acc_vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index c82c1f4fc588..33574b04477d 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -732,52 +732,6 @@ void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf) mlx5vf_cmd_dealloc_pd(migf); } -static void combine_ranges(struct rb_root_cached *root, u32 cur_nodes, - u32 req_nodes) -{ - struct interval_tree_node *prev, *curr, *comb_start, *comb_end; - unsigned long min_gap; - unsigned long curr_gap; - - /* Special shortcut when a single range is required */ - if (req_nodes == 1) { - unsigned long last; - - curr = comb_start = interval_tree_iter_first(root, 0, ULONG_MAX); - while (curr) { - last = curr->last; - prev = curr; - curr = interval_tree_iter_next(curr, 0, ULONG_MAX); - if (prev != comb_start) - interval_tree_remove(prev, root); - } - comb_start->last = last; - return; - } - - /* Combine ranges which have the smallest gap */ - while (cur_nodes > req_nodes) { - prev = NULL; - min_gap = ULONG_MAX; - curr = interval_tree_iter_first(root, 0, ULONG_MAX); - while (curr) { - if (prev) { - curr_gap = curr->start - prev->last; - if (curr_gap < min_gap) { - min_gap = curr_gap; - comb_start = prev; - comb_end = curr; - } - } - prev = curr; - curr = interval_tree_iter_next(curr, 0, ULONG_MAX); - } - comb_start->last = comb_end->last; - interval_tree_remove(comb_end, root); - cur_nodes--; - } -} - static int mlx5vf_create_tracker(struct mlx5_core_dev *mdev, struct mlx5vf_pci_core_device *mvdev, struct rb_root_cached *ranges, u32 nnodes) @@ -800,7 +754,7 @@ static int mlx5vf_create_tracker(struct mlx5_core_dev *mdev, int i; if (num_ranges > max_num_range) { - combine_ranges(ranges, nnodes, max_num_range); + vfio_combine_iova_ranges(ranges, nnodes, max_num_range); num_ranges = max_num_range; } diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index d95fd382814c..42ec574a8622 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -1320,6 +1320,7 @@ static const struct vfio_device_ops mlx5vf_pci_ops = { .bind_iommufd = vfio_iommufd_physical_bind, .unbind_iommufd = vfio_iommufd_physical_unbind, .attach_ioas = vfio_iommufd_physical_attach_ioas, + .detach_ioas = vfio_iommufd_physical_detach_ioas, }; static int mlx5vf_pci_probe(struct pci_dev *pdev, diff --git a/drivers/vfio/pci/pds/Kconfig b/drivers/vfio/pci/pds/Kconfig new file mode 100644 index 000000000000..407b3fd32733 --- /dev/null +++ b/drivers/vfio/pci/pds/Kconfig @@ -0,0 +1,19 @@ +# SPDX-License-Identifier: GPL-2.0 +# Copyright (c) 2023 Advanced Micro Devices, Inc. + +config PDS_VFIO_PCI + tristate "VFIO support for PDS PCI devices" + depends on PDS_CORE + select VFIO_PCI_CORE + help + This provides generic PCI support for PDS devices using the VFIO + framework. + + More specific information on this driver can be + found in + <file:Documentation/networking/device_drivers/ethernet/amd/pds_vfio_pci.rst>. + + To compile this driver as a module, choose M here. The module + will be called pds-vfio-pci. + + If you don't know what to do here, say N. diff --git a/drivers/vfio/pci/pds/Makefile b/drivers/vfio/pci/pds/Makefile new file mode 100644 index 000000000000..d5a06d81634f --- /dev/null +++ b/drivers/vfio/pci/pds/Makefile @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: GPL-2.0 +# Copyright (c) 2023 Advanced Micro Devices, Inc. + +obj-$(CONFIG_PDS_VFIO_PCI) += pds-vfio-pci.o + +pds-vfio-pci-y := \ + cmds.o \ + dirty.o \ + lm.o \ + pci_drv.o \ + vfio_dev.o diff --git a/drivers/vfio/pci/pds/cmds.c b/drivers/vfio/pci/pds/cmds.c new file mode 100644 index 000000000000..36463ccc3df9 --- /dev/null +++ b/drivers/vfio/pci/pds/cmds.c @@ -0,0 +1,510 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2023 Advanced Micro Devices, Inc. */ + +#include <linux/io.h> +#include <linux/types.h> +#include <linux/delay.h> + +#include <linux/pds/pds_common.h> +#include <linux/pds/pds_core_if.h> +#include <linux/pds/pds_adminq.h> + +#include "vfio_dev.h" +#include "cmds.h" + +#define SUSPEND_TIMEOUT_S 5 +#define SUSPEND_CHECK_INTERVAL_MS 1 + +static int pds_vfio_client_adminq_cmd(struct pds_vfio_pci_device *pds_vfio, + union pds_core_adminq_cmd *req, + union pds_core_adminq_comp *resp, + bool fast_poll) +{ + struct pci_dev *pdev = pds_vfio_to_pci_dev(pds_vfio); + union pds_core_adminq_cmd cmd = {}; + struct pdsc *pdsc; + int err; + + /* Wrap the client request */ + cmd.client_request.opcode = PDS_AQ_CMD_CLIENT_CMD; + cmd.client_request.client_id = cpu_to_le16(pds_vfio->client_id); + memcpy(cmd.client_request.client_cmd, req, + sizeof(cmd.client_request.client_cmd)); + + pdsc = pdsc_get_pf_struct(pdev); + if (IS_ERR(pdsc)) + return PTR_ERR(pdsc); + + err = pdsc_adminq_post(pdsc, &cmd, resp, fast_poll); + if (err && err != -EAGAIN) + dev_err(pds_vfio_to_dev(pds_vfio), + "client admin cmd failed: %pe\n", ERR_PTR(err)); + + return err; +} + +int pds_vfio_register_client_cmd(struct pds_vfio_pci_device *pds_vfio) +{ + struct pci_dev *pdev = pds_vfio_to_pci_dev(pds_vfio); + char devname[PDS_DEVNAME_LEN]; + struct pdsc *pdsc; + int ci; + + snprintf(devname, sizeof(devname), "%s.%d-%u", PDS_VFIO_LM_DEV_NAME, + pci_domain_nr(pdev->bus), + PCI_DEVID(pdev->bus->number, pdev->devfn)); + + pdsc = pdsc_get_pf_struct(pdev); + if (IS_ERR(pdsc)) + return PTR_ERR(pdsc); + + ci = pds_client_register(pdsc, devname); + if (ci < 0) + return ci; + + pds_vfio->client_id = ci; + + return 0; +} + +void pds_vfio_unregister_client_cmd(struct pds_vfio_pci_device *pds_vfio) +{ + struct pci_dev *pdev = pds_vfio_to_pci_dev(pds_vfio); + struct pdsc *pdsc; + int err; + + pdsc = pdsc_get_pf_struct(pdev); + if (IS_ERR(pdsc)) + return; + + err = pds_client_unregister(pdsc, pds_vfio->client_id); + if (err) + dev_err(&pdev->dev, "unregister from DSC failed: %pe\n", + ERR_PTR(err)); + + pds_vfio->client_id = 0; +} + +static int +pds_vfio_suspend_wait_device_cmd(struct pds_vfio_pci_device *pds_vfio, u8 type) +{ + union pds_core_adminq_cmd cmd = { + .lm_suspend_status = { + .opcode = PDS_LM_CMD_SUSPEND_STATUS, + .vf_id = cpu_to_le16(pds_vfio->vf_id), + .type = type, + }, + }; + struct device *dev = pds_vfio_to_dev(pds_vfio); + union pds_core_adminq_comp comp = {}; + unsigned long time_limit; + unsigned long time_start; + unsigned long time_done; + int err; + + time_start = jiffies; + time_limit = time_start + HZ * SUSPEND_TIMEOUT_S; + do { + err = pds_vfio_client_adminq_cmd(pds_vfio, &cmd, &comp, true); + if (err != -EAGAIN) + break; + + msleep(SUSPEND_CHECK_INTERVAL_MS); + } while (time_before(jiffies, time_limit)); + + time_done = jiffies; + dev_dbg(dev, "%s: vf%u: Suspend comp received in %d msecs\n", __func__, + pds_vfio->vf_id, jiffies_to_msecs(time_done - time_start)); + + /* Check the results */ + if (time_after_eq(time_done, time_limit)) { + dev_err(dev, "%s: vf%u: Suspend comp timeout\n", __func__, + pds_vfio->vf_id); + err = -ETIMEDOUT; + } + + return err; +} + +int pds_vfio_suspend_device_cmd(struct pds_vfio_pci_device *pds_vfio, u8 type) +{ + union pds_core_adminq_cmd cmd = { + .lm_suspend = { + .opcode = PDS_LM_CMD_SUSPEND, + .vf_id = cpu_to_le16(pds_vfio->vf_id), + .type = type, + }, + }; + struct device *dev = pds_vfio_to_dev(pds_vfio); + union pds_core_adminq_comp comp = {}; + int err; + + dev_dbg(dev, "vf%u: Suspend device\n", pds_vfio->vf_id); + + /* + * The initial suspend request to the firmware starts the device suspend + * operation and the firmware returns success if it's started + * successfully. + */ + err = pds_vfio_client_adminq_cmd(pds_vfio, &cmd, &comp, true); + if (err) { + dev_err(dev, "vf%u: Suspend failed: %pe\n", pds_vfio->vf_id, + ERR_PTR(err)); + return err; + } + + /* + * The subsequent suspend status request(s) check if the firmware has + * completed the device suspend process. + */ + return pds_vfio_suspend_wait_device_cmd(pds_vfio, type); +} + +int pds_vfio_resume_device_cmd(struct pds_vfio_pci_device *pds_vfio, u8 type) +{ + union pds_core_adminq_cmd cmd = { + .lm_resume = { + .opcode = PDS_LM_CMD_RESUME, + .vf_id = cpu_to_le16(pds_vfio->vf_id), + .type = type, + }, + }; + struct device *dev = pds_vfio_to_dev(pds_vfio); + union pds_core_adminq_comp comp = {}; + + dev_dbg(dev, "vf%u: Resume device\n", pds_vfio->vf_id); + + return pds_vfio_client_adminq_cmd(pds_vfio, &cmd, &comp, true); +} + +int pds_vfio_get_lm_state_size_cmd(struct pds_vfio_pci_device *pds_vfio, u64 *size) +{ + union pds_core_adminq_cmd cmd = { + .lm_state_size = { + .opcode = PDS_LM_CMD_STATE_SIZE, + .vf_id = cpu_to_le16(pds_vfio->vf_id), + }, + }; + struct device *dev = pds_vfio_to_dev(pds_vfio); + union pds_core_adminq_comp comp = {}; + int err; + + dev_dbg(dev, "vf%u: Get migration status\n", pds_vfio->vf_id); + + err = pds_vfio_client_adminq_cmd(pds_vfio, &cmd, &comp, false); + if (err) + return err; + + *size = le64_to_cpu(comp.lm_state_size.size); + return 0; +} + +static int pds_vfio_dma_map_lm_file(struct device *dev, + enum dma_data_direction dir, + struct pds_vfio_lm_file *lm_file) +{ + struct pds_lm_sg_elem *sgl, *sge; + struct scatterlist *sg; + dma_addr_t sgl_addr; + size_t sgl_size; + int err; + int i; + + if (!lm_file) + return -EINVAL; + + /* dma map file pages */ + err = dma_map_sgtable(dev, &lm_file->sg_table, dir, 0); + if (err) + return err; + + lm_file->num_sge = lm_file->sg_table.nents; + + /* alloc sgl */ + sgl_size = lm_file->num_sge * sizeof(struct pds_lm_sg_elem); + sgl = kzalloc(sgl_size, GFP_KERNEL); + if (!sgl) { + err = -ENOMEM; + goto out_unmap_sgtable; + } + + /* fill sgl */ + sge = sgl; + for_each_sgtable_dma_sg(&lm_file->sg_table, sg, i) { + sge->addr = cpu_to_le64(sg_dma_address(sg)); + sge->len = cpu_to_le32(sg_dma_len(sg)); + dev_dbg(dev, "addr = %llx, len = %u\n", sge->addr, sge->len); + sge++; + } + + sgl_addr = dma_map_single(dev, sgl, sgl_size, DMA_TO_DEVICE); + if (dma_mapping_error(dev, sgl_addr)) { + err = -EIO; + goto out_free_sgl; + } + + lm_file->sgl = sgl; + lm_file->sgl_addr = sgl_addr; + + return 0; + +out_free_sgl: + kfree(sgl); +out_unmap_sgtable: + lm_file->num_sge = 0; + dma_unmap_sgtable(dev, &lm_file->sg_table, dir, 0); + return err; +} + +static void pds_vfio_dma_unmap_lm_file(struct device *dev, + enum dma_data_direction dir, + struct pds_vfio_lm_file *lm_file) +{ + if (!lm_file) + return; + + /* free sgl */ + if (lm_file->sgl) { + dma_unmap_single(dev, lm_file->sgl_addr, + lm_file->num_sge * sizeof(*lm_file->sgl), + DMA_TO_DEVICE); + kfree(lm_file->sgl); + lm_file->sgl = NULL; + lm_file->sgl_addr = DMA_MAPPING_ERROR; + lm_file->num_sge = 0; + } + + /* dma unmap file pages */ + dma_unmap_sgtable(dev, &lm_file->sg_table, dir, 0); +} + +int pds_vfio_get_lm_state_cmd(struct pds_vfio_pci_device *pds_vfio) +{ + union pds_core_adminq_cmd cmd = { + .lm_save = { + .opcode = PDS_LM_CMD_SAVE, + .vf_id = cpu_to_le16(pds_vfio->vf_id), + }, + }; + struct pci_dev *pdev = pds_vfio_to_pci_dev(pds_vfio); + struct device *pdsc_dev = &pci_physfn(pdev)->dev; + union pds_core_adminq_comp comp = {}; + struct pds_vfio_lm_file *lm_file; + int err; + + dev_dbg(&pdev->dev, "vf%u: Get migration state\n", pds_vfio->vf_id); + + lm_file = pds_vfio->save_file; + + err = pds_vfio_dma_map_lm_file(pdsc_dev, DMA_FROM_DEVICE, lm_file); + if (err) { + dev_err(&pdev->dev, "failed to map save migration file: %pe\n", + ERR_PTR(err)); + return err; + } + + cmd.lm_save.sgl_addr = cpu_to_le64(lm_file->sgl_addr); + cmd.lm_save.num_sge = cpu_to_le32(lm_file->num_sge); + + err = pds_vfio_client_adminq_cmd(pds_vfio, &cmd, &comp, false); + if (err) + dev_err(&pdev->dev, "failed to get migration state: %pe\n", + ERR_PTR(err)); + + pds_vfio_dma_unmap_lm_file(pdsc_dev, DMA_FROM_DEVICE, lm_file); + + return err; +} + +int pds_vfio_set_lm_state_cmd(struct pds_vfio_pci_device *pds_vfio) +{ + union pds_core_adminq_cmd cmd = { + .lm_restore = { + .opcode = PDS_LM_CMD_RESTORE, + .vf_id = cpu_to_le16(pds_vfio->vf_id), + }, + }; + struct pci_dev *pdev = pds_vfio_to_pci_dev(pds_vfio); + struct device *pdsc_dev = &pci_physfn(pdev)->dev; + union pds_core_adminq_comp comp = {}; + struct pds_vfio_lm_file *lm_file; + int err; + + dev_dbg(&pdev->dev, "vf%u: Set migration state\n", pds_vfio->vf_id); + + lm_file = pds_vfio->restore_file; + + err = pds_vfio_dma_map_lm_file(pdsc_dev, DMA_TO_DEVICE, lm_file); + if (err) { + dev_err(&pdev->dev, + "failed to map restore migration file: %pe\n", + ERR_PTR(err)); + return err; + } + + cmd.lm_restore.sgl_addr = cpu_to_le64(lm_file->sgl_addr); + cmd.lm_restore.num_sge = cpu_to_le32(lm_file->num_sge); + + err = pds_vfio_client_adminq_cmd(pds_vfio, &cmd, &comp, false); + if (err) + dev_err(&pdev->dev, "failed to set migration state: %pe\n", + ERR_PTR(err)); + + pds_vfio_dma_unmap_lm_file(pdsc_dev, DMA_TO_DEVICE, lm_file); + + return err; +} + +void pds_vfio_send_host_vf_lm_status_cmd(struct pds_vfio_pci_device *pds_vfio, + enum pds_lm_host_vf_status vf_status) +{ + union pds_core_adminq_cmd cmd = { + .lm_host_vf_status = { + .opcode = PDS_LM_CMD_HOST_VF_STATUS, + .vf_id = cpu_to_le16(pds_vfio->vf_id), + .status = vf_status, + }, + }; + struct device *dev = pds_vfio_to_dev(pds_vfio); + union pds_core_adminq_comp comp = {}; + int err; + + dev_dbg(dev, "vf%u: Set host VF LM status: %u", pds_vfio->vf_id, + vf_status); + if (vf_status != PDS_LM_STA_IN_PROGRESS && + vf_status != PDS_LM_STA_NONE) { + dev_warn(dev, "Invalid host VF migration status, %d\n", + vf_status); + return; + } + + err = pds_vfio_client_adminq_cmd(pds_vfio, &cmd, &comp, false); + if (err) + dev_warn(dev, "failed to send host VF migration status: %pe\n", + ERR_PTR(err)); +} + +int pds_vfio_dirty_status_cmd(struct pds_vfio_pci_device *pds_vfio, + u64 regions_dma, u8 *max_regions, u8 *num_regions) +{ + union pds_core_adminq_cmd cmd = { + .lm_dirty_status = { + .opcode = PDS_LM_CMD_DIRTY_STATUS, + .vf_id = cpu_to_le16(pds_vfio->vf_id), + }, + }; + struct device *dev = pds_vfio_to_dev(pds_vfio); + union pds_core_adminq_comp comp = {}; + int err; + + dev_dbg(dev, "vf%u: Dirty status\n", pds_vfio->vf_id); + + cmd.lm_dirty_status.regions_dma = cpu_to_le64(regions_dma); + cmd.lm_dirty_status.max_regions = *max_regions; + + err = pds_vfio_client_adminq_cmd(pds_vfio, &cmd, &comp, false); + if (err) { + dev_err(dev, "failed to get dirty status: %pe\n", ERR_PTR(err)); + return err; + } + + /* only support seq_ack approach for now */ + if (!(le32_to_cpu(comp.lm_dirty_status.bmp_type_mask) & + BIT(PDS_LM_DIRTY_BMP_TYPE_SEQ_ACK))) { + dev_err(dev, "Dirty bitmap tracking SEQ_ACK not supported\n"); + return -EOPNOTSUPP; + } + + *num_regions = comp.lm_dirty_status.num_regions; + *max_regions = comp.lm_dirty_status.max_regions; + + dev_dbg(dev, + "Page Tracking Status command successful, max_regions: %d, num_regions: %d, bmp_type: %s\n", + *max_regions, *num_regions, "PDS_LM_DIRTY_BMP_TYPE_SEQ_ACK"); + + return 0; +} + +int pds_vfio_dirty_enable_cmd(struct pds_vfio_pci_device *pds_vfio, + u64 regions_dma, u8 num_regions) +{ + union pds_core_adminq_cmd cmd = { + .lm_dirty_enable = { + .opcode = PDS_LM_CMD_DIRTY_ENABLE, + .vf_id = cpu_to_le16(pds_vfio->vf_id), + .regions_dma = cpu_to_le64(regions_dma), + .bmp_type = PDS_LM_DIRTY_BMP_TYPE_SEQ_ACK, + .num_regions = num_regions, + }, + }; + struct device *dev = pds_vfio_to_dev(pds_vfio); + union pds_core_adminq_comp comp = {}; + int err; + + err = pds_vfio_client_adminq_cmd(pds_vfio, &cmd, &comp, false); + if (err) { + dev_err(dev, "failed dirty tracking enable: %pe\n", + ERR_PTR(err)); + return err; + } + + return 0; +} + +int pds_vfio_dirty_disable_cmd(struct pds_vfio_pci_device *pds_vfio) +{ + union pds_core_adminq_cmd cmd = { + .lm_dirty_disable = { + .opcode = PDS_LM_CMD_DIRTY_DISABLE, + .vf_id = cpu_to_le16(pds_vfio->vf_id), + }, + }; + struct device *dev = pds_vfio_to_dev(pds_vfio); + union pds_core_adminq_comp comp = {}; + int err; + + err = pds_vfio_client_adminq_cmd(pds_vfio, &cmd, &comp, false); + if (err || comp.lm_dirty_status.num_regions != 0) { + /* in case num_regions is still non-zero after disable */ + err = err ? err : -EIO; + dev_err(dev, + "failed dirty tracking disable: %pe, num_regions %d\n", + ERR_PTR(err), comp.lm_dirty_status.num_regions); + return err; + } + + return 0; +} + +int pds_vfio_dirty_seq_ack_cmd(struct pds_vfio_pci_device *pds_vfio, + u64 sgl_dma, u16 num_sge, u32 offset, + u32 total_len, bool read_seq) +{ + const char *cmd_type_str = read_seq ? "read_seq" : "write_ack"; + union pds_core_adminq_cmd cmd = { + .lm_dirty_seq_ack = { + .vf_id = cpu_to_le16(pds_vfio->vf_id), + .len_bytes = cpu_to_le32(total_len), + .off_bytes = cpu_to_le32(offset), + .sgl_addr = cpu_to_le64(sgl_dma), + .num_sge = cpu_to_le16(num_sge), + }, + }; + struct device *dev = pds_vfio_to_dev(pds_vfio); + union pds_core_adminq_comp comp = {}; + int err; + + if (read_seq) + cmd.lm_dirty_seq_ack.opcode = PDS_LM_CMD_DIRTY_READ_SEQ; + else + cmd.lm_dirty_seq_ack.opcode = PDS_LM_CMD_DIRTY_WRITE_ACK; + + err = pds_vfio_client_adminq_cmd(pds_vfio, &cmd, &comp, false); + if (err) { + dev_err(dev, "failed cmd Page Tracking %s: %pe\n", cmd_type_str, + ERR_PTR(err)); + return err; + } + + return 0; +} diff --git a/drivers/vfio/pci/pds/cmds.h b/drivers/vfio/pci/pds/cmds.h new file mode 100644 index 000000000000..95221100b954 --- /dev/null +++ b/drivers/vfio/pci/pds/cmds.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright(c) 2023 Advanced Micro Devices, Inc. */ + +#ifndef _CMDS_H_ +#define _CMDS_H_ + +int pds_vfio_register_client_cmd(struct pds_vfio_pci_device *pds_vfio); +void pds_vfio_unregister_client_cmd(struct pds_vfio_pci_device *pds_vfio); +int pds_vfio_suspend_device_cmd(struct pds_vfio_pci_device *pds_vfio, u8 type); +int pds_vfio_resume_device_cmd(struct pds_vfio_pci_device *pds_vfio, u8 type); +int pds_vfio_get_lm_state_size_cmd(struct pds_vfio_pci_device *pds_vfio, u64 *size); +int pds_vfio_get_lm_state_cmd(struct pds_vfio_pci_device *pds_vfio); +int pds_vfio_set_lm_state_cmd(struct pds_vfio_pci_device *pds_vfio); +void pds_vfio_send_host_vf_lm_status_cmd(struct pds_vfio_pci_device *pds_vfio, + enum pds_lm_host_vf_status vf_status); +int pds_vfio_dirty_status_cmd(struct pds_vfio_pci_device *pds_vfio, + u64 regions_dma, u8 *max_regions, + u8 *num_regions); +int pds_vfio_dirty_enable_cmd(struct pds_vfio_pci_device *pds_vfio, + u64 regions_dma, u8 num_regions); +int pds_vfio_dirty_disable_cmd(struct pds_vfio_pci_device *pds_vfio); +int pds_vfio_dirty_seq_ack_cmd(struct pds_vfio_pci_device *pds_vfio, + u64 sgl_dma, u16 num_sge, u32 offset, + u32 total_len, bool read_seq); +#endif /* _CMDS_H_ */ diff --git a/drivers/vfio/pci/pds/dirty.c b/drivers/vfio/pci/pds/dirty.c new file mode 100644 index 000000000000..c937aa6f3954 --- /dev/null +++ b/drivers/vfio/pci/pds/dirty.c @@ -0,0 +1,564 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2023 Advanced Micro Devices, Inc. */ + +#include <linux/interval_tree.h> +#include <linux/vfio.h> + +#include <linux/pds/pds_common.h> +#include <linux/pds/pds_core_if.h> +#include <linux/pds/pds_adminq.h> + +#include "vfio_dev.h" +#include "cmds.h" +#include "dirty.h" + +#define READ_SEQ true +#define WRITE_ACK false + +bool pds_vfio_dirty_is_enabled(struct pds_vfio_pci_device *pds_vfio) +{ + return pds_vfio->dirty.is_enabled; +} + +void pds_vfio_dirty_set_enabled(struct pds_vfio_pci_device *pds_vfio) +{ + pds_vfio->dirty.is_enabled = true; +} + +void pds_vfio_dirty_set_disabled(struct pds_vfio_pci_device *pds_vfio) +{ + pds_vfio->dirty.is_enabled = false; +} + +static void +pds_vfio_print_guest_region_info(struct pds_vfio_pci_device *pds_vfio, + u8 max_regions) +{ + int len = max_regions * sizeof(struct pds_lm_dirty_region_info); + struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev; + struct device *pdsc_dev = &pci_physfn(pdev)->dev; + struct pds_lm_dirty_region_info *region_info; + dma_addr_t regions_dma; + u8 num_regions; + int err; + + region_info = kcalloc(max_regions, + sizeof(struct pds_lm_dirty_region_info), + GFP_KERNEL); + if (!region_info) + return; + + regions_dma = + dma_map_single(pdsc_dev, region_info, len, DMA_FROM_DEVICE); + if (dma_mapping_error(pdsc_dev, regions_dma)) + goto out_free_region_info; + + err = pds_vfio_dirty_status_cmd(pds_vfio, regions_dma, &max_regions, + &num_regions); + dma_unmap_single(pdsc_dev, regions_dma, len, DMA_FROM_DEVICE); + if (err) + goto out_free_region_info; + + for (unsigned int i = 0; i < num_regions; i++) + dev_dbg(&pdev->dev, + "region_info[%d]: dma_base 0x%llx page_count %u page_size_log2 %u\n", + i, le64_to_cpu(region_info[i].dma_base), + le32_to_cpu(region_info[i].page_count), + region_info[i].page_size_log2); + +out_free_region_info: + kfree(region_info); +} + +static int pds_vfio_dirty_alloc_bitmaps(struct pds_vfio_dirty *dirty, + unsigned long bytes) +{ + unsigned long *host_seq_bmp, *host_ack_bmp; + + host_seq_bmp = vzalloc(bytes); + if (!host_seq_bmp) + return -ENOMEM; + + host_ack_bmp = vzalloc(bytes); + if (!host_ack_bmp) { + bitmap_free(host_seq_bmp); + return -ENOMEM; + } + + dirty->host_seq.bmp = host_seq_bmp; + dirty->host_ack.bmp = host_ack_bmp; + + return 0; +} + +static void pds_vfio_dirty_free_bitmaps(struct pds_vfio_dirty *dirty) +{ + vfree(dirty->host_seq.bmp); + vfree(dirty->host_ack.bmp); + dirty->host_seq.bmp = NULL; + dirty->host_ack.bmp = NULL; +} + +static void __pds_vfio_dirty_free_sgl(struct pds_vfio_pci_device *pds_vfio, + struct pds_vfio_bmp_info *bmp_info) +{ + struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev; + struct device *pdsc_dev = &pci_physfn(pdev)->dev; + + dma_unmap_single(pdsc_dev, bmp_info->sgl_addr, + bmp_info->num_sge * sizeof(struct pds_lm_sg_elem), + DMA_BIDIRECTIONAL); + kfree(bmp_info->sgl); + + bmp_info->num_sge = 0; + bmp_info->sgl = NULL; + bmp_info->sgl_addr = 0; +} + +static void pds_vfio_dirty_free_sgl(struct pds_vfio_pci_device *pds_vfio) +{ + if (pds_vfio->dirty.host_seq.sgl) + __pds_vfio_dirty_free_sgl(pds_vfio, &pds_vfio->dirty.host_seq); + if (pds_vfio->dirty.host_ack.sgl) + __pds_vfio_dirty_free_sgl(pds_vfio, &pds_vfio->dirty.host_ack); +} + +static int __pds_vfio_dirty_alloc_sgl(struct pds_vfio_pci_device *pds_vfio, + struct pds_vfio_bmp_info *bmp_info, + u32 page_count) +{ + struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev; + struct device *pdsc_dev = &pci_physfn(pdev)->dev; + struct pds_lm_sg_elem *sgl; + dma_addr_t sgl_addr; + size_t sgl_size; + u32 max_sge; + + max_sge = DIV_ROUND_UP(page_count, PAGE_SIZE * 8); + sgl_size = max_sge * sizeof(struct pds_lm_sg_elem); + + sgl = kzalloc(sgl_size, GFP_KERNEL); + if (!sgl) + return -ENOMEM; + + sgl_addr = dma_map_single(pdsc_dev, sgl, sgl_size, DMA_BIDIRECTIONAL); + if (dma_mapping_error(pdsc_dev, sgl_addr)) { + kfree(sgl); + return -EIO; + } + + bmp_info->sgl = sgl; + bmp_info->num_sge = max_sge; + bmp_info->sgl_addr = sgl_addr; + + return 0; +} + +static int pds_vfio_dirty_alloc_sgl(struct pds_vfio_pci_device *pds_vfio, + u32 page_count) +{ + struct pds_vfio_dirty *dirty = &pds_vfio->dirty; + int err; + + err = __pds_vfio_dirty_alloc_sgl(pds_vfio, &dirty->host_seq, + page_count); + if (err) + return err; + + err = __pds_vfio_dirty_alloc_sgl(pds_vfio, &dirty->host_ack, + page_count); + if (err) { + __pds_vfio_dirty_free_sgl(pds_vfio, &dirty->host_seq); + return err; + } + + return 0; +} + +static int pds_vfio_dirty_enable(struct pds_vfio_pci_device *pds_vfio, + struct rb_root_cached *ranges, u32 nnodes, + u64 *page_size) +{ + struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev; + struct device *pdsc_dev = &pci_physfn(pdev)->dev; + struct pds_vfio_dirty *dirty = &pds_vfio->dirty; + u64 region_start, region_size, region_page_size; + struct pds_lm_dirty_region_info *region_info; + struct interval_tree_node *node = NULL; + u8 max_regions = 0, num_regions; + dma_addr_t regions_dma = 0; + u32 num_ranges = nnodes; + u32 page_count; + u16 len; + int err; + + dev_dbg(&pdev->dev, "vf%u: Start dirty page tracking\n", + pds_vfio->vf_id); + + if (pds_vfio_dirty_is_enabled(pds_vfio)) + return -EINVAL; + + /* find if dirty tracking is disabled, i.e. num_regions == 0 */ + err = pds_vfio_dirty_status_cmd(pds_vfio, 0, &max_regions, + &num_regions); + if (err < 0) { + dev_err(&pdev->dev, "Failed to get dirty status, err %pe\n", + ERR_PTR(err)); + return err; + } else if (num_regions) { + dev_err(&pdev->dev, + "Dirty tracking already enabled for %d regions\n", + num_regions); + return -EEXIST; + } else if (!max_regions) { + dev_err(&pdev->dev, + "Device doesn't support dirty tracking, max_regions %d\n", + max_regions); + return -EOPNOTSUPP; + } + + /* + * Only support 1 region for now. If there are any large gaps in the + * VM's address regions, then this would be a waste of memory as we are + * generating 2 bitmaps (ack/seq) from the min address to the max + * address of the VM's address regions. In the future, if we support + * more than one region in the device/driver we can split the bitmaps + * on the largest address region gaps. We can do this split up to the + * max_regions times returned from the dirty_status command. + */ + max_regions = 1; + if (num_ranges > max_regions) { + vfio_combine_iova_ranges(ranges, nnodes, max_regions); + num_ranges = max_regions; + } + + node = interval_tree_iter_first(ranges, 0, ULONG_MAX); + if (!node) + return -EINVAL; + + region_size = node->last - node->start + 1; + region_start = node->start; + region_page_size = *page_size; + + len = sizeof(*region_info); + region_info = kzalloc(len, GFP_KERNEL); + if (!region_info) + return -ENOMEM; + + page_count = DIV_ROUND_UP(region_size, region_page_size); + + region_info->dma_base = cpu_to_le64(region_start); + region_info->page_count = cpu_to_le32(page_count); + region_info->page_size_log2 = ilog2(region_page_size); + + regions_dma = dma_map_single(pdsc_dev, (void *)region_info, len, + DMA_BIDIRECTIONAL); + if (dma_mapping_error(pdsc_dev, regions_dma)) { + err = -ENOMEM; + goto out_free_region_info; + } + + err = pds_vfio_dirty_enable_cmd(pds_vfio, regions_dma, max_regions); + dma_unmap_single(pdsc_dev, regions_dma, len, DMA_BIDIRECTIONAL); + if (err) + goto out_free_region_info; + + /* + * page_count might be adjusted by the device, + * update it before freeing region_info DMA + */ + page_count = le32_to_cpu(region_info->page_count); + + dev_dbg(&pdev->dev, + "region_info: regions_dma 0x%llx dma_base 0x%llx page_count %u page_size_log2 %u\n", + regions_dma, region_start, page_count, + (u8)ilog2(region_page_size)); + + err = pds_vfio_dirty_alloc_bitmaps(dirty, page_count / BITS_PER_BYTE); + if (err) { + dev_err(&pdev->dev, "Failed to alloc dirty bitmaps: %pe\n", + ERR_PTR(err)); + goto out_free_region_info; + } + + err = pds_vfio_dirty_alloc_sgl(pds_vfio, page_count); + if (err) { + dev_err(&pdev->dev, "Failed to alloc dirty sg lists: %pe\n", + ERR_PTR(err)); + goto out_free_bitmaps; + } + + dirty->region_start = region_start; + dirty->region_size = region_size; + dirty->region_page_size = region_page_size; + pds_vfio_dirty_set_enabled(pds_vfio); + + pds_vfio_print_guest_region_info(pds_vfio, max_regions); + + kfree(region_info); + + return 0; + +out_free_bitmaps: + pds_vfio_dirty_free_bitmaps(dirty); +out_free_region_info: + kfree(region_info); + return err; +} + +void pds_vfio_dirty_disable(struct pds_vfio_pci_device *pds_vfio, bool send_cmd) +{ + if (pds_vfio_dirty_is_enabled(pds_vfio)) { + pds_vfio_dirty_set_disabled(pds_vfio); + if (send_cmd) + pds_vfio_dirty_disable_cmd(pds_vfio); + pds_vfio_dirty_free_sgl(pds_vfio); + pds_vfio_dirty_free_bitmaps(&pds_vfio->dirty); + } + + if (send_cmd) + pds_vfio_send_host_vf_lm_status_cmd(pds_vfio, PDS_LM_STA_NONE); +} + +static int pds_vfio_dirty_seq_ack(struct pds_vfio_pci_device *pds_vfio, + struct pds_vfio_bmp_info *bmp_info, + u32 offset, u32 bmp_bytes, bool read_seq) +{ + const char *bmp_type_str = read_seq ? "read_seq" : "write_ack"; + u8 dma_dir = read_seq ? DMA_FROM_DEVICE : DMA_TO_DEVICE; + struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev; + struct device *pdsc_dev = &pci_physfn(pdev)->dev; + unsigned long long npages; + struct sg_table sg_table; + struct scatterlist *sg; + struct page **pages; + u32 page_offset; + const void *bmp; + size_t size; + u16 num_sge; + int err; + int i; + + bmp = (void *)((u64)bmp_info->bmp + offset); + page_offset = offset_in_page(bmp); + bmp -= page_offset; + + /* + * Start and end of bitmap section to seq/ack might not be page + * aligned, so use the page_offset to account for that so there + * will be enough pages to represent the bmp_bytes + */ + npages = DIV_ROUND_UP_ULL(bmp_bytes + page_offset, PAGE_SIZE); + pages = kmalloc_array(npages, sizeof(*pages), GFP_KERNEL); + if (!pages) + return -ENOMEM; + + for (unsigned long long i = 0; i < npages; i++) { + struct page *page = vmalloc_to_page(bmp); + + if (!page) { + err = -EFAULT; + goto out_free_pages; + } + + pages[i] = page; + bmp += PAGE_SIZE; + } + + err = sg_alloc_table_from_pages(&sg_table, pages, npages, page_offset, + bmp_bytes, GFP_KERNEL); + if (err) + goto out_free_pages; + + err = dma_map_sgtable(pdsc_dev, &sg_table, dma_dir, 0); + if (err) + goto out_free_sg_table; + + for_each_sgtable_dma_sg(&sg_table, sg, i) { + struct pds_lm_sg_elem *sg_elem = &bmp_info->sgl[i]; + + sg_elem->addr = cpu_to_le64(sg_dma_address(sg)); + sg_elem->len = cpu_to_le32(sg_dma_len(sg)); + } + + num_sge = sg_table.nents; + size = num_sge * sizeof(struct pds_lm_sg_elem); + dma_sync_single_for_device(pdsc_dev, bmp_info->sgl_addr, size, dma_dir); + err = pds_vfio_dirty_seq_ack_cmd(pds_vfio, bmp_info->sgl_addr, num_sge, + offset, bmp_bytes, read_seq); + if (err) + dev_err(&pdev->dev, + "Dirty bitmap %s failed offset %u bmp_bytes %u num_sge %u DMA 0x%llx: %pe\n", + bmp_type_str, offset, bmp_bytes, + num_sge, bmp_info->sgl_addr, ERR_PTR(err)); + dma_sync_single_for_cpu(pdsc_dev, bmp_info->sgl_addr, size, dma_dir); + + dma_unmap_sgtable(pdsc_dev, &sg_table, dma_dir, 0); +out_free_sg_table: + sg_free_table(&sg_table); +out_free_pages: + kfree(pages); + + return err; +} + +static int pds_vfio_dirty_write_ack(struct pds_vfio_pci_device *pds_vfio, + u32 offset, u32 len) +{ + return pds_vfio_dirty_seq_ack(pds_vfio, &pds_vfio->dirty.host_ack, + offset, len, WRITE_ACK); +} + +static int pds_vfio_dirty_read_seq(struct pds_vfio_pci_device *pds_vfio, + u32 offset, u32 len) +{ + return pds_vfio_dirty_seq_ack(pds_vfio, &pds_vfio->dirty.host_seq, + offset, len, READ_SEQ); +} + +static int pds_vfio_dirty_process_bitmaps(struct pds_vfio_pci_device *pds_vfio, + struct iova_bitmap *dirty_bitmap, + u32 bmp_offset, u32 len_bytes) +{ + u64 page_size = pds_vfio->dirty.region_page_size; + u64 region_start = pds_vfio->dirty.region_start; + u32 bmp_offset_bit; + __le64 *seq, *ack; + int dword_count; + + dword_count = len_bytes / sizeof(u64); + seq = (__le64 *)((u64)pds_vfio->dirty.host_seq.bmp + bmp_offset); + ack = (__le64 *)((u64)pds_vfio->dirty.host_ack.bmp + bmp_offset); + bmp_offset_bit = bmp_offset * 8; + + for (int i = 0; i < dword_count; i++) { + u64 xor = le64_to_cpu(seq[i]) ^ le64_to_cpu(ack[i]); + + /* prepare for next write_ack call */ + ack[i] = seq[i]; + + for (u8 bit_i = 0; bit_i < BITS_PER_TYPE(u64); ++bit_i) { + if (xor & BIT(bit_i)) { + u64 abs_bit_i = bmp_offset_bit + + i * BITS_PER_TYPE(u64) + bit_i; + u64 addr = abs_bit_i * page_size + region_start; + + iova_bitmap_set(dirty_bitmap, addr, page_size); + } + } + } + + return 0; +} + +static int pds_vfio_dirty_sync(struct pds_vfio_pci_device *pds_vfio, + struct iova_bitmap *dirty_bitmap, + unsigned long iova, unsigned long length) +{ + struct device *dev = &pds_vfio->vfio_coredev.pdev->dev; + struct pds_vfio_dirty *dirty = &pds_vfio->dirty; + u64 bmp_offset, bmp_bytes; + u64 bitmap_size, pages; + int err; + + dev_dbg(dev, "vf%u: Get dirty page bitmap\n", pds_vfio->vf_id); + + if (!pds_vfio_dirty_is_enabled(pds_vfio)) { + dev_err(dev, "vf%u: Sync failed, dirty tracking is disabled\n", + pds_vfio->vf_id); + return -EINVAL; + } + + pages = DIV_ROUND_UP(length, pds_vfio->dirty.region_page_size); + bitmap_size = + round_up(pages, sizeof(u64) * BITS_PER_BYTE) / BITS_PER_BYTE; + + dev_dbg(dev, + "vf%u: iova 0x%lx length %lu page_size %llu pages %llu bitmap_size %llu\n", + pds_vfio->vf_id, iova, length, pds_vfio->dirty.region_page_size, + pages, bitmap_size); + + if (!length || ((dirty->region_start + iova + length) > + (dirty->region_start + dirty->region_size))) { + dev_err(dev, "Invalid iova 0x%lx and/or length 0x%lx to sync\n", + iova, length); + return -EINVAL; + } + + /* bitmap is modified in 64 bit chunks */ + bmp_bytes = ALIGN(DIV_ROUND_UP(length / dirty->region_page_size, + sizeof(u64)), + sizeof(u64)); + if (bmp_bytes != bitmap_size) { + dev_err(dev, + "Calculated bitmap bytes %llu not equal to bitmap size %llu\n", + bmp_bytes, bitmap_size); + return -EINVAL; + } + + bmp_offset = DIV_ROUND_UP(iova / dirty->region_page_size, sizeof(u64)); + + dev_dbg(dev, + "Syncing dirty bitmap, iova 0x%lx length 0x%lx, bmp_offset %llu bmp_bytes %llu\n", + iova, length, bmp_offset, bmp_bytes); + + err = pds_vfio_dirty_read_seq(pds_vfio, bmp_offset, bmp_bytes); + if (err) + return err; + + err = pds_vfio_dirty_process_bitmaps(pds_vfio, dirty_bitmap, bmp_offset, + bmp_bytes); + if (err) + return err; + + err = pds_vfio_dirty_write_ack(pds_vfio, bmp_offset, bmp_bytes); + if (err) + return err; + + return 0; +} + +int pds_vfio_dma_logging_report(struct vfio_device *vdev, unsigned long iova, + unsigned long length, struct iova_bitmap *dirty) +{ + struct pds_vfio_pci_device *pds_vfio = + container_of(vdev, struct pds_vfio_pci_device, + vfio_coredev.vdev); + int err; + + mutex_lock(&pds_vfio->state_mutex); + err = pds_vfio_dirty_sync(pds_vfio, dirty, iova, length); + pds_vfio_state_mutex_unlock(pds_vfio); + + return err; +} + +int pds_vfio_dma_logging_start(struct vfio_device *vdev, + struct rb_root_cached *ranges, u32 nnodes, + u64 *page_size) +{ + struct pds_vfio_pci_device *pds_vfio = + container_of(vdev, struct pds_vfio_pci_device, + vfio_coredev.vdev); + int err; + + mutex_lock(&pds_vfio->state_mutex); + pds_vfio_send_host_vf_lm_status_cmd(pds_vfio, PDS_LM_STA_IN_PROGRESS); + err = pds_vfio_dirty_enable(pds_vfio, ranges, nnodes, page_size); + pds_vfio_state_mutex_unlock(pds_vfio); + + return err; +} + +int pds_vfio_dma_logging_stop(struct vfio_device *vdev) +{ + struct pds_vfio_pci_device *pds_vfio = + container_of(vdev, struct pds_vfio_pci_device, + vfio_coredev.vdev); + + mutex_lock(&pds_vfio->state_mutex); + pds_vfio_dirty_disable(pds_vfio, true); + pds_vfio_state_mutex_unlock(pds_vfio); + + return 0; +} diff --git a/drivers/vfio/pci/pds/dirty.h b/drivers/vfio/pci/pds/dirty.h new file mode 100644 index 000000000000..f78da25d75ca --- /dev/null +++ b/drivers/vfio/pci/pds/dirty.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright(c) 2023 Advanced Micro Devices, Inc. */ + +#ifndef _DIRTY_H_ +#define _DIRTY_H_ + +struct pds_vfio_bmp_info { + unsigned long *bmp; + u32 bmp_bytes; + struct pds_lm_sg_elem *sgl; + dma_addr_t sgl_addr; + u16 num_sge; +}; + +struct pds_vfio_dirty { + struct pds_vfio_bmp_info host_seq; + struct pds_vfio_bmp_info host_ack; + u64 region_size; + u64 region_start; + u64 region_page_size; + bool is_enabled; +}; + +struct pds_vfio_pci_device; + +bool pds_vfio_dirty_is_enabled(struct pds_vfio_pci_device *pds_vfio); +void pds_vfio_dirty_set_enabled(struct pds_vfio_pci_device *pds_vfio); +void pds_vfio_dirty_set_disabled(struct pds_vfio_pci_device *pds_vfio); +void pds_vfio_dirty_disable(struct pds_vfio_pci_device *pds_vfio, + bool send_cmd); + +int pds_vfio_dma_logging_report(struct vfio_device *vdev, unsigned long iova, + unsigned long length, + struct iova_bitmap *dirty); +int pds_vfio_dma_logging_start(struct vfio_device *vdev, + struct rb_root_cached *ranges, u32 nnodes, + u64 *page_size); +int pds_vfio_dma_logging_stop(struct vfio_device *vdev); +#endif /* _DIRTY_H_ */ diff --git a/drivers/vfio/pci/pds/lm.c b/drivers/vfio/pci/pds/lm.c new file mode 100644 index 000000000000..79fe2e66bb49 --- /dev/null +++ b/drivers/vfio/pci/pds/lm.c @@ -0,0 +1,434 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2023 Advanced Micro Devices, Inc. */ + +#include <linux/anon_inodes.h> +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/highmem.h> +#include <linux/vfio.h> +#include <linux/vfio_pci_core.h> + +#include "vfio_dev.h" +#include "cmds.h" + +static struct pds_vfio_lm_file * +pds_vfio_get_lm_file(const struct file_operations *fops, int flags, u64 size) +{ + struct pds_vfio_lm_file *lm_file = NULL; + unsigned long long npages; + struct page **pages; + void *page_mem; + const void *p; + + if (!size) + return NULL; + + /* Alloc file structure */ + lm_file = kzalloc(sizeof(*lm_file), GFP_KERNEL); + if (!lm_file) + return NULL; + + /* Create file */ + lm_file->filep = + anon_inode_getfile("pds_vfio_lm", fops, lm_file, flags); + if (IS_ERR(lm_file->filep)) + goto out_free_file; + + stream_open(lm_file->filep->f_inode, lm_file->filep); + mutex_init(&lm_file->lock); + + /* prevent file from being released before we are done with it */ + get_file(lm_file->filep); + + /* Allocate memory for file pages */ + npages = DIV_ROUND_UP_ULL(size, PAGE_SIZE); + pages = kmalloc_array(npages, sizeof(*pages), GFP_KERNEL); + if (!pages) + goto out_put_file; + + page_mem = kvzalloc(ALIGN(size, PAGE_SIZE), GFP_KERNEL); + if (!page_mem) + goto out_free_pages_array; + + p = page_mem - offset_in_page(page_mem); + for (unsigned long long i = 0; i < npages; i++) { + if (is_vmalloc_addr(p)) + pages[i] = vmalloc_to_page(p); + else + pages[i] = kmap_to_page((void *)p); + if (!pages[i]) + goto out_free_page_mem; + + p += PAGE_SIZE; + } + + /* Create scatterlist of file pages to use for DMA mapping later */ + if (sg_alloc_table_from_pages(&lm_file->sg_table, pages, npages, 0, + size, GFP_KERNEL)) + goto out_free_page_mem; + + lm_file->size = size; + lm_file->pages = pages; + lm_file->npages = npages; + lm_file->page_mem = page_mem; + lm_file->alloc_size = npages * PAGE_SIZE; + + return lm_file; + +out_free_page_mem: + kvfree(page_mem); +out_free_pages_array: + kfree(pages); +out_put_file: + fput(lm_file->filep); + mutex_destroy(&lm_file->lock); +out_free_file: + kfree(lm_file); + + return NULL; +} + +static void pds_vfio_put_lm_file(struct pds_vfio_lm_file *lm_file) +{ + mutex_lock(&lm_file->lock); + + lm_file->size = 0; + lm_file->alloc_size = 0; + + /* Free scatter list of file pages */ + sg_free_table(&lm_file->sg_table); + + kvfree(lm_file->page_mem); + lm_file->page_mem = NULL; + kfree(lm_file->pages); + lm_file->pages = NULL; + + mutex_unlock(&lm_file->lock); + + /* allow file to be released since we are done with it */ + fput(lm_file->filep); +} + +void pds_vfio_put_save_file(struct pds_vfio_pci_device *pds_vfio) +{ + if (!pds_vfio->save_file) + return; + + pds_vfio_put_lm_file(pds_vfio->save_file); + pds_vfio->save_file = NULL; +} + +void pds_vfio_put_restore_file(struct pds_vfio_pci_device *pds_vfio) +{ + if (!pds_vfio->restore_file) + return; + + pds_vfio_put_lm_file(pds_vfio->restore_file); + pds_vfio->restore_file = NULL; +} + +static struct page *pds_vfio_get_file_page(struct pds_vfio_lm_file *lm_file, + unsigned long offset) +{ + unsigned long cur_offset = 0; + struct scatterlist *sg; + unsigned int i; + + /* All accesses are sequential */ + if (offset < lm_file->last_offset || !lm_file->last_offset_sg) { + lm_file->last_offset = 0; + lm_file->last_offset_sg = lm_file->sg_table.sgl; + lm_file->sg_last_entry = 0; + } + + cur_offset = lm_file->last_offset; + + for_each_sg(lm_file->last_offset_sg, sg, + lm_file->sg_table.orig_nents - lm_file->sg_last_entry, i) { + if (offset < sg->length + cur_offset) { + lm_file->last_offset_sg = sg; + lm_file->sg_last_entry += i; + lm_file->last_offset = cur_offset; + return nth_page(sg_page(sg), + (offset - cur_offset) / PAGE_SIZE); + } + cur_offset += sg->length; + } + + return NULL; +} + +static int pds_vfio_release_file(struct inode *inode, struct file *filp) +{ + struct pds_vfio_lm_file *lm_file = filp->private_data; + + mutex_lock(&lm_file->lock); + lm_file->filep->f_pos = 0; + lm_file->size = 0; + mutex_unlock(&lm_file->lock); + mutex_destroy(&lm_file->lock); + kfree(lm_file); + + return 0; +} + +static ssize_t pds_vfio_save_read(struct file *filp, char __user *buf, + size_t len, loff_t *pos) +{ + struct pds_vfio_lm_file *lm_file = filp->private_data; + ssize_t done = 0; + + if (pos) + return -ESPIPE; + pos = &filp->f_pos; + + mutex_lock(&lm_file->lock); + if (*pos > lm_file->size) { + done = -EINVAL; + goto out_unlock; + } + + len = min_t(size_t, lm_file->size - *pos, len); + while (len) { + size_t page_offset; + struct page *page; + size_t page_len; + u8 *from_buff; + int err; + + page_offset = (*pos) % PAGE_SIZE; + page = pds_vfio_get_file_page(lm_file, *pos - page_offset); + if (!page) { + if (done == 0) + done = -EINVAL; + goto out_unlock; + } + + page_len = min_t(size_t, len, PAGE_SIZE - page_offset); + from_buff = kmap_local_page(page); + err = copy_to_user(buf, from_buff + page_offset, page_len); + kunmap_local(from_buff); + if (err) { + done = -EFAULT; + goto out_unlock; + } + *pos += page_len; + len -= page_len; + done += page_len; + buf += page_len; + } + +out_unlock: + mutex_unlock(&lm_file->lock); + return done; +} + +static const struct file_operations pds_vfio_save_fops = { + .owner = THIS_MODULE, + .read = pds_vfio_save_read, + .release = pds_vfio_release_file, + .llseek = no_llseek, +}; + +static int pds_vfio_get_save_file(struct pds_vfio_pci_device *pds_vfio) +{ + struct device *dev = &pds_vfio->vfio_coredev.pdev->dev; + struct pds_vfio_lm_file *lm_file; + u64 size; + int err; + + /* Get live migration state size in this state */ + err = pds_vfio_get_lm_state_size_cmd(pds_vfio, &size); + if (err) { + dev_err(dev, "failed to get save status: %pe\n", ERR_PTR(err)); + return err; + } + + dev_dbg(dev, "save status, size = %lld\n", size); + + if (!size) { + dev_err(dev, "invalid state size\n"); + return -EIO; + } + + lm_file = pds_vfio_get_lm_file(&pds_vfio_save_fops, O_RDONLY, size); + if (!lm_file) { + dev_err(dev, "failed to create save file\n"); + return -ENOENT; + } + + dev_dbg(dev, "size = %lld, alloc_size = %lld, npages = %lld\n", + lm_file->size, lm_file->alloc_size, lm_file->npages); + + pds_vfio->save_file = lm_file; + + return 0; +} + +static ssize_t pds_vfio_restore_write(struct file *filp, const char __user *buf, + size_t len, loff_t *pos) +{ + struct pds_vfio_lm_file *lm_file = filp->private_data; + loff_t requested_length; + ssize_t done = 0; + + if (pos) + return -ESPIPE; + + pos = &filp->f_pos; + + if (*pos < 0 || + check_add_overflow((loff_t)len, *pos, &requested_length)) + return -EINVAL; + + mutex_lock(&lm_file->lock); + + while (len) { + size_t page_offset; + struct page *page; + size_t page_len; + u8 *to_buff; + int err; + + page_offset = (*pos) % PAGE_SIZE; + page = pds_vfio_get_file_page(lm_file, *pos - page_offset); + if (!page) { + if (done == 0) + done = -EINVAL; + goto out_unlock; + } + + page_len = min_t(size_t, len, PAGE_SIZE - page_offset); + to_buff = kmap_local_page(page); + err = copy_from_user(to_buff + page_offset, buf, page_len); + kunmap_local(to_buff); + if (err) { + done = -EFAULT; + goto out_unlock; + } + *pos += page_len; + len -= page_len; + done += page_len; + buf += page_len; + lm_file->size += page_len; + } +out_unlock: + mutex_unlock(&lm_file->lock); + return done; +} + +static const struct file_operations pds_vfio_restore_fops = { + .owner = THIS_MODULE, + .write = pds_vfio_restore_write, + .release = pds_vfio_release_file, + .llseek = no_llseek, +}; + +static int pds_vfio_get_restore_file(struct pds_vfio_pci_device *pds_vfio) +{ + struct device *dev = &pds_vfio->vfio_coredev.pdev->dev; + struct pds_vfio_lm_file *lm_file; + u64 size; + + size = sizeof(union pds_lm_dev_state); + dev_dbg(dev, "restore status, size = %lld\n", size); + + if (!size) { + dev_err(dev, "invalid state size"); + return -EIO; + } + + lm_file = pds_vfio_get_lm_file(&pds_vfio_restore_fops, O_WRONLY, size); + if (!lm_file) { + dev_err(dev, "failed to create restore file"); + return -ENOENT; + } + pds_vfio->restore_file = lm_file; + + return 0; +} + +struct file * +pds_vfio_step_device_state_locked(struct pds_vfio_pci_device *pds_vfio, + enum vfio_device_mig_state next) +{ + enum vfio_device_mig_state cur = pds_vfio->state; + int err; + + if (cur == VFIO_DEVICE_STATE_STOP && next == VFIO_DEVICE_STATE_STOP_COPY) { + err = pds_vfio_get_save_file(pds_vfio); + if (err) + return ERR_PTR(err); + + err = pds_vfio_get_lm_state_cmd(pds_vfio); + if (err) { + pds_vfio_put_save_file(pds_vfio); + return ERR_PTR(err); + } + + return pds_vfio->save_file->filep; + } + + if (cur == VFIO_DEVICE_STATE_STOP_COPY && next == VFIO_DEVICE_STATE_STOP) { + pds_vfio_put_save_file(pds_vfio); + pds_vfio_dirty_disable(pds_vfio, true); + return NULL; + } + + if (cur == VFIO_DEVICE_STATE_STOP && next == VFIO_DEVICE_STATE_RESUMING) { + err = pds_vfio_get_restore_file(pds_vfio); + if (err) + return ERR_PTR(err); + + return pds_vfio->restore_file->filep; + } + + if (cur == VFIO_DEVICE_STATE_RESUMING && next == VFIO_DEVICE_STATE_STOP) { + err = pds_vfio_set_lm_state_cmd(pds_vfio); + if (err) + return ERR_PTR(err); + + pds_vfio_put_restore_file(pds_vfio); + return NULL; + } + + if (cur == VFIO_DEVICE_STATE_RUNNING && next == VFIO_DEVICE_STATE_RUNNING_P2P) { + pds_vfio_send_host_vf_lm_status_cmd(pds_vfio, + PDS_LM_STA_IN_PROGRESS); + err = pds_vfio_suspend_device_cmd(pds_vfio, + PDS_LM_SUSPEND_RESUME_TYPE_P2P); + if (err) + return ERR_PTR(err); + + return NULL; + } + + if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && next == VFIO_DEVICE_STATE_RUNNING) { + err = pds_vfio_resume_device_cmd(pds_vfio, + PDS_LM_SUSPEND_RESUME_TYPE_FULL); + if (err) + return ERR_PTR(err); + + pds_vfio_send_host_vf_lm_status_cmd(pds_vfio, PDS_LM_STA_NONE); + return NULL; + } + + if (cur == VFIO_DEVICE_STATE_STOP && next == VFIO_DEVICE_STATE_RUNNING_P2P) { + err = pds_vfio_resume_device_cmd(pds_vfio, + PDS_LM_SUSPEND_RESUME_TYPE_P2P); + if (err) + return ERR_PTR(err); + + return NULL; + } + + if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && next == VFIO_DEVICE_STATE_STOP) { + err = pds_vfio_suspend_device_cmd(pds_vfio, + PDS_LM_SUSPEND_RESUME_TYPE_FULL); + if (err) + return ERR_PTR(err); + return NULL; + } + + return ERR_PTR(-EINVAL); +} diff --git a/drivers/vfio/pci/pds/lm.h b/drivers/vfio/pci/pds/lm.h new file mode 100644 index 000000000000..13be893198b7 --- /dev/null +++ b/drivers/vfio/pci/pds/lm.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright(c) 2023 Advanced Micro Devices, Inc. */ + +#ifndef _LM_H_ +#define _LM_H_ + +#include <linux/fs.h> +#include <linux/mutex.h> +#include <linux/scatterlist.h> +#include <linux/types.h> + +#include <linux/pds/pds_common.h> +#include <linux/pds/pds_adminq.h> + +struct pds_vfio_lm_file { + struct file *filep; + struct mutex lock; /* protect live migration data file */ + u64 size; /* Size with valid data */ + u64 alloc_size; /* Total allocated size. Always >= len */ + void *page_mem; /* memory allocated for pages */ + struct page **pages; /* Backing pages for file */ + unsigned long long npages; + struct sg_table sg_table; /* SG table for backing pages */ + struct pds_lm_sg_elem *sgl; /* DMA mapping */ + dma_addr_t sgl_addr; + u16 num_sge; + struct scatterlist *last_offset_sg; /* Iterator */ + unsigned int sg_last_entry; + unsigned long last_offset; +}; + +struct pds_vfio_pci_device; + +struct file * +pds_vfio_step_device_state_locked(struct pds_vfio_pci_device *pds_vfio, + enum vfio_device_mig_state next); + +void pds_vfio_put_save_file(struct pds_vfio_pci_device *pds_vfio); +void pds_vfio_put_restore_file(struct pds_vfio_pci_device *pds_vfio); + +#endif /* _LM_H_ */ diff --git a/drivers/vfio/pci/pds/pci_drv.c b/drivers/vfio/pci/pds/pci_drv.c new file mode 100644 index 000000000000..ab4b5958e413 --- /dev/null +++ b/drivers/vfio/pci/pds/pci_drv.c @@ -0,0 +1,209 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2023 Advanced Micro Devices, Inc. */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/pci.h> +#include <linux/types.h> +#include <linux/vfio.h> + +#include <linux/pds/pds_common.h> +#include <linux/pds/pds_core_if.h> +#include <linux/pds/pds_adminq.h> + +#include "vfio_dev.h" +#include "pci_drv.h" +#include "cmds.h" + +#define PDS_VFIO_DRV_DESCRIPTION "AMD/Pensando VFIO Device Driver" +#define PCI_VENDOR_ID_PENSANDO 0x1dd8 + +static void pds_vfio_recovery(struct pds_vfio_pci_device *pds_vfio) +{ + bool deferred_reset_needed = false; + + /* + * Documentation states that the kernel migration driver must not + * generate asynchronous device state transitions outside of + * manipulation by the user or the VFIO_DEVICE_RESET ioctl. + * + * Since recovery is an asynchronous event received from the device, + * initiate a deferred reset. Issue a deferred reset in the following + * situations: + * 1. Migration is in progress, which will cause the next step of + * the migration to fail. + * 2. If the device is in a state that will be set to + * VFIO_DEVICE_STATE_RUNNING on the next action (i.e. VM is + * shutdown and device is in VFIO_DEVICE_STATE_STOP). + */ + mutex_lock(&pds_vfio->state_mutex); + if ((pds_vfio->state != VFIO_DEVICE_STATE_RUNNING && + pds_vfio->state != VFIO_DEVICE_STATE_ERROR) || + (pds_vfio->state == VFIO_DEVICE_STATE_RUNNING && + pds_vfio_dirty_is_enabled(pds_vfio))) + deferred_reset_needed = true; + mutex_unlock(&pds_vfio->state_mutex); + + /* + * On the next user initiated state transition, the device will + * transition to the VFIO_DEVICE_STATE_ERROR. At this point it's the user's + * responsibility to reset the device. + * + * If a VFIO_DEVICE_RESET is requested post recovery and before the next + * state transition, then the deferred reset state will be set to + * VFIO_DEVICE_STATE_RUNNING. + */ + if (deferred_reset_needed) { + spin_lock(&pds_vfio->reset_lock); + pds_vfio->deferred_reset = true; + pds_vfio->deferred_reset_state = VFIO_DEVICE_STATE_ERROR; + spin_unlock(&pds_vfio->reset_lock); + } +} + +static int pds_vfio_pci_notify_handler(struct notifier_block *nb, + unsigned long ecode, void *data) +{ + struct pds_vfio_pci_device *pds_vfio = + container_of(nb, struct pds_vfio_pci_device, nb); + struct device *dev = pds_vfio_to_dev(pds_vfio); + union pds_core_notifyq_comp *event = data; + + dev_dbg(dev, "%s: event code %lu\n", __func__, ecode); + + /* + * We don't need to do anything for RESET state==0 as there is no notify + * or feedback mechanism available, and it is possible that we won't + * even see a state==0 event since the pds_core recovery is pending. + * + * Any requests from VFIO while state==0 will fail, which will return + * error and may cause migration to fail. + */ + if (ecode == PDS_EVENT_RESET) { + dev_info(dev, "%s: PDS_EVENT_RESET event received, state==%d\n", + __func__, event->reset.state); + /* + * pds_core device finished recovery and sent us the + * notification (state == 1) to allow us to recover + */ + if (event->reset.state == 1) + pds_vfio_recovery(pds_vfio); + } + + return 0; +} + +static int +pds_vfio_pci_register_event_handler(struct pds_vfio_pci_device *pds_vfio) +{ + struct device *dev = pds_vfio_to_dev(pds_vfio); + struct notifier_block *nb = &pds_vfio->nb; + int err; + + if (!nb->notifier_call) { + nb->notifier_call = pds_vfio_pci_notify_handler; + err = pdsc_register_notify(nb); + if (err) { + nb->notifier_call = NULL; + dev_err(dev, + "failed to register pds event handler: %pe\n", + ERR_PTR(err)); + return -EINVAL; + } + dev_dbg(dev, "pds event handler registered\n"); + } + + return 0; +} + +static void +pds_vfio_pci_unregister_event_handler(struct pds_vfio_pci_device *pds_vfio) +{ + if (pds_vfio->nb.notifier_call) { + pdsc_unregister_notify(&pds_vfio->nb); + pds_vfio->nb.notifier_call = NULL; + } +} + +static int pds_vfio_pci_probe(struct pci_dev *pdev, + const struct pci_device_id *id) +{ + struct pds_vfio_pci_device *pds_vfio; + int err; + + pds_vfio = vfio_alloc_device(pds_vfio_pci_device, vfio_coredev.vdev, + &pdev->dev, pds_vfio_ops_info()); + if (IS_ERR(pds_vfio)) + return PTR_ERR(pds_vfio); + + dev_set_drvdata(&pdev->dev, &pds_vfio->vfio_coredev); + + err = vfio_pci_core_register_device(&pds_vfio->vfio_coredev); + if (err) + goto out_put_vdev; + + err = pds_vfio_register_client_cmd(pds_vfio); + if (err) { + dev_err(&pdev->dev, "failed to register as client: %pe\n", + ERR_PTR(err)); + goto out_unregister_coredev; + } + + err = pds_vfio_pci_register_event_handler(pds_vfio); + if (err) + goto out_unregister_client; + + return 0; + +out_unregister_client: + pds_vfio_unregister_client_cmd(pds_vfio); +out_unregister_coredev: + vfio_pci_core_unregister_device(&pds_vfio->vfio_coredev); +out_put_vdev: + vfio_put_device(&pds_vfio->vfio_coredev.vdev); + return err; +} + +static void pds_vfio_pci_remove(struct pci_dev *pdev) +{ + struct pds_vfio_pci_device *pds_vfio = pds_vfio_pci_drvdata(pdev); + + pds_vfio_pci_unregister_event_handler(pds_vfio); + pds_vfio_unregister_client_cmd(pds_vfio); + vfio_pci_core_unregister_device(&pds_vfio->vfio_coredev); + vfio_put_device(&pds_vfio->vfio_coredev.vdev); +} + +static const struct pci_device_id pds_vfio_pci_table[] = { + { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_PENSANDO, 0x1003) }, /* Ethernet VF */ + { 0, } +}; +MODULE_DEVICE_TABLE(pci, pds_vfio_pci_table); + +static void pds_vfio_pci_aer_reset_done(struct pci_dev *pdev) +{ + struct pds_vfio_pci_device *pds_vfio = pds_vfio_pci_drvdata(pdev); + + pds_vfio_reset(pds_vfio); +} + +static const struct pci_error_handlers pds_vfio_pci_err_handlers = { + .reset_done = pds_vfio_pci_aer_reset_done, + .error_detected = vfio_pci_core_aer_err_detected, +}; + +static struct pci_driver pds_vfio_pci_driver = { + .name = KBUILD_MODNAME, + .id_table = pds_vfio_pci_table, + .probe = pds_vfio_pci_probe, + .remove = pds_vfio_pci_remove, + .err_handler = &pds_vfio_pci_err_handlers, + .driver_managed_dma = true, +}; + +module_pci_driver(pds_vfio_pci_driver); + +MODULE_DESCRIPTION(PDS_VFIO_DRV_DESCRIPTION); +MODULE_AUTHOR("Brett Creeley <brett.creeley@amd.com>"); +MODULE_LICENSE("GPL"); diff --git a/drivers/vfio/pci/pds/pci_drv.h b/drivers/vfio/pci/pds/pci_drv.h new file mode 100644 index 000000000000..e79bed12ed14 --- /dev/null +++ b/drivers/vfio/pci/pds/pci_drv.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright(c) 2023 Advanced Micro Devices, Inc. */ + +#ifndef _PCI_DRV_H +#define _PCI_DRV_H + +#include <linux/pci.h> + +#endif /* _PCI_DRV_H */ diff --git a/drivers/vfio/pci/pds/vfio_dev.c b/drivers/vfio/pci/pds/vfio_dev.c new file mode 100644 index 000000000000..b46174f5eb09 --- /dev/null +++ b/drivers/vfio/pci/pds/vfio_dev.c @@ -0,0 +1,227 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2023 Advanced Micro Devices, Inc. */ + +#include <linux/vfio.h> +#include <linux/vfio_pci_core.h> + +#include "lm.h" +#include "dirty.h" +#include "vfio_dev.h" + +struct pci_dev *pds_vfio_to_pci_dev(struct pds_vfio_pci_device *pds_vfio) +{ + return pds_vfio->vfio_coredev.pdev; +} + +struct device *pds_vfio_to_dev(struct pds_vfio_pci_device *pds_vfio) +{ + return &pds_vfio_to_pci_dev(pds_vfio)->dev; +} + +struct pds_vfio_pci_device *pds_vfio_pci_drvdata(struct pci_dev *pdev) +{ + struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev); + + return container_of(core_device, struct pds_vfio_pci_device, + vfio_coredev); +} + +void pds_vfio_state_mutex_unlock(struct pds_vfio_pci_device *pds_vfio) +{ +again: + spin_lock(&pds_vfio->reset_lock); + if (pds_vfio->deferred_reset) { + pds_vfio->deferred_reset = false; + if (pds_vfio->state == VFIO_DEVICE_STATE_ERROR) { + pds_vfio_put_restore_file(pds_vfio); + pds_vfio_put_save_file(pds_vfio); + pds_vfio_dirty_disable(pds_vfio, false); + } + pds_vfio->state = pds_vfio->deferred_reset_state; + pds_vfio->deferred_reset_state = VFIO_DEVICE_STATE_RUNNING; + spin_unlock(&pds_vfio->reset_lock); + goto again; + } + mutex_unlock(&pds_vfio->state_mutex); + spin_unlock(&pds_vfio->reset_lock); +} + +void pds_vfio_reset(struct pds_vfio_pci_device *pds_vfio) +{ + spin_lock(&pds_vfio->reset_lock); + pds_vfio->deferred_reset = true; + pds_vfio->deferred_reset_state = VFIO_DEVICE_STATE_RUNNING; + if (!mutex_trylock(&pds_vfio->state_mutex)) { + spin_unlock(&pds_vfio->reset_lock); + return; + } + spin_unlock(&pds_vfio->reset_lock); + pds_vfio_state_mutex_unlock(pds_vfio); +} + +static struct file * +pds_vfio_set_device_state(struct vfio_device *vdev, + enum vfio_device_mig_state new_state) +{ + struct pds_vfio_pci_device *pds_vfio = + container_of(vdev, struct pds_vfio_pci_device, + vfio_coredev.vdev); + struct file *res = NULL; + + mutex_lock(&pds_vfio->state_mutex); + /* + * only way to transition out of VFIO_DEVICE_STATE_ERROR is via + * VFIO_DEVICE_RESET, so prevent the state machine from running since + * vfio_mig_get_next_state() will throw a WARN_ON() when transitioning + * from VFIO_DEVICE_STATE_ERROR to any other state + */ + while (pds_vfio->state != VFIO_DEVICE_STATE_ERROR && + new_state != pds_vfio->state) { + enum vfio_device_mig_state next_state; + + int err = vfio_mig_get_next_state(vdev, pds_vfio->state, + new_state, &next_state); + if (err) { + res = ERR_PTR(err); + break; + } + + res = pds_vfio_step_device_state_locked(pds_vfio, next_state); + if (IS_ERR(res)) + break; + + pds_vfio->state = next_state; + + if (WARN_ON(res && new_state != pds_vfio->state)) { + res = ERR_PTR(-EINVAL); + break; + } + } + pds_vfio_state_mutex_unlock(pds_vfio); + /* still waiting on a deferred_reset */ + if (pds_vfio->state == VFIO_DEVICE_STATE_ERROR) + res = ERR_PTR(-EIO); + + return res; +} + +static int pds_vfio_get_device_state(struct vfio_device *vdev, + enum vfio_device_mig_state *current_state) +{ + struct pds_vfio_pci_device *pds_vfio = + container_of(vdev, struct pds_vfio_pci_device, + vfio_coredev.vdev); + + mutex_lock(&pds_vfio->state_mutex); + *current_state = pds_vfio->state; + pds_vfio_state_mutex_unlock(pds_vfio); + return 0; +} + +static int pds_vfio_get_device_state_size(struct vfio_device *vdev, + unsigned long *stop_copy_length) +{ + *stop_copy_length = PDS_LM_DEVICE_STATE_LENGTH; + return 0; +} + +static const struct vfio_migration_ops pds_vfio_lm_ops = { + .migration_set_state = pds_vfio_set_device_state, + .migration_get_state = pds_vfio_get_device_state, + .migration_get_data_size = pds_vfio_get_device_state_size +}; + +static const struct vfio_log_ops pds_vfio_log_ops = { + .log_start = pds_vfio_dma_logging_start, + .log_stop = pds_vfio_dma_logging_stop, + .log_read_and_clear = pds_vfio_dma_logging_report, +}; + +static int pds_vfio_init_device(struct vfio_device *vdev) +{ + struct pds_vfio_pci_device *pds_vfio = + container_of(vdev, struct pds_vfio_pci_device, + vfio_coredev.vdev); + struct pci_dev *pdev = to_pci_dev(vdev->dev); + int err, vf_id, pci_id; + + vf_id = pci_iov_vf_id(pdev); + if (vf_id < 0) + return vf_id; + + err = vfio_pci_core_init_dev(vdev); + if (err) + return err; + + pds_vfio->vf_id = vf_id; + + vdev->migration_flags = VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P; + vdev->mig_ops = &pds_vfio_lm_ops; + vdev->log_ops = &pds_vfio_log_ops; + + pci_id = PCI_DEVID(pdev->bus->number, pdev->devfn); + dev_dbg(&pdev->dev, + "%s: PF %#04x VF %#04x vf_id %d domain %d pds_vfio %p\n", + __func__, pci_dev_id(pdev->physfn), pci_id, vf_id, + pci_domain_nr(pdev->bus), pds_vfio); + + return 0; +} + +static int pds_vfio_open_device(struct vfio_device *vdev) +{ + struct pds_vfio_pci_device *pds_vfio = + container_of(vdev, struct pds_vfio_pci_device, + vfio_coredev.vdev); + int err; + + err = vfio_pci_core_enable(&pds_vfio->vfio_coredev); + if (err) + return err; + + mutex_init(&pds_vfio->state_mutex); + pds_vfio->state = VFIO_DEVICE_STATE_RUNNING; + pds_vfio->deferred_reset_state = VFIO_DEVICE_STATE_RUNNING; + + vfio_pci_core_finish_enable(&pds_vfio->vfio_coredev); + + return 0; +} + +static void pds_vfio_close_device(struct vfio_device *vdev) +{ + struct pds_vfio_pci_device *pds_vfio = + container_of(vdev, struct pds_vfio_pci_device, + vfio_coredev.vdev); + + mutex_lock(&pds_vfio->state_mutex); + pds_vfio_put_restore_file(pds_vfio); + pds_vfio_put_save_file(pds_vfio); + pds_vfio_dirty_disable(pds_vfio, true); + mutex_unlock(&pds_vfio->state_mutex); + mutex_destroy(&pds_vfio->state_mutex); + vfio_pci_core_close_device(vdev); +} + +static const struct vfio_device_ops pds_vfio_ops = { + .name = "pds-vfio", + .init = pds_vfio_init_device, + .release = vfio_pci_core_release_dev, + .open_device = pds_vfio_open_device, + .close_device = pds_vfio_close_device, + .ioctl = vfio_pci_core_ioctl, + .device_feature = vfio_pci_core_ioctl_feature, + .read = vfio_pci_core_read, + .write = vfio_pci_core_write, + .mmap = vfio_pci_core_mmap, + .request = vfio_pci_core_request, + .match = vfio_pci_core_match, + .bind_iommufd = vfio_iommufd_physical_bind, + .unbind_iommufd = vfio_iommufd_physical_unbind, + .attach_ioas = vfio_iommufd_physical_attach_ioas, +}; + +const struct vfio_device_ops *pds_vfio_ops_info(void) +{ + return &pds_vfio_ops; +} diff --git a/drivers/vfio/pci/pds/vfio_dev.h b/drivers/vfio/pci/pds/vfio_dev.h new file mode 100644 index 000000000000..b8f2d667608f --- /dev/null +++ b/drivers/vfio/pci/pds/vfio_dev.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright(c) 2023 Advanced Micro Devices, Inc. */ + +#ifndef _VFIO_DEV_H_ +#define _VFIO_DEV_H_ + +#include <linux/pci.h> +#include <linux/vfio_pci_core.h> + +#include "dirty.h" +#include "lm.h" + +struct pds_vfio_pci_device { + struct vfio_pci_core_device vfio_coredev; + + struct pds_vfio_lm_file *save_file; + struct pds_vfio_lm_file *restore_file; + struct pds_vfio_dirty dirty; + struct mutex state_mutex; /* protect migration state */ + enum vfio_device_mig_state state; + spinlock_t reset_lock; /* protect reset_done flow */ + u8 deferred_reset; + enum vfio_device_mig_state deferred_reset_state; + struct notifier_block nb; + + int vf_id; + u16 client_id; +}; + +void pds_vfio_state_mutex_unlock(struct pds_vfio_pci_device *pds_vfio); + +const struct vfio_device_ops *pds_vfio_ops_info(void); +struct pds_vfio_pci_device *pds_vfio_pci_drvdata(struct pci_dev *pdev); +void pds_vfio_reset(struct pds_vfio_pci_device *pds_vfio); + +struct pci_dev *pds_vfio_to_pci_dev(struct pds_vfio_pci_device *pds_vfio); +struct device *pds_vfio_to_dev(struct pds_vfio_pci_device *pds_vfio); + +#endif /* _VFIO_DEV_H_ */ diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 29091ee2e984..cb5b7f865d58 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -141,6 +141,7 @@ static const struct vfio_device_ops vfio_pci_ops = { .bind_iommufd = vfio_iommufd_physical_bind, .unbind_iommufd = vfio_iommufd_physical_unbind, .attach_ioas = vfio_iommufd_physical_attach_ioas, + .detach_ioas = vfio_iommufd_physical_detach_ioas, }; static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 20d7b69ea6ff..1929103ee59a 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -27,6 +27,7 @@ #include <linux/vgaarb.h> #include <linux/nospec.h> #include <linux/sched/mm.h> +#include <linux/iommufd.h> #if IS_ENABLED(CONFIG_EEH) #include <asm/eeh.h> #endif @@ -180,7 +181,8 @@ no_mmap: struct vfio_pci_group_info; static void vfio_pci_dev_set_try_reset(struct vfio_device_set *dev_set); static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, - struct vfio_pci_group_info *groups); + struct vfio_pci_group_info *groups, + struct iommufd_ctx *iommufd_ctx); /* * INTx masking requires the ability to disable INTx signaling via PCI_COMMAND @@ -776,29 +778,65 @@ static int vfio_pci_count_devs(struct pci_dev *pdev, void *data) } struct vfio_pci_fill_info { - int max; - int cur; - struct vfio_pci_dependent_device *devices; + struct vfio_pci_dependent_device __user *devices; + struct vfio_pci_dependent_device __user *devices_end; + struct vfio_device *vdev; + u32 count; + u32 flags; }; static int vfio_pci_fill_devs(struct pci_dev *pdev, void *data) { + struct vfio_pci_dependent_device info = { + .segment = pci_domain_nr(pdev->bus), + .bus = pdev->bus->number, + .devfn = pdev->devfn, + }; struct vfio_pci_fill_info *fill = data; - struct iommu_group *iommu_group; - if (fill->cur == fill->max) - return -EAGAIN; /* Something changed, try again */ + fill->count++; + if (fill->devices >= fill->devices_end) + return 0; + + if (fill->flags & VFIO_PCI_HOT_RESET_FLAG_DEV_ID) { + struct iommufd_ctx *iommufd = vfio_iommufd_device_ictx(fill->vdev); + struct vfio_device_set *dev_set = fill->vdev->dev_set; + struct vfio_device *vdev; + + /* + * hot-reset requires all affected devices be represented in + * the dev_set. + */ + vdev = vfio_find_device_in_devset(dev_set, &pdev->dev); + if (!vdev) { + info.devid = VFIO_PCI_DEVID_NOT_OWNED; + } else { + int id = vfio_iommufd_get_dev_id(vdev, iommufd); + + if (id > 0) + info.devid = id; + else if (id == -ENOENT) + info.devid = VFIO_PCI_DEVID_OWNED; + else + info.devid = VFIO_PCI_DEVID_NOT_OWNED; + } + /* If devid is VFIO_PCI_DEVID_NOT_OWNED, clear owned flag. */ + if (info.devid == VFIO_PCI_DEVID_NOT_OWNED) + fill->flags &= ~VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED; + } else { + struct iommu_group *iommu_group; + + iommu_group = iommu_group_get(&pdev->dev); + if (!iommu_group) + return -EPERM; /* Cannot reset non-isolated devices */ - iommu_group = iommu_group_get(&pdev->dev); - if (!iommu_group) - return -EPERM; /* Cannot reset non-isolated devices */ + info.group_id = iommu_group_id(iommu_group); + iommu_group_put(iommu_group); + } - fill->devices[fill->cur].group_id = iommu_group_id(iommu_group); - fill->devices[fill->cur].segment = pci_domain_nr(pdev->bus); - fill->devices[fill->cur].bus = pdev->bus->number; - fill->devices[fill->cur].devfn = pdev->devfn; - fill->cur++; - iommu_group_put(iommu_group); + if (copy_to_user(fill->devices, &info, sizeof(info))) + return -EFAULT; + fill->devices++; return 0; } @@ -920,24 +958,17 @@ static int vfio_pci_ioctl_get_info(struct vfio_pci_core_device *vdev, struct vfio_device_info __user *arg) { unsigned long minsz = offsetofend(struct vfio_device_info, num_irqs); - struct vfio_device_info info; + struct vfio_device_info info = {}; struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; - unsigned long capsz; int ret; - /* For backward compatibility, cannot require this */ - capsz = offsetofend(struct vfio_iommu_type1_info, cap_offset); - if (copy_from_user(&info, arg, minsz)) return -EFAULT; if (info.argsz < minsz) return -EINVAL; - if (info.argsz >= capsz) { - minsz = capsz; - info.cap_offset = 0; - } + minsz = min_t(size_t, info.argsz, sizeof(info)); info.flags = VFIO_DEVICE_FLAGS_PCI; @@ -1228,8 +1259,7 @@ static int vfio_pci_ioctl_get_pci_hot_reset_info( unsigned long minsz = offsetofend(struct vfio_pci_hot_reset_info, count); struct vfio_pci_hot_reset_info hdr; - struct vfio_pci_fill_info fill = { 0 }; - struct vfio_pci_dependent_device *devices = NULL; + struct vfio_pci_fill_info fill = {}; bool slot = false; int ret = 0; @@ -1247,78 +1277,42 @@ static int vfio_pci_ioctl_get_pci_hot_reset_info( else if (pci_probe_reset_bus(vdev->pdev->bus)) return -ENODEV; - /* How many devices are affected? */ - ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_count_devs, - &fill.max, slot); - if (ret) - return ret; + fill.devices = arg->devices; + fill.devices_end = arg->devices + + (hdr.argsz - sizeof(hdr)) / sizeof(arg->devices[0]); + fill.vdev = &vdev->vdev; - WARN_ON(!fill.max); /* Should always be at least one */ - - /* - * If there's enough space, fill it now, otherwise return -ENOSPC and - * the number of devices affected. - */ - if (hdr.argsz < sizeof(hdr) + (fill.max * sizeof(*devices))) { - ret = -ENOSPC; - hdr.count = fill.max; - goto reset_info_exit; - } - - devices = kcalloc(fill.max, sizeof(*devices), GFP_KERNEL); - if (!devices) - return -ENOMEM; - - fill.devices = devices; + if (vfio_device_cdev_opened(&vdev->vdev)) + fill.flags |= VFIO_PCI_HOT_RESET_FLAG_DEV_ID | + VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED; + mutex_lock(&vdev->vdev.dev_set->lock); ret = vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_fill_devs, &fill, slot); + mutex_unlock(&vdev->vdev.dev_set->lock); + if (ret) + return ret; - /* - * If a device was removed between counting and filling, we may come up - * short of fill.max. If a device was added, we'll have a return of - * -EAGAIN above. - */ - if (!ret) - hdr.count = fill.cur; - -reset_info_exit: + hdr.count = fill.count; + hdr.flags = fill.flags; if (copy_to_user(arg, &hdr, minsz)) - ret = -EFAULT; - - if (!ret) { - if (copy_to_user(&arg->devices, devices, - hdr.count * sizeof(*devices))) - ret = -EFAULT; - } + return -EFAULT; - kfree(devices); - return ret; + if (fill.count > fill.devices - arg->devices) + return -ENOSPC; + return 0; } -static int vfio_pci_ioctl_pci_hot_reset(struct vfio_pci_core_device *vdev, - struct vfio_pci_hot_reset __user *arg) +static int +vfio_pci_ioctl_pci_hot_reset_groups(struct vfio_pci_core_device *vdev, + int array_count, bool slot, + struct vfio_pci_hot_reset __user *arg) { - unsigned long minsz = offsetofend(struct vfio_pci_hot_reset, count); - struct vfio_pci_hot_reset hdr; int32_t *group_fds; struct file **files; struct vfio_pci_group_info info; - bool slot = false; int file_idx, count = 0, ret = 0; - if (copy_from_user(&hdr, arg, minsz)) - return -EFAULT; - - if (hdr.argsz < minsz || hdr.flags) - return -EINVAL; - - /* Can we do a slot or bus reset or neither? */ - if (!pci_probe_reset_slot(vdev->pdev->slot)) - slot = true; - else if (pci_probe_reset_bus(vdev->pdev->bus)) - return -ENODEV; - /* * We can't let userspace give us an arbitrarily large buffer to copy, * so verify how many we think there could be. Note groups can have @@ -1329,12 +1323,11 @@ static int vfio_pci_ioctl_pci_hot_reset(struct vfio_pci_core_device *vdev, if (ret) return ret; - /* Somewhere between 1 and count is OK */ - if (!hdr.count || hdr.count > count) + if (array_count > count) return -EINVAL; - group_fds = kcalloc(hdr.count, sizeof(*group_fds), GFP_KERNEL); - files = kcalloc(hdr.count, sizeof(*files), GFP_KERNEL); + group_fds = kcalloc(array_count, sizeof(*group_fds), GFP_KERNEL); + files = kcalloc(array_count, sizeof(*files), GFP_KERNEL); if (!group_fds || !files) { kfree(group_fds); kfree(files); @@ -1342,18 +1335,17 @@ static int vfio_pci_ioctl_pci_hot_reset(struct vfio_pci_core_device *vdev, } if (copy_from_user(group_fds, arg->group_fds, - hdr.count * sizeof(*group_fds))) { + array_count * sizeof(*group_fds))) { kfree(group_fds); kfree(files); return -EFAULT; } /* - * For each group_fd, get the group through the vfio external user - * interface and store the group and iommu ID. This ensures the group - * is held across the reset. + * Get the group file for each fd to ensure the group is held across + * the reset */ - for (file_idx = 0; file_idx < hdr.count; file_idx++) { + for (file_idx = 0; file_idx < array_count; file_idx++) { struct file *file = fget(group_fds[file_idx]); if (!file) { @@ -1377,10 +1369,10 @@ static int vfio_pci_ioctl_pci_hot_reset(struct vfio_pci_core_device *vdev, if (ret) goto hot_reset_release; - info.count = hdr.count; + info.count = array_count; info.files = files; - ret = vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, &info); + ret = vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, &info, NULL); hot_reset_release: for (file_idx--; file_idx >= 0; file_idx--) @@ -1390,6 +1382,36 @@ hot_reset_release: return ret; } +static int vfio_pci_ioctl_pci_hot_reset(struct vfio_pci_core_device *vdev, + struct vfio_pci_hot_reset __user *arg) +{ + unsigned long minsz = offsetofend(struct vfio_pci_hot_reset, count); + struct vfio_pci_hot_reset hdr; + bool slot = false; + + if (copy_from_user(&hdr, arg, minsz)) + return -EFAULT; + + if (hdr.argsz < minsz || hdr.flags) + return -EINVAL; + + /* zero-length array is only for cdev opened devices */ + if (!!hdr.count == vfio_device_cdev_opened(&vdev->vdev)) + return -EINVAL; + + /* Can we do a slot or bus reset or neither? */ + if (!pci_probe_reset_slot(vdev->pdev->slot)) + slot = true; + else if (pci_probe_reset_bus(vdev->pdev->bus)) + return -ENODEV; + + if (hdr.count) + return vfio_pci_ioctl_pci_hot_reset_groups(vdev, hdr.count, slot, arg); + + return vfio_pci_dev_set_hot_reset(vdev->vdev.dev_set, NULL, + vfio_iommufd_device_ictx(&vdev->vdev)); +} + static int vfio_pci_ioctl_ioeventfd(struct vfio_pci_core_device *vdev, struct vfio_device_ioeventfd __user *arg) { @@ -2355,13 +2377,16 @@ const struct pci_error_handlers vfio_pci_core_err_handlers = { }; EXPORT_SYMBOL_GPL(vfio_pci_core_err_handlers); -static bool vfio_dev_in_groups(struct vfio_pci_core_device *vdev, +static bool vfio_dev_in_groups(struct vfio_device *vdev, struct vfio_pci_group_info *groups) { unsigned int i; + if (!groups) + return false; + for (i = 0; i < groups->count; i++) - if (vfio_file_has_dev(groups->files[i], &vdev->vdev)) + if (vfio_file_has_dev(groups->files[i], vdev)) return true; return false; } @@ -2369,12 +2394,8 @@ static bool vfio_dev_in_groups(struct vfio_pci_core_device *vdev, static int vfio_pci_is_device_in_set(struct pci_dev *pdev, void *data) { struct vfio_device_set *dev_set = data; - struct vfio_device *cur; - list_for_each_entry(cur, &dev_set->device_list, dev_set_list) - if (cur->dev == &pdev->dev) - return 0; - return -EBUSY; + return vfio_find_device_in_devset(dev_set, &pdev->dev) ? 0 : -ENODEV; } /* @@ -2441,7 +2462,8 @@ unwind: * get each memory_lock. */ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, - struct vfio_pci_group_info *groups) + struct vfio_pci_group_info *groups, + struct iommufd_ctx *iommufd_ctx) { struct vfio_pci_core_device *cur_mem; struct vfio_pci_core_device *cur_vma; @@ -2471,11 +2493,38 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set, goto err_unlock; list_for_each_entry(cur_vma, &dev_set->device_list, vdev.dev_set_list) { + bool owned; + /* - * Test whether all the affected devices are contained by the - * set of groups provided by the user. + * Test whether all the affected devices can be reset by the + * user. + * + * If called from a group opened device and the user provides + * a set of groups, all the devices in the dev_set should be + * contained by the set of groups provided by the user. + * + * If called from a cdev opened device and the user provides + * a zero-length array, all the devices in the dev_set must + * be bound to the same iommufd_ctx as the input iommufd_ctx. + * If there is any device that has not been bound to any + * iommufd_ctx yet, check if its iommu_group has any device + * bound to the input iommufd_ctx. Such devices can be + * considered owned by the input iommufd_ctx as the device + * cannot be owned by another iommufd_ctx when its iommu_group + * is owned. + * + * Otherwise, reset is not allowed. */ - if (!vfio_dev_in_groups(cur_vma, groups)) { + if (iommufd_ctx) { + int devid = vfio_iommufd_get_dev_id(&cur_vma->vdev, + iommufd_ctx); + + owned = (devid > 0 || devid == -ENOENT); + } else { + owned = vfio_dev_in_groups(&cur_vma->vdev, groups); + } + + if (!owned) { ret = -EINVAL; goto err_undo; } diff --git a/drivers/vfio/platform/vfio_amba.c b/drivers/vfio/platform/vfio_amba.c index 83fe54015595..6464b3939ebc 100644 --- a/drivers/vfio/platform/vfio_amba.c +++ b/drivers/vfio/platform/vfio_amba.c @@ -119,6 +119,7 @@ static const struct vfio_device_ops vfio_amba_ops = { .bind_iommufd = vfio_iommufd_physical_bind, .unbind_iommufd = vfio_iommufd_physical_unbind, .attach_ioas = vfio_iommufd_physical_attach_ioas, + .detach_ioas = vfio_iommufd_physical_detach_ioas, }; static const struct amba_id pl330_ids[] = { diff --git a/drivers/vfio/platform/vfio_platform.c b/drivers/vfio/platform/vfio_platform.c index 22a1efca32a8..8cf22fa65baa 100644 --- a/drivers/vfio/platform/vfio_platform.c +++ b/drivers/vfio/platform/vfio_platform.c @@ -108,6 +108,7 @@ static const struct vfio_device_ops vfio_platform_ops = { .bind_iommufd = vfio_iommufd_physical_bind, .unbind_iommufd = vfio_iommufd_physical_unbind, .attach_ioas = vfio_iommufd_physical_attach_ioas, + .detach_ioas = vfio_iommufd_physical_detach_ioas, }; static struct platform_driver vfio_platform_driver = { diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h index 7b19c621e0e6..307e3f29b527 100644 --- a/drivers/vfio/vfio.h +++ b/drivers/vfio/vfio.h @@ -16,14 +16,32 @@ struct iommufd_ctx; struct iommu_group; struct vfio_container; +struct vfio_device_file { + struct vfio_device *device; + struct vfio_group *group; + + u8 access_granted; + u32 devid; /* only valid when iommufd is valid */ + spinlock_t kvm_ref_lock; /* protect kvm field */ + struct kvm *kvm; + struct iommufd_ctx *iommufd; /* protected by struct vfio_device_set::lock */ +}; + void vfio_device_put_registration(struct vfio_device *device); bool vfio_device_try_get_registration(struct vfio_device *device); -int vfio_device_open(struct vfio_device *device, struct iommufd_ctx *iommufd); -void vfio_device_close(struct vfio_device *device, - struct iommufd_ctx *iommufd); +int vfio_df_open(struct vfio_device_file *df); +void vfio_df_close(struct vfio_device_file *df); +struct vfio_device_file * +vfio_allocate_device_file(struct vfio_device *device); extern const struct file_operations vfio_device_fops; +#ifdef CONFIG_VFIO_NOIOMMU +extern bool vfio_noiommu __read_mostly; +#else +enum { vfio_noiommu = false }; +#endif + enum vfio_group_type { /* * Physical device with IOMMU backing. @@ -48,6 +66,7 @@ enum vfio_group_type { VFIO_NO_IOMMU, }; +#if IS_ENABLED(CONFIG_VFIO_GROUP) struct vfio_group { struct device dev; struct cdev cdev; @@ -74,8 +93,11 @@ struct vfio_group { struct blocking_notifier_head notifier; struct iommufd_ctx *iommufd; spinlock_t kvm_ref_lock; + unsigned int cdev_device_open_cnt; }; +int vfio_device_block_group(struct vfio_device *device); +void vfio_device_unblock_group(struct vfio_device *device); int vfio_device_set_group(struct vfio_device *device, enum vfio_group_type type); void vfio_device_remove_group(struct vfio_device *device); @@ -83,7 +105,10 @@ void vfio_device_group_register(struct vfio_device *device); void vfio_device_group_unregister(struct vfio_device *device); int vfio_device_group_use_iommu(struct vfio_device *device); void vfio_device_group_unuse_iommu(struct vfio_device *device); -void vfio_device_group_close(struct vfio_device *device); +void vfio_df_group_close(struct vfio_device_file *df); +struct vfio_group *vfio_group_from_file(struct file *file); +bool vfio_group_enforced_coherent(struct vfio_group *group); +void vfio_group_set_kvm(struct vfio_group *group, struct kvm *kvm); bool vfio_device_has_container(struct vfio_device *device); int __init vfio_group_init(void); void vfio_group_cleanup(void); @@ -93,6 +118,82 @@ static inline bool vfio_device_is_noiommu(struct vfio_device *vdev) return IS_ENABLED(CONFIG_VFIO_NOIOMMU) && vdev->group->type == VFIO_NO_IOMMU; } +#else +struct vfio_group; + +static inline int vfio_device_block_group(struct vfio_device *device) +{ + return 0; +} + +static inline void vfio_device_unblock_group(struct vfio_device *device) +{ +} + +static inline int vfio_device_set_group(struct vfio_device *device, + enum vfio_group_type type) +{ + return 0; +} + +static inline void vfio_device_remove_group(struct vfio_device *device) +{ +} + +static inline void vfio_device_group_register(struct vfio_device *device) +{ +} + +static inline void vfio_device_group_unregister(struct vfio_device *device) +{ +} + +static inline int vfio_device_group_use_iommu(struct vfio_device *device) +{ + return -EOPNOTSUPP; +} + +static inline void vfio_device_group_unuse_iommu(struct vfio_device *device) +{ +} + +static inline void vfio_df_group_close(struct vfio_device_file *df) +{ +} + +static inline struct vfio_group *vfio_group_from_file(struct file *file) +{ + return NULL; +} + +static inline bool vfio_group_enforced_coherent(struct vfio_group *group) +{ + return true; +} + +static inline void vfio_group_set_kvm(struct vfio_group *group, struct kvm *kvm) +{ +} + +static inline bool vfio_device_has_container(struct vfio_device *device) +{ + return false; +} + +static inline int __init vfio_group_init(void) +{ + return 0; +} + +static inline void vfio_group_cleanup(void) +{ +} + +static inline bool vfio_device_is_noiommu(struct vfio_device *vdev) +{ + return false; +} +#endif /* CONFIG_VFIO_GROUP */ #if IS_ENABLED(CONFIG_VFIO_CONTAINER) /** @@ -217,20 +318,109 @@ static inline void vfio_container_cleanup(void) #endif #if IS_ENABLED(CONFIG_IOMMUFD) -int vfio_iommufd_bind(struct vfio_device *device, struct iommufd_ctx *ictx); -void vfio_iommufd_unbind(struct vfio_device *device); +bool vfio_iommufd_device_has_compat_ioas(struct vfio_device *vdev, + struct iommufd_ctx *ictx); +int vfio_df_iommufd_bind(struct vfio_device_file *df); +void vfio_df_iommufd_unbind(struct vfio_device_file *df); +int vfio_iommufd_compat_attach_ioas(struct vfio_device *device, + struct iommufd_ctx *ictx); #else -static inline int vfio_iommufd_bind(struct vfio_device *device, +static inline bool +vfio_iommufd_device_has_compat_ioas(struct vfio_device *vdev, struct iommufd_ctx *ictx) { + return false; +} + +static inline int vfio_df_iommufd_bind(struct vfio_device_file *fd) +{ return -EOPNOTSUPP; } -static inline void vfio_iommufd_unbind(struct vfio_device *device) +static inline void vfio_df_iommufd_unbind(struct vfio_device_file *df) { } + +static inline int +vfio_iommufd_compat_attach_ioas(struct vfio_device *device, + struct iommufd_ctx *ictx) +{ + return -EOPNOTSUPP; +} #endif +int vfio_df_ioctl_attach_pt(struct vfio_device_file *df, + struct vfio_device_attach_iommufd_pt __user *arg); +int vfio_df_ioctl_detach_pt(struct vfio_device_file *df, + struct vfio_device_detach_iommufd_pt __user *arg); + +#if IS_ENABLED(CONFIG_VFIO_DEVICE_CDEV) +void vfio_init_device_cdev(struct vfio_device *device); + +static inline int vfio_device_add(struct vfio_device *device) +{ + /* cdev does not support noiommu device */ + if (vfio_device_is_noiommu(device)) + return device_add(&device->device); + vfio_init_device_cdev(device); + return cdev_device_add(&device->cdev, &device->device); +} + +static inline void vfio_device_del(struct vfio_device *device) +{ + if (vfio_device_is_noiommu(device)) + device_del(&device->device); + else + cdev_device_del(&device->cdev, &device->device); +} + +int vfio_device_fops_cdev_open(struct inode *inode, struct file *filep); +long vfio_df_ioctl_bind_iommufd(struct vfio_device_file *df, + struct vfio_device_bind_iommufd __user *arg); +void vfio_df_unbind_iommufd(struct vfio_device_file *df); +int vfio_cdev_init(struct class *device_class); +void vfio_cdev_cleanup(void); +#else +static inline void vfio_init_device_cdev(struct vfio_device *device) +{ +} + +static inline int vfio_device_add(struct vfio_device *device) +{ + return device_add(&device->device); +} + +static inline void vfio_device_del(struct vfio_device *device) +{ + device_del(&device->device); +} + +static inline int vfio_device_fops_cdev_open(struct inode *inode, + struct file *filep) +{ + return 0; +} + +static inline long vfio_df_ioctl_bind_iommufd(struct vfio_device_file *df, + struct vfio_device_bind_iommufd __user *arg) +{ + return -ENOTTY; +} + +static inline void vfio_df_unbind_iommufd(struct vfio_device_file *df) +{ +} + +static inline int vfio_cdev_init(struct class *device_class) +{ + return 0; +} + +static inline void vfio_cdev_cleanup(void) +{ +} +#endif /* CONFIG_VFIO_DEVICE_CDEV */ + #if IS_ENABLED(CONFIG_VFIO_VIRQFD) int __init vfio_virqfd_init(void); void vfio_virqfd_exit(void); @@ -244,18 +434,12 @@ static inline void vfio_virqfd_exit(void) } #endif -#ifdef CONFIG_VFIO_NOIOMMU -extern bool vfio_noiommu __read_mostly; -#else -enum { vfio_noiommu = false }; -#endif - #ifdef CONFIG_HAVE_KVM -void _vfio_device_get_kvm_safe(struct vfio_device *device, struct kvm *kvm); +void vfio_device_get_kvm_safe(struct vfio_device *device, struct kvm *kvm); void vfio_device_put_kvm(struct vfio_device *device); #else -static inline void _vfio_device_get_kvm_safe(struct vfio_device *device, - struct kvm *kvm) +static inline void vfio_device_get_kvm_safe(struct vfio_device *device, + struct kvm *kvm) { } diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index ebe0ad31d0b0..eacd6ec04de5 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -2732,7 +2732,7 @@ static int vfio_iommu_iova_build_caps(struct vfio_iommu *iommu, static int vfio_iommu_migration_build_caps(struct vfio_iommu *iommu, struct vfio_info_cap *caps) { - struct vfio_iommu_type1_info_cap_migration cap_mig; + struct vfio_iommu_type1_info_cap_migration cap_mig = {}; cap_mig.header.id = VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION; cap_mig.header.version = 1; @@ -2762,27 +2762,20 @@ static int vfio_iommu_dma_avail_build_caps(struct vfio_iommu *iommu, static int vfio_iommu_type1_get_info(struct vfio_iommu *iommu, unsigned long arg) { - struct vfio_iommu_type1_info info; + struct vfio_iommu_type1_info info = {}; unsigned long minsz; struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; - unsigned long capsz; int ret; minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes); - /* For backward compatibility, cannot require this */ - capsz = offsetofend(struct vfio_iommu_type1_info, cap_offset); - if (copy_from_user(&info, (void __user *)arg, minsz)) return -EFAULT; if (info.argsz < minsz) return -EINVAL; - if (info.argsz >= capsz) { - minsz = capsz; - info.cap_offset = 0; /* output, no-recopy necessary */ - } + minsz = min_t(size_t, info.argsz, sizeof(info)); mutex_lock(&iommu->lock); info.flags = VFIO_IOMMU_INFO_PGSIZES; diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index f0ca33b2e1df..cfad824d9aa2 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -141,6 +141,21 @@ unsigned int vfio_device_set_open_count(struct vfio_device_set *dev_set) } EXPORT_SYMBOL_GPL(vfio_device_set_open_count); +struct vfio_device * +vfio_find_device_in_devset(struct vfio_device_set *dev_set, + struct device *dev) +{ + struct vfio_device *cur; + + lockdep_assert_held(&dev_set->lock); + + list_for_each_entry(cur, &dev_set->device_list, dev_set_list) + if (cur->dev == dev) + return cur; + return NULL; +} +EXPORT_SYMBOL_GPL(vfio_find_device_in_devset); + /* * Device objects - create, release, get, put, search */ @@ -258,7 +273,8 @@ static int __vfio_register_dev(struct vfio_device *device, if (WARN_ON(IS_ENABLED(CONFIG_IOMMUFD) && (!device->ops->bind_iommufd || !device->ops->unbind_iommufd || - !device->ops->attach_ioas))) + !device->ops->attach_ioas || + !device->ops->detach_ioas))) return -EINVAL; /* @@ -276,7 +292,18 @@ static int __vfio_register_dev(struct vfio_device *device, if (ret) return ret; - ret = device_add(&device->device); + /* + * VFIO always sets IOMMU_CACHE because we offer no way for userspace to + * restore cache coherency. It has to be checked here because it is only + * valid for cases where we are using iommu groups. + */ + if (type == VFIO_IOMMU && !vfio_device_is_noiommu(device) && + !device_iommu_capable(device->dev, IOMMU_CAP_CACHE_COHERENCY)) { + ret = -EINVAL; + goto err_out; + } + + ret = vfio_device_add(device); if (ret) goto err_out; @@ -316,6 +343,18 @@ void vfio_unregister_group_dev(struct vfio_device *device) bool interrupted = false; long rc; + /* + * Prevent new device opened by userspace via the + * VFIO_GROUP_GET_DEVICE_FD in the group path. + */ + vfio_device_group_unregister(device); + + /* + * Balances vfio_device_add() in register path, also prevents + * new device opened by userspace in the cdev path. + */ + vfio_device_del(device); + vfio_device_put_registration(device); rc = try_wait_for_completion(&device->comp); while (rc <= 0) { @@ -339,18 +378,13 @@ void vfio_unregister_group_dev(struct vfio_device *device) } } - vfio_device_group_unregister(device); - - /* Balances device_add in register path */ - device_del(&device->device); - /* Balances vfio_device_set_group in register path */ vfio_device_remove_group(device); } EXPORT_SYMBOL_GPL(vfio_unregister_group_dev); #ifdef CONFIG_HAVE_KVM -void _vfio_device_get_kvm_safe(struct vfio_device *device, struct kvm *kvm) +void vfio_device_get_kvm_safe(struct vfio_device *device, struct kvm *kvm) { void (*pfn)(struct kvm *kvm); bool (*fn)(struct kvm *kvm); @@ -358,6 +392,9 @@ void _vfio_device_get_kvm_safe(struct vfio_device *device, struct kvm *kvm) lockdep_assert_held(&device->dev_set->lock); + if (!kvm) + return; + pfn = symbol_get(kvm_put_kvm); if (WARN_ON(!pfn)) return; @@ -404,9 +441,25 @@ static bool vfio_assert_device_open(struct vfio_device *device) return !WARN_ON_ONCE(!READ_ONCE(device->open_count)); } -static int vfio_device_first_open(struct vfio_device *device, - struct iommufd_ctx *iommufd) +struct vfio_device_file * +vfio_allocate_device_file(struct vfio_device *device) { + struct vfio_device_file *df; + + df = kzalloc(sizeof(*df), GFP_KERNEL_ACCOUNT); + if (!df) + return ERR_PTR(-ENOMEM); + + df->device = device; + spin_lock_init(&df->kvm_ref_lock); + + return df; +} + +static int vfio_df_device_first_open(struct vfio_device_file *df) +{ + struct vfio_device *device = df->device; + struct iommufd_ctx *iommufd = df->iommufd; int ret; lockdep_assert_held(&device->dev_set->lock); @@ -415,7 +468,7 @@ static int vfio_device_first_open(struct vfio_device *device, return -ENODEV; if (iommufd) - ret = vfio_iommufd_bind(device, iommufd); + ret = vfio_df_iommufd_bind(df); else ret = vfio_device_group_use_iommu(device); if (ret) @@ -430,7 +483,7 @@ static int vfio_device_first_open(struct vfio_device *device, err_unuse_iommu: if (iommufd) - vfio_iommufd_unbind(device); + vfio_df_iommufd_unbind(df); else vfio_device_group_unuse_iommu(device); err_module_put: @@ -438,29 +491,39 @@ err_module_put: return ret; } -static void vfio_device_last_close(struct vfio_device *device, - struct iommufd_ctx *iommufd) +static void vfio_df_device_last_close(struct vfio_device_file *df) { + struct vfio_device *device = df->device; + struct iommufd_ctx *iommufd = df->iommufd; + lockdep_assert_held(&device->dev_set->lock); if (device->ops->close_device) device->ops->close_device(device); if (iommufd) - vfio_iommufd_unbind(device); + vfio_df_iommufd_unbind(df); else vfio_device_group_unuse_iommu(device); module_put(device->dev->driver->owner); } -int vfio_device_open(struct vfio_device *device, struct iommufd_ctx *iommufd) +int vfio_df_open(struct vfio_device_file *df) { + struct vfio_device *device = df->device; int ret = 0; lockdep_assert_held(&device->dev_set->lock); + /* + * Only the group path allows the device to be opened multiple + * times. The device cdev path doesn't have a secure way for it. + */ + if (device->open_count != 0 && !df->group) + return -EINVAL; + device->open_count++; if (device->open_count == 1) { - ret = vfio_device_first_open(device, iommufd); + ret = vfio_df_device_first_open(df); if (ret) device->open_count--; } @@ -468,14 +531,15 @@ int vfio_device_open(struct vfio_device *device, struct iommufd_ctx *iommufd) return ret; } -void vfio_device_close(struct vfio_device *device, - struct iommufd_ctx *iommufd) +void vfio_df_close(struct vfio_device_file *df) { + struct vfio_device *device = df->device; + lockdep_assert_held(&device->dev_set->lock); vfio_assert_device_open(device); if (device->open_count == 1) - vfio_device_last_close(device, iommufd); + vfio_df_device_last_close(df); device->open_count--; } @@ -517,12 +581,18 @@ static inline void vfio_device_pm_runtime_put(struct vfio_device *device) */ static int vfio_device_fops_release(struct inode *inode, struct file *filep) { - struct vfio_device *device = filep->private_data; + struct vfio_device_file *df = filep->private_data; + struct vfio_device *device = df->device; - vfio_device_group_close(device); + if (df->group) + vfio_df_group_close(df); + else + vfio_df_unbind_iommufd(df); vfio_device_put_registration(device); + kfree(df); + return 0; } @@ -865,6 +935,53 @@ static int vfio_ioctl_device_feature_migration(struct vfio_device *device, return 0; } +void vfio_combine_iova_ranges(struct rb_root_cached *root, u32 cur_nodes, + u32 req_nodes) +{ + struct interval_tree_node *prev, *curr, *comb_start, *comb_end; + unsigned long min_gap, curr_gap; + + /* Special shortcut when a single range is required */ + if (req_nodes == 1) { + unsigned long last; + + comb_start = interval_tree_iter_first(root, 0, ULONG_MAX); + curr = comb_start; + while (curr) { + last = curr->last; + prev = curr; + curr = interval_tree_iter_next(curr, 0, ULONG_MAX); + if (prev != comb_start) + interval_tree_remove(prev, root); + } + comb_start->last = last; + return; + } + + /* Combine ranges which have the smallest gap */ + while (cur_nodes > req_nodes) { + prev = NULL; + min_gap = ULONG_MAX; + curr = interval_tree_iter_first(root, 0, ULONG_MAX); + while (curr) { + if (prev) { + curr_gap = curr->start - prev->last; + if (curr_gap < min_gap) { + min_gap = curr_gap; + comb_start = prev; + comb_end = curr; + } + } + prev = curr; + curr = interval_tree_iter_next(curr, 0, ULONG_MAX); + } + comb_start->last = comb_end->last; + interval_tree_remove(comb_end, root); + cur_nodes--; + } +} +EXPORT_SYMBOL_GPL(vfio_combine_iova_ranges); + /* Ranges should fit into a single kernel page */ #define LOG_MAX_RANGES \ (PAGE_SIZE / sizeof(struct vfio_device_feature_dma_logging_range)) @@ -1087,16 +1204,38 @@ static int vfio_ioctl_device_feature(struct vfio_device *device, static long vfio_device_fops_unl_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) { - struct vfio_device *device = filep->private_data; + struct vfio_device_file *df = filep->private_data; + struct vfio_device *device = df->device; + void __user *uptr = (void __user *)arg; int ret; + if (cmd == VFIO_DEVICE_BIND_IOMMUFD) + return vfio_df_ioctl_bind_iommufd(df, uptr); + + /* Paired with smp_store_release() following vfio_df_open() */ + if (!smp_load_acquire(&df->access_granted)) + return -EINVAL; + ret = vfio_device_pm_runtime_get(device); if (ret) return ret; + /* cdev only ioctls */ + if (IS_ENABLED(CONFIG_VFIO_DEVICE_CDEV) && !df->group) { + switch (cmd) { + case VFIO_DEVICE_ATTACH_IOMMUFD_PT: + ret = vfio_df_ioctl_attach_pt(df, uptr); + goto out; + + case VFIO_DEVICE_DETACH_IOMMUFD_PT: + ret = vfio_df_ioctl_detach_pt(df, uptr); + goto out; + } + } + switch (cmd) { case VFIO_DEVICE_FEATURE: - ret = vfio_ioctl_device_feature(device, (void __user *)arg); + ret = vfio_ioctl_device_feature(device, uptr); break; default: @@ -1106,7 +1245,7 @@ static long vfio_device_fops_unl_ioctl(struct file *filep, ret = device->ops->ioctl(device, cmd, arg); break; } - +out: vfio_device_pm_runtime_put(device); return ret; } @@ -1114,7 +1253,12 @@ static long vfio_device_fops_unl_ioctl(struct file *filep, static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf, size_t count, loff_t *ppos) { - struct vfio_device *device = filep->private_data; + struct vfio_device_file *df = filep->private_data; + struct vfio_device *device = df->device; + + /* Paired with smp_store_release() following vfio_df_open() */ + if (!smp_load_acquire(&df->access_granted)) + return -EINVAL; if (unlikely(!device->ops->read)) return -EINVAL; @@ -1126,7 +1270,12 @@ static ssize_t vfio_device_fops_write(struct file *filep, const char __user *buf, size_t count, loff_t *ppos) { - struct vfio_device *device = filep->private_data; + struct vfio_device_file *df = filep->private_data; + struct vfio_device *device = df->device; + + /* Paired with smp_store_release() following vfio_df_open() */ + if (!smp_load_acquire(&df->access_granted)) + return -EINVAL; if (unlikely(!device->ops->write)) return -EINVAL; @@ -1136,7 +1285,12 @@ static ssize_t vfio_device_fops_write(struct file *filep, static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma) { - struct vfio_device *device = filep->private_data; + struct vfio_device_file *df = filep->private_data; + struct vfio_device *device = df->device; + + /* Paired with smp_store_release() following vfio_df_open() */ + if (!smp_load_acquire(&df->access_granted)) + return -EINVAL; if (unlikely(!device->ops->mmap)) return -EINVAL; @@ -1146,6 +1300,7 @@ static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma) const struct file_operations vfio_device_fops = { .owner = THIS_MODULE, + .open = vfio_device_fops_cdev_open, .release = vfio_device_fops_release, .read = vfio_device_fops_read, .write = vfio_device_fops_write, @@ -1154,6 +1309,88 @@ const struct file_operations vfio_device_fops = { .mmap = vfio_device_fops_mmap, }; +static struct vfio_device *vfio_device_from_file(struct file *file) +{ + struct vfio_device_file *df = file->private_data; + + if (file->f_op != &vfio_device_fops) + return NULL; + return df->device; +} + +/** + * vfio_file_is_valid - True if the file is valid vfio file + * @file: VFIO group file or VFIO device file + */ +bool vfio_file_is_valid(struct file *file) +{ + return vfio_group_from_file(file) || + vfio_device_from_file(file); +} +EXPORT_SYMBOL_GPL(vfio_file_is_valid); + +/** + * vfio_file_enforced_coherent - True if the DMA associated with the VFIO file + * is always CPU cache coherent + * @file: VFIO group file or VFIO device file + * + * Enforced coherency means that the IOMMU ignores things like the PCIe no-snoop + * bit in DMA transactions. A return of false indicates that the user has + * rights to access additional instructions such as wbinvd on x86. + */ +bool vfio_file_enforced_coherent(struct file *file) +{ + struct vfio_device *device; + struct vfio_group *group; + + group = vfio_group_from_file(file); + if (group) + return vfio_group_enforced_coherent(group); + + device = vfio_device_from_file(file); + if (device) + return device_iommu_capable(device->dev, + IOMMU_CAP_ENFORCE_CACHE_COHERENCY); + + return true; +} +EXPORT_SYMBOL_GPL(vfio_file_enforced_coherent); + +static void vfio_device_file_set_kvm(struct file *file, struct kvm *kvm) +{ + struct vfio_device_file *df = file->private_data; + + /* + * The kvm is first recorded in the vfio_device_file, and will + * be propagated to vfio_device::kvm when the file is bound to + * iommufd successfully in the vfio device cdev path. + */ + spin_lock(&df->kvm_ref_lock); + df->kvm = kvm; + spin_unlock(&df->kvm_ref_lock); +} + +/** + * vfio_file_set_kvm - Link a kvm with VFIO drivers + * @file: VFIO group file or VFIO device file + * @kvm: KVM to link + * + * When a VFIO device is first opened the KVM will be available in + * device->kvm if one was associated with the file. + */ +void vfio_file_set_kvm(struct file *file, struct kvm *kvm) +{ + struct vfio_group *group; + + group = vfio_group_from_file(file); + if (group) + vfio_group_set_kvm(group, kvm); + + if (vfio_device_from_file(file)) + vfio_device_file_set_kvm(file, kvm); +} +EXPORT_SYMBOL_GPL(vfio_file_set_kvm); + /* * Sub-module support */ @@ -1172,6 +1409,9 @@ struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps, void *buf; struct vfio_info_cap_header *header, *tmp; + /* Ensure that the next capability struct will be aligned */ + size = ALIGN(size, sizeof(u64)); + buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL); if (!buf) { kfree(caps->buf); @@ -1205,6 +1445,9 @@ void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset) struct vfio_info_cap_header *tmp; void *buf = (void *)caps->buf; + /* Capability structs should start with proper alignment */ + WARN_ON(!IS_ALIGNED(offset, sizeof(u64))); + for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset) tmp->next += offset; } @@ -1415,9 +1658,16 @@ static int __init vfio_init(void) goto err_dev_class; } + ret = vfio_cdev_init(vfio.device_class); + if (ret) + goto err_alloc_dev_chrdev; + pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n"); return 0; +err_alloc_dev_chrdev: + class_destroy(vfio.device_class); + vfio.device_class = NULL; err_dev_class: vfio_virqfd_exit(); err_virqfd: @@ -1428,6 +1678,7 @@ err_virqfd: static void __exit vfio_cleanup(void) { ida_destroy(&vfio.device_ida); + vfio_cdev_cleanup(); class_destroy(vfio.device_class); vfio.device_class = NULL; vfio_virqfd_exit(); diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index 1129a36a74c4..9657c58813dc 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -16,6 +16,7 @@ struct page; struct iommufd_ctx; struct iommufd_access; struct file; +struct iommu_group; struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx, struct device *dev, u32 *id); @@ -24,6 +25,9 @@ void iommufd_device_unbind(struct iommufd_device *idev); int iommufd_device_attach(struct iommufd_device *idev, u32 *pt_id); void iommufd_device_detach(struct iommufd_device *idev); +struct iommufd_ctx *iommufd_device_to_ictx(struct iommufd_device *idev); +u32 iommufd_device_to_id(struct iommufd_device *idev); + struct iommufd_access_ops { u8 needs_pin_pages : 1; void (*unmap)(void *data, unsigned long iova, unsigned long length); @@ -44,12 +48,15 @@ iommufd_access_create(struct iommufd_ctx *ictx, const struct iommufd_access_ops *ops, void *data, u32 *id); void iommufd_access_destroy(struct iommufd_access *access); int iommufd_access_attach(struct iommufd_access *access, u32 ioas_id); +void iommufd_access_detach(struct iommufd_access *access); void iommufd_ctx_get(struct iommufd_ctx *ictx); #if IS_ENABLED(CONFIG_IOMMUFD) struct iommufd_ctx *iommufd_ctx_from_file(struct file *file); +struct iommufd_ctx *iommufd_ctx_from_fd(int fd); void iommufd_ctx_put(struct iommufd_ctx *ictx); +bool iommufd_ctx_has_group(struct iommufd_ctx *ictx, struct iommu_group *group); int iommufd_access_pin_pages(struct iommufd_access *access, unsigned long iova, unsigned long length, struct page **out_pages, diff --git a/include/linux/pds/pds_adminq.h b/include/linux/pds/pds_adminq.h index bcba7fda3cc9..4b4e9a98b37b 100644 --- a/include/linux/pds/pds_adminq.h +++ b/include/linux/pds/pds_adminq.h @@ -818,6 +818,367 @@ struct pds_vdpa_set_features_cmd { __le64 features; }; +#define PDS_LM_DEVICE_STATE_LENGTH 65536 +#define PDS_LM_CHECK_DEVICE_STATE_LENGTH(X) \ + PDS_CORE_SIZE_CHECK(union, PDS_LM_DEVICE_STATE_LENGTH, X) + +/* + * enum pds_lm_cmd_opcode - Live Migration Device commands + */ +enum pds_lm_cmd_opcode { + PDS_LM_CMD_HOST_VF_STATUS = 1, + + /* Device state commands */ + PDS_LM_CMD_STATE_SIZE = 16, + PDS_LM_CMD_SUSPEND = 18, + PDS_LM_CMD_SUSPEND_STATUS = 19, + PDS_LM_CMD_RESUME = 20, + PDS_LM_CMD_SAVE = 21, + PDS_LM_CMD_RESTORE = 22, + + /* Dirty page tracking commands */ + PDS_LM_CMD_DIRTY_STATUS = 32, + PDS_LM_CMD_DIRTY_ENABLE = 33, + PDS_LM_CMD_DIRTY_DISABLE = 34, + PDS_LM_CMD_DIRTY_READ_SEQ = 35, + PDS_LM_CMD_DIRTY_WRITE_ACK = 36, +}; + +/** + * struct pds_lm_cmd - generic command + * @opcode: Opcode + * @rsvd: Word boundary padding + * @vf_id: VF id + * @rsvd2: Structure padding to 60 Bytes + */ +struct pds_lm_cmd { + u8 opcode; + u8 rsvd; + __le16 vf_id; + u8 rsvd2[56]; +}; + +/** + * struct pds_lm_state_size_cmd - STATE_SIZE command + * @opcode: Opcode + * @rsvd: Word boundary padding + * @vf_id: VF id + */ +struct pds_lm_state_size_cmd { + u8 opcode; + u8 rsvd; + __le16 vf_id; +}; + +/** + * struct pds_lm_state_size_comp - STATE_SIZE command completion + * @status: Status of the command (enum pds_core_status_code) + * @rsvd: Word boundary padding + * @comp_index: Index in the desc ring for which this is the completion + * @size: Size of the device state + * @rsvd2: Word boundary padding + * @color: Color bit + */ +struct pds_lm_state_size_comp { + u8 status; + u8 rsvd; + __le16 comp_index; + union { + __le64 size; + u8 rsvd2[11]; + } __packed; + u8 color; +}; + +enum pds_lm_suspend_resume_type { + PDS_LM_SUSPEND_RESUME_TYPE_FULL = 0, + PDS_LM_SUSPEND_RESUME_TYPE_P2P = 1, +}; + +/** + * struct pds_lm_suspend_cmd - SUSPEND command + * @opcode: Opcode PDS_LM_CMD_SUSPEND + * @rsvd: Word boundary padding + * @vf_id: VF id + * @type: Type of suspend (enum pds_lm_suspend_resume_type) + */ +struct pds_lm_suspend_cmd { + u8 opcode; + u8 rsvd; + __le16 vf_id; + u8 type; +}; + +/** + * struct pds_lm_suspend_status_cmd - SUSPEND status command + * @opcode: Opcode PDS_AQ_CMD_LM_SUSPEND_STATUS + * @rsvd: Word boundary padding + * @vf_id: VF id + * @type: Type of suspend (enum pds_lm_suspend_resume_type) + */ +struct pds_lm_suspend_status_cmd { + u8 opcode; + u8 rsvd; + __le16 vf_id; + u8 type; +}; + +/** + * struct pds_lm_resume_cmd - RESUME command + * @opcode: Opcode PDS_LM_CMD_RESUME + * @rsvd: Word boundary padding + * @vf_id: VF id + * @type: Type of resume (enum pds_lm_suspend_resume_type) + */ +struct pds_lm_resume_cmd { + u8 opcode; + u8 rsvd; + __le16 vf_id; + u8 type; +}; + +/** + * struct pds_lm_sg_elem - Transmit scatter-gather (SG) descriptor element + * @addr: DMA address of SG element data buffer + * @len: Length of SG element data buffer, in bytes + * @rsvd: Word boundary padding + */ +struct pds_lm_sg_elem { + __le64 addr; + __le32 len; + __le16 rsvd[2]; +}; + +/** + * struct pds_lm_save_cmd - SAVE command + * @opcode: Opcode PDS_LM_CMD_SAVE + * @rsvd: Word boundary padding + * @vf_id: VF id + * @rsvd2: Word boundary padding + * @sgl_addr: IOVA address of the SGL to dma the device state + * @num_sge: Total number of SG elements + */ +struct pds_lm_save_cmd { + u8 opcode; + u8 rsvd; + __le16 vf_id; + u8 rsvd2[4]; + __le64 sgl_addr; + __le32 num_sge; +} __packed; + +/** + * struct pds_lm_restore_cmd - RESTORE command + * @opcode: Opcode PDS_LM_CMD_RESTORE + * @rsvd: Word boundary padding + * @vf_id: VF id + * @rsvd2: Word boundary padding + * @sgl_addr: IOVA address of the SGL to dma the device state + * @num_sge: Total number of SG elements + */ +struct pds_lm_restore_cmd { + u8 opcode; + u8 rsvd; + __le16 vf_id; + u8 rsvd2[4]; + __le64 sgl_addr; + __le32 num_sge; +} __packed; + +/** + * union pds_lm_dev_state - device state information + * @words: Device state words + */ +union pds_lm_dev_state { + __le32 words[PDS_LM_DEVICE_STATE_LENGTH / sizeof(__le32)]; +}; + +enum pds_lm_host_vf_status { + PDS_LM_STA_NONE = 0, + PDS_LM_STA_IN_PROGRESS, + PDS_LM_STA_MAX, +}; + +/** + * struct pds_lm_dirty_region_info - Memory region info for STATUS and ENABLE + * @dma_base: Base address of the DMA-contiguous memory region + * @page_count: Number of pages in the memory region + * @page_size_log2: Log2 page size in the memory region + * @rsvd: Word boundary padding + */ +struct pds_lm_dirty_region_info { + __le64 dma_base; + __le32 page_count; + u8 page_size_log2; + u8 rsvd[3]; +}; + +/** + * struct pds_lm_dirty_status_cmd - DIRTY_STATUS command + * @opcode: Opcode PDS_LM_CMD_DIRTY_STATUS + * @rsvd: Word boundary padding + * @vf_id: VF id + * @max_regions: Capacity of the region info buffer + * @rsvd2: Word boundary padding + * @regions_dma: DMA address of the region info buffer + * + * The minimum of max_regions (from the command) and num_regions (from the + * completion) of struct pds_lm_dirty_region_info will be written to + * regions_dma. + * + * The max_regions may be zero, in which case regions_dma is ignored. In that + * case, the completion will only report the maximum number of regions + * supported by the device, and the number of regions currently enabled. + */ +struct pds_lm_dirty_status_cmd { + u8 opcode; + u8 rsvd; + __le16 vf_id; + u8 max_regions; + u8 rsvd2[3]; + __le64 regions_dma; +} __packed; + +/** + * enum pds_lm_dirty_bmp_type - Type of dirty page bitmap + * @PDS_LM_DIRTY_BMP_TYPE_NONE: No bitmap / disabled + * @PDS_LM_DIRTY_BMP_TYPE_SEQ_ACK: Seq/Ack bitmap representation + */ +enum pds_lm_dirty_bmp_type { + PDS_LM_DIRTY_BMP_TYPE_NONE = 0, + PDS_LM_DIRTY_BMP_TYPE_SEQ_ACK = 1, +}; + +/** + * struct pds_lm_dirty_status_comp - STATUS command completion + * @status: Status of the command (enum pds_core_status_code) + * @rsvd: Word boundary padding + * @comp_index: Index in the desc ring for which this is the completion + * @max_regions: Maximum number of regions supported by the device + * @num_regions: Number of regions currently enabled + * @bmp_type: Type of dirty bitmap representation + * @rsvd2: Word boundary padding + * @bmp_type_mask: Mask of supported bitmap types, bit index per type + * @rsvd3: Word boundary padding + * @color: Color bit + * + * This completion descriptor is used for STATUS, ENABLE, and DISABLE. + */ +struct pds_lm_dirty_status_comp { + u8 status; + u8 rsvd; + __le16 comp_index; + u8 max_regions; + u8 num_regions; + u8 bmp_type; + u8 rsvd2; + __le32 bmp_type_mask; + u8 rsvd3[3]; + u8 color; +}; + +/** + * struct pds_lm_dirty_enable_cmd - DIRTY_ENABLE command + * @opcode: Opcode PDS_LM_CMD_DIRTY_ENABLE + * @rsvd: Word boundary padding + * @vf_id: VF id + * @bmp_type: Type of dirty bitmap representation + * @num_regions: Number of entries in the region info buffer + * @rsvd2: Word boundary padding + * @regions_dma: DMA address of the region info buffer + * + * The num_regions must be nonzero, and less than or equal to the maximum + * number of regions supported by the device. + * + * The memory regions should not overlap. + * + * The information should be initialized by the driver. The device may modify + * the information on successful completion, such as by size-aligning the + * number of pages in a region. + * + * The modified number of pages will be greater than or equal to the page count + * given in the enable command, and at least as coarsly aligned as the given + * value. For example, the count might be aligned to a multiple of 64, but + * if the value is already a multiple of 128 or higher, it will not change. + * If the driver requires its own minimum alignment of the number of pages, the + * driver should account for that already in the region info of this command. + * + * This command uses struct pds_lm_dirty_status_comp for its completion. + */ +struct pds_lm_dirty_enable_cmd { + u8 opcode; + u8 rsvd; + __le16 vf_id; + u8 bmp_type; + u8 num_regions; + u8 rsvd2[2]; + __le64 regions_dma; +} __packed; + +/** + * struct pds_lm_dirty_disable_cmd - DIRTY_DISABLE command + * @opcode: Opcode PDS_LM_CMD_DIRTY_DISABLE + * @rsvd: Word boundary padding + * @vf_id: VF id + * + * Dirty page tracking will be disabled. This may be called in any state, as + * long as dirty page tracking is supported by the device, to ensure that dirty + * page tracking is disabled. + * + * This command uses struct pds_lm_dirty_status_comp for its completion. On + * success, num_regions will be zero. + */ +struct pds_lm_dirty_disable_cmd { + u8 opcode; + u8 rsvd; + __le16 vf_id; +}; + +/** + * struct pds_lm_dirty_seq_ack_cmd - DIRTY_READ_SEQ or _WRITE_ACK command + * @opcode: Opcode PDS_LM_CMD_DIRTY_[READ_SEQ|WRITE_ACK] + * @rsvd: Word boundary padding + * @vf_id: VF id + * @off_bytes: Byte offset in the bitmap + * @len_bytes: Number of bytes to transfer + * @num_sge: Number of DMA scatter gather elements + * @rsvd2: Word boundary padding + * @sgl_addr: DMA address of scatter gather list + * + * Read bytes from the SEQ bitmap, or write bytes into the ACK bitmap. + * + * This command treats the entire bitmap as a byte buffer. It does not + * distinguish between guest memory regions. The driver should refer to the + * number of pages in each region, according to PDS_LM_CMD_DIRTY_STATUS, to + * determine the region boundaries in the bitmap. Each region will be + * represented by exactly the number of bits as the page count for that region, + * immediately following the last bit of the previous region. + */ +struct pds_lm_dirty_seq_ack_cmd { + u8 opcode; + u8 rsvd; + __le16 vf_id; + __le32 off_bytes; + __le32 len_bytes; + __le16 num_sge; + u8 rsvd2[2]; + __le64 sgl_addr; +} __packed; + +/** + * struct pds_lm_host_vf_status_cmd - HOST_VF_STATUS command + * @opcode: Opcode PDS_LM_CMD_HOST_VF_STATUS + * @rsvd: Word boundary padding + * @vf_id: VF id + * @status: Current LM status of host VF driver (enum pds_lm_host_status) + */ +struct pds_lm_host_vf_status_cmd { + u8 opcode; + u8 rsvd; + __le16 vf_id; + u8 status; +}; + union pds_core_adminq_cmd { u8 opcode; u8 bytes[64]; @@ -844,6 +1205,17 @@ union pds_core_adminq_cmd { struct pds_vdpa_vq_init_cmd vdpa_vq_init; struct pds_vdpa_vq_reset_cmd vdpa_vq_reset; + struct pds_lm_suspend_cmd lm_suspend; + struct pds_lm_suspend_status_cmd lm_suspend_status; + struct pds_lm_resume_cmd lm_resume; + struct pds_lm_state_size_cmd lm_state_size; + struct pds_lm_save_cmd lm_save; + struct pds_lm_restore_cmd lm_restore; + struct pds_lm_host_vf_status_cmd lm_host_vf_status; + struct pds_lm_dirty_status_cmd lm_dirty_status; + struct pds_lm_dirty_enable_cmd lm_dirty_enable; + struct pds_lm_dirty_disable_cmd lm_dirty_disable; + struct pds_lm_dirty_seq_ack_cmd lm_dirty_seq_ack; }; union pds_core_adminq_comp { @@ -868,6 +1240,9 @@ union pds_core_adminq_comp { struct pds_vdpa_vq_init_comp vdpa_vq_init; struct pds_vdpa_vq_reset_comp vdpa_vq_reset; + + struct pds_lm_state_size_comp lm_state_size; + struct pds_lm_dirty_status_comp lm_dirty_status; }; #ifndef __CHECKER__ diff --git a/include/linux/pds/pds_common.h b/include/linux/pds/pds_common.h index 435c8e8161c2..30581e2e04cc 100644 --- a/include/linux/pds/pds_common.h +++ b/include/linux/pds/pds_common.h @@ -34,16 +34,19 @@ enum pds_core_vif_types { #define PDS_DEV_TYPE_CORE_STR "Core" #define PDS_DEV_TYPE_VDPA_STR "vDPA" -#define PDS_DEV_TYPE_VFIO_STR "VFio" +#define PDS_DEV_TYPE_VFIO_STR "vfio" #define PDS_DEV_TYPE_ETH_STR "Eth" #define PDS_DEV_TYPE_RDMA_STR "RDMA" #define PDS_DEV_TYPE_LM_STR "LM" #define PDS_VDPA_DEV_NAME PDS_CORE_DRV_NAME "." PDS_DEV_TYPE_VDPA_STR +#define PDS_VFIO_LM_DEV_NAME PDS_CORE_DRV_NAME "." PDS_DEV_TYPE_LM_STR "." PDS_DEV_TYPE_VFIO_STR + +struct pdsc; int pdsc_register_notify(struct notifier_block *nb); void pdsc_unregister_notify(struct notifier_block *nb); void *pdsc_get_pf_struct(struct pci_dev *vf_pdev); -int pds_client_register(struct pci_dev *pf_pdev, char *devname); -int pds_client_unregister(struct pci_dev *pf_pdev, u16 client_id); +int pds_client_register(struct pdsc *pf, char *devname); +int pds_client_unregister(struct pdsc *pf, u16 client_id); #endif /* _PDS_COMMON_H_ */ diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 2c137ea94a3e..454e9295970c 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -13,6 +13,7 @@ #include <linux/mm.h> #include <linux/workqueue.h> #include <linux/poll.h> +#include <linux/cdev.h> #include <uapi/linux/vfio.h> #include <linux/iova_bitmap.h> @@ -42,7 +43,11 @@ struct vfio_device { */ const struct vfio_migration_ops *mig_ops; const struct vfio_log_ops *log_ops; +#if IS_ENABLED(CONFIG_VFIO_GROUP) struct vfio_group *group; + struct list_head group_next; + struct list_head iommu_entry; +#endif struct vfio_device_set *dev_set; struct list_head dev_set_list; unsigned int migration_flags; @@ -51,17 +56,19 @@ struct vfio_device { /* Members below here are private, not for driver use */ unsigned int index; struct device device; /* device.kref covers object life circle */ +#if IS_ENABLED(CONFIG_VFIO_DEVICE_CDEV) + struct cdev cdev; +#endif refcount_t refcount; /* user count on registered device*/ unsigned int open_count; struct completion comp; - struct list_head group_next; - struct list_head iommu_entry; struct iommufd_access *iommufd_access; void (*put_kvm)(struct kvm *kvm); #if IS_ENABLED(CONFIG_IOMMUFD) struct iommufd_device *iommufd_device; - bool iommufd_attached; + u8 iommufd_attached:1; #endif + u8 cdev_opened:1; }; /** @@ -73,7 +80,9 @@ struct vfio_device { * @bind_iommufd: Called when binding the device to an iommufd * @unbind_iommufd: Opposite of bind_iommufd * @attach_ioas: Called when attaching device to an IOAS/HWPT managed by the - * bound iommufd. Undo in unbind_iommufd. + * bound iommufd. Undo in unbind_iommufd if @detach_ioas is not + * called. + * @detach_ioas: Opposite of attach_ioas * @open_device: Called when the first file descriptor is opened for this device * @close_device: Opposite of open_device * @read: Perform read(2) on device file descriptor @@ -97,6 +106,7 @@ struct vfio_device_ops { struct iommufd_ctx *ictx, u32 *out_device_id); void (*unbind_iommufd)(struct vfio_device *vdev); int (*attach_ioas)(struct vfio_device *vdev, u32 *pt_id); + void (*detach_ioas)(struct vfio_device *vdev); int (*open_device)(struct vfio_device *vdev); void (*close_device)(struct vfio_device *vdev); ssize_t (*read)(struct vfio_device *vdev, char __user *buf, @@ -114,15 +124,31 @@ struct vfio_device_ops { }; #if IS_ENABLED(CONFIG_IOMMUFD) +struct iommufd_ctx *vfio_iommufd_device_ictx(struct vfio_device *vdev); +int vfio_iommufd_get_dev_id(struct vfio_device *vdev, struct iommufd_ctx *ictx); int vfio_iommufd_physical_bind(struct vfio_device *vdev, struct iommufd_ctx *ictx, u32 *out_device_id); void vfio_iommufd_physical_unbind(struct vfio_device *vdev); int vfio_iommufd_physical_attach_ioas(struct vfio_device *vdev, u32 *pt_id); +void vfio_iommufd_physical_detach_ioas(struct vfio_device *vdev); int vfio_iommufd_emulated_bind(struct vfio_device *vdev, struct iommufd_ctx *ictx, u32 *out_device_id); void vfio_iommufd_emulated_unbind(struct vfio_device *vdev); int vfio_iommufd_emulated_attach_ioas(struct vfio_device *vdev, u32 *pt_id); +void vfio_iommufd_emulated_detach_ioas(struct vfio_device *vdev); #else +static inline struct iommufd_ctx * +vfio_iommufd_device_ictx(struct vfio_device *vdev) +{ + return NULL; +} + +static inline int +vfio_iommufd_get_dev_id(struct vfio_device *vdev, struct iommufd_ctx *ictx) +{ + return VFIO_PCI_DEVID_NOT_OWNED; +} + #define vfio_iommufd_physical_bind \ ((int (*)(struct vfio_device *vdev, struct iommufd_ctx *ictx, \ u32 *out_device_id)) NULL) @@ -130,6 +156,8 @@ int vfio_iommufd_emulated_attach_ioas(struct vfio_device *vdev, u32 *pt_id); ((void (*)(struct vfio_device *vdev)) NULL) #define vfio_iommufd_physical_attach_ioas \ ((int (*)(struct vfio_device *vdev, u32 *pt_id)) NULL) +#define vfio_iommufd_physical_detach_ioas \ + ((void (*)(struct vfio_device *vdev)) NULL) #define vfio_iommufd_emulated_bind \ ((int (*)(struct vfio_device *vdev, struct iommufd_ctx *ictx, \ u32 *out_device_id)) NULL) @@ -137,8 +165,15 @@ int vfio_iommufd_emulated_attach_ioas(struct vfio_device *vdev, u32 *pt_id); ((void (*)(struct vfio_device *vdev)) NULL) #define vfio_iommufd_emulated_attach_ioas \ ((int (*)(struct vfio_device *vdev, u32 *pt_id)) NULL) +#define vfio_iommufd_emulated_detach_ioas \ + ((void (*)(struct vfio_device *vdev)) NULL) #endif +static inline bool vfio_device_cdev_opened(struct vfio_device *device) +{ + return device->cdev_opened; +} + /** * struct vfio_migration_ops - VFIO bus device driver migration callbacks * @@ -239,20 +274,44 @@ void vfio_unregister_group_dev(struct vfio_device *device); int vfio_assign_device_set(struct vfio_device *device, void *set_id); unsigned int vfio_device_set_open_count(struct vfio_device_set *dev_set); +struct vfio_device * +vfio_find_device_in_devset(struct vfio_device_set *dev_set, + struct device *dev); int vfio_mig_get_next_state(struct vfio_device *device, enum vfio_device_mig_state cur_fsm, enum vfio_device_mig_state new_fsm, enum vfio_device_mig_state *next_fsm); +void vfio_combine_iova_ranges(struct rb_root_cached *root, u32 cur_nodes, + u32 req_nodes); + /* * External user API */ +#if IS_ENABLED(CONFIG_VFIO_GROUP) struct iommu_group *vfio_file_iommu_group(struct file *file); bool vfio_file_is_group(struct file *file); +bool vfio_file_has_dev(struct file *file, struct vfio_device *device); +#else +static inline struct iommu_group *vfio_file_iommu_group(struct file *file) +{ + return NULL; +} + +static inline bool vfio_file_is_group(struct file *file) +{ + return false; +} + +static inline bool vfio_file_has_dev(struct file *file, struct vfio_device *device) +{ + return false; +} +#endif +bool vfio_file_is_valid(struct file *file); bool vfio_file_enforced_coherent(struct file *file); void vfio_file_set_kvm(struct file *file, struct kvm *kvm); -bool vfio_file_has_dev(struct file *file, struct vfio_device *device); #define VFIO_PIN_PAGES_MAX_ENTRIES (PAGE_SIZE/sizeof(unsigned long)) diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index f089ab290978..13065dd96132 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1418,9 +1418,16 @@ struct kvm_device_attr { __u64 addr; /* userspace address of attr data */ }; -#define KVM_DEV_VFIO_GROUP 1 -#define KVM_DEV_VFIO_GROUP_ADD 1 -#define KVM_DEV_VFIO_GROUP_DEL 2 +#define KVM_DEV_VFIO_FILE 1 + +#define KVM_DEV_VFIO_FILE_ADD 1 +#define KVM_DEV_VFIO_FILE_DEL 2 + +/* KVM_DEV_VFIO_GROUP aliases are for compile time uapi compatibility */ +#define KVM_DEV_VFIO_GROUP KVM_DEV_VFIO_FILE + +#define KVM_DEV_VFIO_GROUP_ADD KVM_DEV_VFIO_FILE_ADD +#define KVM_DEV_VFIO_GROUP_DEL KVM_DEV_VFIO_FILE_DEL #define KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE 3 enum kvm_device_type { diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 20c804bdc09c..f9c6f3e2cf6e 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -217,6 +217,7 @@ struct vfio_device_info { __u32 num_regions; /* Max region index + 1 */ __u32 num_irqs; /* Max IRQ index + 1 */ __u32 cap_offset; /* Offset within info struct of first cap */ + __u32 pad; }; #define VFIO_DEVICE_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 7) @@ -677,11 +678,60 @@ enum { * VFIO_DEVICE_GET_PCI_HOT_RESET_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 12, * struct vfio_pci_hot_reset_info) * + * This command is used to query the affected devices in the hot reset for + * a given device. + * + * This command always reports the segment, bus, and devfn information for + * each affected device, and selectively reports the group_id or devid per + * the way how the calling device is opened. + * + * - If the calling device is opened via the traditional group/container + * API, group_id is reported. User should check if it has owned all + * the affected devices and provides a set of group fds to prove the + * ownership in VFIO_DEVICE_PCI_HOT_RESET ioctl. + * + * - If the calling device is opened as a cdev, devid is reported. + * Flag VFIO_PCI_HOT_RESET_FLAG_DEV_ID is set to indicate this + * data type. All the affected devices should be represented in + * the dev_set, ex. bound to a vfio driver, and also be owned by + * this interface which is determined by the following conditions: + * 1) Has a valid devid within the iommufd_ctx of the calling device. + * Ownership cannot be determined across separate iommufd_ctx and + * the cdev calling conventions do not support a proof-of-ownership + * model as provided in the legacy group interface. In this case + * valid devid with value greater than zero is provided in the return + * structure. + * 2) Does not have a valid devid within the iommufd_ctx of the calling + * device, but belongs to the same IOMMU group as the calling device + * or another opened device that has a valid devid within the + * iommufd_ctx of the calling device. This provides implicit ownership + * for devices within the same DMA isolation context. In this case + * the devid value of VFIO_PCI_DEVID_OWNED is provided in the return + * structure. + * + * A devid value of VFIO_PCI_DEVID_NOT_OWNED is provided in the return + * structure for affected devices where device is NOT represented in the + * dev_set or ownership is not available. Such devices prevent the use + * of VFIO_DEVICE_PCI_HOT_RESET ioctl outside of the proof-of-ownership + * calling conventions (ie. via legacy group accessed devices). Flag + * VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED would be set when all the + * affected devices are represented in the dev_set and also owned by + * the user. This flag is available only when + * flag VFIO_PCI_HOT_RESET_FLAG_DEV_ID is set, otherwise reserved. + * When set, user could invoke VFIO_DEVICE_PCI_HOT_RESET with a zero + * length fd array on the calling device as the ownership is validated + * by iommufd_ctx. + * * Return: 0 on success, -errno on failure: * -enospc = insufficient buffer, -enodev = unsupported for device. */ struct vfio_pci_dependent_device { - __u32 group_id; + union { + __u32 group_id; + __u32 devid; +#define VFIO_PCI_DEVID_OWNED 0 +#define VFIO_PCI_DEVID_NOT_OWNED -1 + }; __u16 segment; __u8 bus; __u8 devfn; /* Use PCI_SLOT/PCI_FUNC */ @@ -690,6 +740,8 @@ struct vfio_pci_dependent_device { struct vfio_pci_hot_reset_info { __u32 argsz; __u32 flags; +#define VFIO_PCI_HOT_RESET_FLAG_DEV_ID (1 << 0) +#define VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED (1 << 1) __u32 count; struct vfio_pci_dependent_device devices[]; }; @@ -700,6 +752,24 @@ struct vfio_pci_hot_reset_info { * VFIO_DEVICE_PCI_HOT_RESET - _IOW(VFIO_TYPE, VFIO_BASE + 13, * struct vfio_pci_hot_reset) * + * A PCI hot reset results in either a bus or slot reset which may affect + * other devices sharing the bus/slot. The calling user must have + * ownership of the full set of affected devices as determined by the + * VFIO_DEVICE_GET_PCI_HOT_RESET_INFO ioctl. + * + * When called on a device file descriptor acquired through the vfio + * group interface, the user is required to provide proof of ownership + * of those affected devices via the group_fds array in struct + * vfio_pci_hot_reset. + * + * When called on a direct cdev opened vfio device, the flags field of + * struct vfio_pci_hot_reset_info reports the ownership status of the + * affected devices and this ioctl must be called with an empty group_fds + * array. See above INFO ioctl definition for ownership requirements. + * + * Mixed usage of legacy groups and cdevs across the set of affected + * devices is not supported. + * * Return: 0 on success, -errno on failure. */ struct vfio_pci_hot_reset { @@ -829,6 +899,77 @@ struct vfio_device_feature { #define VFIO_DEVICE_FEATURE _IO(VFIO_TYPE, VFIO_BASE + 17) /* + * VFIO_DEVICE_BIND_IOMMUFD - _IOR(VFIO_TYPE, VFIO_BASE + 18, + * struct vfio_device_bind_iommufd) + * @argsz: User filled size of this data. + * @flags: Must be 0. + * @iommufd: iommufd to bind. + * @out_devid: The device id generated by this bind. devid is a handle for + * this device/iommufd bond and can be used in IOMMUFD commands. + * + * Bind a vfio_device to the specified iommufd. + * + * User is restricted from accessing the device before the binding operation + * is completed. Only allowed on cdev fds. + * + * Unbind is automatically conducted when device fd is closed. + * + * Return: 0 on success, -errno on failure. + */ +struct vfio_device_bind_iommufd { + __u32 argsz; + __u32 flags; + __s32 iommufd; + __u32 out_devid; +}; + +#define VFIO_DEVICE_BIND_IOMMUFD _IO(VFIO_TYPE, VFIO_BASE + 18) + +/* + * VFIO_DEVICE_ATTACH_IOMMUFD_PT - _IOW(VFIO_TYPE, VFIO_BASE + 19, + * struct vfio_device_attach_iommufd_pt) + * @argsz: User filled size of this data. + * @flags: Must be 0. + * @pt_id: Input the target id which can represent an ioas or a hwpt + * allocated via iommufd subsystem. + * Output the input ioas id or the attached hwpt id which could + * be the specified hwpt itself or a hwpt automatically created + * for the specified ioas by kernel during the attachment. + * + * Associate the device with an address space within the bound iommufd. + * Undo by VFIO_DEVICE_DETACH_IOMMUFD_PT or device fd close. This is only + * allowed on cdev fds. + * + * Return: 0 on success, -errno on failure. + */ +struct vfio_device_attach_iommufd_pt { + __u32 argsz; + __u32 flags; + __u32 pt_id; +}; + +#define VFIO_DEVICE_ATTACH_IOMMUFD_PT _IO(VFIO_TYPE, VFIO_BASE + 19) + +/* + * VFIO_DEVICE_DETACH_IOMMUFD_PT - _IOW(VFIO_TYPE, VFIO_BASE + 20, + * struct vfio_device_detach_iommufd_pt) + * @argsz: User filled size of this data. + * @flags: Must be 0. + * + * Remove the association of the device and its current associated address + * space. After it, the device should be in a blocking DMA state. This is only + * allowed on cdev fds. + * + * Return: 0 on success, -errno on failure. + */ +struct vfio_device_detach_iommufd_pt { + __u32 argsz; + __u32 flags; +}; + +#define VFIO_DEVICE_DETACH_IOMMUFD_PT _IO(VFIO_TYPE, VFIO_BASE + 20) + +/* * Provide support for setting a PCI VF Token, which is used as a shared * secret between PF and VF drivers. This feature may only be set on a * PCI SR-IOV PF when SR-IOV is enabled on the PF and there are no existing @@ -1304,6 +1445,7 @@ struct vfio_iommu_type1_info { #define VFIO_IOMMU_INFO_CAPS (1 << 1) /* Info supports caps */ __u64 iova_pgsizes; /* Bitmap of supported page sizes */ __u32 cap_offset; /* Offset within info struct of first cap */ + __u32 pad; }; /* diff --git a/samples/vfio-mdev/mbochs.c b/samples/vfio-mdev/mbochs.c index c6c6b5d26670..3764d1911b51 100644 --- a/samples/vfio-mdev/mbochs.c +++ b/samples/vfio-mdev/mbochs.c @@ -1377,6 +1377,7 @@ static const struct vfio_device_ops mbochs_dev_ops = { .bind_iommufd = vfio_iommufd_emulated_bind, .unbind_iommufd = vfio_iommufd_emulated_unbind, .attach_ioas = vfio_iommufd_emulated_attach_ioas, + .detach_ioas = vfio_iommufd_emulated_detach_ioas, }; static struct mdev_driver mbochs_driver = { diff --git a/samples/vfio-mdev/mdpy.c b/samples/vfio-mdev/mdpy.c index a62ea11e20ec..064e1c0a7aa8 100644 --- a/samples/vfio-mdev/mdpy.c +++ b/samples/vfio-mdev/mdpy.c @@ -666,6 +666,7 @@ static const struct vfio_device_ops mdpy_dev_ops = { .bind_iommufd = vfio_iommufd_emulated_bind, .unbind_iommufd = vfio_iommufd_emulated_unbind, .attach_ioas = vfio_iommufd_emulated_attach_ioas, + .detach_ioas = vfio_iommufd_emulated_detach_ioas, }; static struct mdev_driver mdpy_driver = { diff --git a/samples/vfio-mdev/mtty.c b/samples/vfio-mdev/mtty.c index a60801fb8660..5af00387c519 100644 --- a/samples/vfio-mdev/mtty.c +++ b/samples/vfio-mdev/mtty.c @@ -1272,6 +1272,7 @@ static const struct vfio_device_ops mtty_dev_ops = { .bind_iommufd = vfio_iommufd_emulated_bind, .unbind_iommufd = vfio_iommufd_emulated_unbind, .attach_ioas = vfio_iommufd_emulated_attach_ioas, + .detach_ioas = vfio_iommufd_emulated_detach_ioas, }; static struct mdev_driver mtty_driver = { diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c index 9584eb57e0ed..ca24ce120906 100644 --- a/virt/kvm/vfio.c +++ b/virt/kvm/vfio.c @@ -21,7 +21,7 @@ #include <asm/kvm_ppc.h> #endif -struct kvm_vfio_group { +struct kvm_vfio_file { struct list_head node; struct file *file; #ifdef CONFIG_SPAPR_TCE_IOMMU @@ -30,7 +30,7 @@ struct kvm_vfio_group { }; struct kvm_vfio { - struct list_head group_list; + struct list_head file_list; struct mutex lock; bool noncoherent; }; @@ -64,18 +64,18 @@ static bool kvm_vfio_file_enforced_coherent(struct file *file) return ret; } -static bool kvm_vfio_file_is_group(struct file *file) +static bool kvm_vfio_file_is_valid(struct file *file) { bool (*fn)(struct file *file); bool ret; - fn = symbol_get(vfio_file_is_group); + fn = symbol_get(vfio_file_is_valid); if (!fn) return false; ret = fn(file); - symbol_put(vfio_file_is_group); + symbol_put(vfio_file_is_valid); return ret; } @@ -98,34 +98,33 @@ static struct iommu_group *kvm_vfio_file_iommu_group(struct file *file) } static void kvm_spapr_tce_release_vfio_group(struct kvm *kvm, - struct kvm_vfio_group *kvg) + struct kvm_vfio_file *kvf) { - if (WARN_ON_ONCE(!kvg->iommu_group)) + if (WARN_ON_ONCE(!kvf->iommu_group)) return; - kvm_spapr_tce_release_iommu_group(kvm, kvg->iommu_group); - iommu_group_put(kvg->iommu_group); - kvg->iommu_group = NULL; + kvm_spapr_tce_release_iommu_group(kvm, kvf->iommu_group); + iommu_group_put(kvf->iommu_group); + kvf->iommu_group = NULL; } #endif /* - * Groups can use the same or different IOMMU domains. If the same then - * adding a new group may change the coherency of groups we've previously - * been told about. We don't want to care about any of that so we retest - * each group and bail as soon as we find one that's noncoherent. This - * means we only ever [un]register_noncoherent_dma once for the whole device. + * Groups/devices can use the same or different IOMMU domains. If the same + * then adding a new group/device may change the coherency of groups/devices + * we've previously been told about. We don't want to care about any of + * that so we retest each group/device and bail as soon as we find one that's + * noncoherent. This means we only ever [un]register_noncoherent_dma once + * for the whole device. */ static void kvm_vfio_update_coherency(struct kvm_device *dev) { struct kvm_vfio *kv = dev->private; bool noncoherent = false; - struct kvm_vfio_group *kvg; + struct kvm_vfio_file *kvf; - mutex_lock(&kv->lock); - - list_for_each_entry(kvg, &kv->group_list, node) { - if (!kvm_vfio_file_enforced_coherent(kvg->file)) { + list_for_each_entry(kvf, &kv->file_list, node) { + if (!kvm_vfio_file_enforced_coherent(kvf->file)) { noncoherent = true; break; } @@ -139,64 +138,58 @@ static void kvm_vfio_update_coherency(struct kvm_device *dev) else kvm_arch_unregister_noncoherent_dma(dev->kvm); } - - mutex_unlock(&kv->lock); } -static int kvm_vfio_group_add(struct kvm_device *dev, unsigned int fd) +static int kvm_vfio_file_add(struct kvm_device *dev, unsigned int fd) { struct kvm_vfio *kv = dev->private; - struct kvm_vfio_group *kvg; + struct kvm_vfio_file *kvf; struct file *filp; - int ret; + int ret = 0; filp = fget(fd); if (!filp) return -EBADF; - /* Ensure the FD is a vfio group FD.*/ - if (!kvm_vfio_file_is_group(filp)) { + /* Ensure the FD is a vfio FD. */ + if (!kvm_vfio_file_is_valid(filp)) { ret = -EINVAL; - goto err_fput; + goto out_fput; } mutex_lock(&kv->lock); - list_for_each_entry(kvg, &kv->group_list, node) { - if (kvg->file == filp) { + list_for_each_entry(kvf, &kv->file_list, node) { + if (kvf->file == filp) { ret = -EEXIST; - goto err_unlock; + goto out_unlock; } } - kvg = kzalloc(sizeof(*kvg), GFP_KERNEL_ACCOUNT); - if (!kvg) { + kvf = kzalloc(sizeof(*kvf), GFP_KERNEL_ACCOUNT); + if (!kvf) { ret = -ENOMEM; - goto err_unlock; + goto out_unlock; } - kvg->file = filp; - list_add_tail(&kvg->node, &kv->group_list); + kvf->file = get_file(filp); + list_add_tail(&kvf->node, &kv->file_list); kvm_arch_start_assignment(dev->kvm); - - mutex_unlock(&kv->lock); - - kvm_vfio_file_set_kvm(kvg->file, dev->kvm); + kvm_vfio_file_set_kvm(kvf->file, dev->kvm); kvm_vfio_update_coherency(dev); - return 0; -err_unlock: +out_unlock: mutex_unlock(&kv->lock); -err_fput: +out_fput: fput(filp); return ret; } -static int kvm_vfio_group_del(struct kvm_device *dev, unsigned int fd) +static int kvm_vfio_file_del(struct kvm_device *dev, unsigned int fd) { struct kvm_vfio *kv = dev->private; - struct kvm_vfio_group *kvg; + struct kvm_vfio_file *kvf; struct fd f; int ret; @@ -208,38 +201,38 @@ static int kvm_vfio_group_del(struct kvm_device *dev, unsigned int fd) mutex_lock(&kv->lock); - list_for_each_entry(kvg, &kv->group_list, node) { - if (kvg->file != f.file) + list_for_each_entry(kvf, &kv->file_list, node) { + if (kvf->file != f.file) continue; - list_del(&kvg->node); + list_del(&kvf->node); kvm_arch_end_assignment(dev->kvm); #ifdef CONFIG_SPAPR_TCE_IOMMU - kvm_spapr_tce_release_vfio_group(dev->kvm, kvg); + kvm_spapr_tce_release_vfio_group(dev->kvm, kvf); #endif - kvm_vfio_file_set_kvm(kvg->file, NULL); - fput(kvg->file); - kfree(kvg); + kvm_vfio_file_set_kvm(kvf->file, NULL); + fput(kvf->file); + kfree(kvf); ret = 0; break; } + kvm_vfio_update_coherency(dev); + mutex_unlock(&kv->lock); fdput(f); - kvm_vfio_update_coherency(dev); - return ret; } #ifdef CONFIG_SPAPR_TCE_IOMMU -static int kvm_vfio_group_set_spapr_tce(struct kvm_device *dev, - void __user *arg) +static int kvm_vfio_file_set_spapr_tce(struct kvm_device *dev, + void __user *arg) { struct kvm_vfio_spapr_tce param; struct kvm_vfio *kv = dev->private; - struct kvm_vfio_group *kvg; + struct kvm_vfio_file *kvf; struct fd f; int ret; @@ -254,20 +247,20 @@ static int kvm_vfio_group_set_spapr_tce(struct kvm_device *dev, mutex_lock(&kv->lock); - list_for_each_entry(kvg, &kv->group_list, node) { - if (kvg->file != f.file) + list_for_each_entry(kvf, &kv->file_list, node) { + if (kvf->file != f.file) continue; - if (!kvg->iommu_group) { - kvg->iommu_group = kvm_vfio_file_iommu_group(kvg->file); - if (WARN_ON_ONCE(!kvg->iommu_group)) { + if (!kvf->iommu_group) { + kvf->iommu_group = kvm_vfio_file_iommu_group(kvf->file); + if (WARN_ON_ONCE(!kvf->iommu_group)) { ret = -EIO; goto err_fdput; } } ret = kvm_spapr_tce_attach_iommu_group(dev->kvm, param.tablefd, - kvg->iommu_group); + kvf->iommu_group); break; } @@ -278,26 +271,26 @@ err_fdput: } #endif -static int kvm_vfio_set_group(struct kvm_device *dev, long attr, - void __user *arg) +static int kvm_vfio_set_file(struct kvm_device *dev, long attr, + void __user *arg) { int32_t __user *argp = arg; int32_t fd; switch (attr) { - case KVM_DEV_VFIO_GROUP_ADD: + case KVM_DEV_VFIO_FILE_ADD: if (get_user(fd, argp)) return -EFAULT; - return kvm_vfio_group_add(dev, fd); + return kvm_vfio_file_add(dev, fd); - case KVM_DEV_VFIO_GROUP_DEL: + case KVM_DEV_VFIO_FILE_DEL: if (get_user(fd, argp)) return -EFAULT; - return kvm_vfio_group_del(dev, fd); + return kvm_vfio_file_del(dev, fd); #ifdef CONFIG_SPAPR_TCE_IOMMU case KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE: - return kvm_vfio_group_set_spapr_tce(dev, arg); + return kvm_vfio_file_set_spapr_tce(dev, arg); #endif } @@ -308,9 +301,9 @@ static int kvm_vfio_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { switch (attr->group) { - case KVM_DEV_VFIO_GROUP: - return kvm_vfio_set_group(dev, attr->attr, - u64_to_user_ptr(attr->addr)); + case KVM_DEV_VFIO_FILE: + return kvm_vfio_set_file(dev, attr->attr, + u64_to_user_ptr(attr->addr)); } return -ENXIO; @@ -320,10 +313,10 @@ static int kvm_vfio_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { switch (attr->group) { - case KVM_DEV_VFIO_GROUP: + case KVM_DEV_VFIO_FILE: switch (attr->attr) { - case KVM_DEV_VFIO_GROUP_ADD: - case KVM_DEV_VFIO_GROUP_DEL: + case KVM_DEV_VFIO_FILE_ADD: + case KVM_DEV_VFIO_FILE_DEL: #ifdef CONFIG_SPAPR_TCE_IOMMU case KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE: #endif @@ -339,16 +332,16 @@ static int kvm_vfio_has_attr(struct kvm_device *dev, static void kvm_vfio_release(struct kvm_device *dev) { struct kvm_vfio *kv = dev->private; - struct kvm_vfio_group *kvg, *tmp; + struct kvm_vfio_file *kvf, *tmp; - list_for_each_entry_safe(kvg, tmp, &kv->group_list, node) { + list_for_each_entry_safe(kvf, tmp, &kv->file_list, node) { #ifdef CONFIG_SPAPR_TCE_IOMMU - kvm_spapr_tce_release_vfio_group(dev->kvm, kvg); + kvm_spapr_tce_release_vfio_group(dev->kvm, kvf); #endif - kvm_vfio_file_set_kvm(kvg->file, NULL); - fput(kvg->file); - list_del(&kvg->node); - kfree(kvg); + kvm_vfio_file_set_kvm(kvf->file, NULL); + fput(kvf->file); + list_del(&kvf->node); + kfree(kvf); kvm_arch_end_assignment(dev->kvm); } @@ -382,7 +375,7 @@ static int kvm_vfio_create(struct kvm_device *dev, u32 type) if (!kv) return -ENOMEM; - INIT_LIST_HEAD(&kv->group_list); + INIT_LIST_HEAD(&kv->file_list); mutex_init(&kv->lock); dev->private = kv; |