Diffstat (limited to 'drivers/vfio')
30 files changed, 893 insertions, 571 deletions
diff --git a/drivers/vfio/cdx/main.c b/drivers/vfio/cdx/main.c index 67465fad5b4b..5dd5f5ad7686 100644 --- a/drivers/vfio/cdx/main.c +++ b/drivers/vfio/cdx/main.c @@ -347,4 +347,4 @@ module_driver(vfio_cdx_driver, cdx_driver_register, cdx_driver_unregister); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("VFIO for CDX devices - User Level meta-driver"); -MODULE_IMPORT_NS(CDX_BUS); +MODULE_IMPORT_NS("CDX_BUS"); diff --git a/drivers/vfio/device_cdev.c b/drivers/vfio/device_cdev.c index bb1817bd4ff3..480cac3a0c27 100644 --- a/drivers/vfio/device_cdev.c +++ b/drivers/vfio/device_cdev.c @@ -60,22 +60,50 @@ static void vfio_df_get_kvm_safe(struct vfio_device_file *df) spin_unlock(&df->kvm_ref_lock); } +static int vfio_df_check_token(struct vfio_device *device, + const struct vfio_device_bind_iommufd *bind) +{ + uuid_t uuid; + + if (!device->ops->match_token_uuid) { + if (bind->flags & VFIO_DEVICE_BIND_FLAG_TOKEN) + return -EINVAL; + return 0; + } + + if (!(bind->flags & VFIO_DEVICE_BIND_FLAG_TOKEN)) + return device->ops->match_token_uuid(device, NULL); + + if (copy_from_user(&uuid, u64_to_user_ptr(bind->token_uuid_ptr), + sizeof(uuid))) + return -EFAULT; + return device->ops->match_token_uuid(device, &uuid); +} + long vfio_df_ioctl_bind_iommufd(struct vfio_device_file *df, struct vfio_device_bind_iommufd __user *arg) { + const u32 VALID_FLAGS = VFIO_DEVICE_BIND_FLAG_TOKEN; struct vfio_device *device = df->device; struct vfio_device_bind_iommufd bind; unsigned long minsz; + u32 user_size; int ret; static_assert(__same_type(arg->out_devid, df->devid)); minsz = offsetofend(struct vfio_device_bind_iommufd, out_devid); - if (copy_from_user(&bind, arg, minsz)) - return -EFAULT; + ret = get_user(user_size, &arg->argsz); + if (ret) + return ret; + if (user_size < minsz) + return -EINVAL; + ret = copy_struct_from_user(&bind, minsz, arg, user_size); + if (ret) + return ret; - if (bind.argsz < minsz || bind.flags || bind.iommufd < 0) + if (bind.iommufd < 0 || bind.flags & ~VALID_FLAGS) return -EINVAL; /* BIND_IOMMUFD only allowed for cdev fds */ @@ -93,6 +121,10 @@ long vfio_df_ioctl_bind_iommufd(struct vfio_device_file *df, goto out_unlock; } + ret = vfio_df_check_token(device, &bind); + if (ret) + goto out_unlock; + df->iommufd = iommufd_ctx_from_fd(bind.iommufd); if (IS_ERR(df->iommufd)) { ret = PTR_ERR(df->iommufd); @@ -162,9 +194,9 @@ void vfio_df_unbind_iommufd(struct vfio_device_file *df) int vfio_df_ioctl_attach_pt(struct vfio_device_file *df, struct vfio_device_attach_iommufd_pt __user *arg) { - struct vfio_device *device = df->device; struct vfio_device_attach_iommufd_pt attach; - unsigned long minsz; + struct vfio_device *device = df->device; + unsigned long minsz, xend = 0; int ret; minsz = offsetofend(struct vfio_device_attach_iommufd_pt, pt_id); @@ -172,11 +204,34 @@ int vfio_df_ioctl_attach_pt(struct vfio_device_file *df, if (copy_from_user(&attach, arg, minsz)) return -EFAULT; - if (attach.argsz < minsz || attach.flags) + if (attach.argsz < minsz) + return -EINVAL; + + if (attach.flags & ~VFIO_DEVICE_ATTACH_PASID) return -EINVAL; + if (attach.flags & VFIO_DEVICE_ATTACH_PASID) { + if (!device->ops->pasid_attach_ioas) + return -EOPNOTSUPP; + xend = offsetofend(struct vfio_device_attach_iommufd_pt, pasid); + } + + if (xend) { + if (attach.argsz < xend) + return -EINVAL; + + if (copy_from_user((void *)&attach + minsz, + (void __user *)arg + minsz, xend - minsz)) + return -EFAULT; + } + mutex_lock(&device->dev_set->lock); - ret = device->ops->attach_ioas(device, &attach.pt_id); + if (attach.flags & 
VFIO_DEVICE_ATTACH_PASID) + ret = device->ops->pasid_attach_ioas(device, + attach.pasid, + &attach.pt_id); + else + ret = device->ops->attach_ioas(device, &attach.pt_id); if (ret) goto out_unlock; @@ -198,20 +253,41 @@ out_unlock: int vfio_df_ioctl_detach_pt(struct vfio_device_file *df, struct vfio_device_detach_iommufd_pt __user *arg) { - struct vfio_device *device = df->device; struct vfio_device_detach_iommufd_pt detach; - unsigned long minsz; + struct vfio_device *device = df->device; + unsigned long minsz, xend = 0; minsz = offsetofend(struct vfio_device_detach_iommufd_pt, flags); if (copy_from_user(&detach, arg, minsz)) return -EFAULT; - if (detach.argsz < minsz || detach.flags) + if (detach.argsz < minsz) + return -EINVAL; + + if (detach.flags & ~VFIO_DEVICE_DETACH_PASID) return -EINVAL; + if (detach.flags & VFIO_DEVICE_DETACH_PASID) { + if (!device->ops->pasid_detach_ioas) + return -EOPNOTSUPP; + xend = offsetofend(struct vfio_device_detach_iommufd_pt, pasid); + } + + if (xend) { + if (detach.argsz < xend) + return -EINVAL; + + if (copy_from_user((void *)&detach + minsz, + (void __user *)arg + minsz, xend - minsz)) + return -EFAULT; + } + mutex_lock(&device->dev_set->lock); - device->ops->detach_ioas(device); + if (detach.flags & VFIO_DEVICE_DETACH_PASID) + device->ops->pasid_detach_ioas(device, detach.pasid); + else + device->ops->detach_ioas(device); mutex_unlock(&device->dev_set->lock); return 0; diff --git a/drivers/vfio/group.c b/drivers/vfio/group.c index 49559605177e..c376a6279de0 100644 --- a/drivers/vfio/group.c +++ b/drivers/vfio/group.c @@ -192,11 +192,10 @@ static int vfio_df_group_open(struct vfio_device_file *df) * implies they expected translation to exist */ if (!capable(CAP_SYS_RAWIO) || - vfio_iommufd_device_has_compat_ioas(device, df->iommufd)) + vfio_iommufd_device_has_compat_ioas(device, df->iommufd)) { ret = -EPERM; - else - ret = 0; - goto out_put_kvm; + goto out_put_kvm; + } } ret = vfio_df_open(df); @@ -266,24 +265,12 @@ static struct file *vfio_device_open_file(struct vfio_device *device) if (ret) goto err_free; - /* - * We can't use anon_inode_getfd() because we need to modify - * the f_mode flags directly to allow more than just ioctls - */ - filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops, - df, O_RDWR); + filep = anon_inode_getfile_fmode("[vfio-device]", &vfio_device_fops, + df, O_RDWR, FMODE_PREAD | FMODE_PWRITE); if (IS_ERR(filep)) { ret = PTR_ERR(filep); goto err_close_device; } - - /* - * TODO: add an anon_inode interface to do this. - * Appears to be missing by lack of need rather than - * explicitly prevented. Now there's need. 
- */ - filep->f_mode |= (FMODE_PREAD | FMODE_PWRITE); - /* * Use the pseudo fs inode on the device to link all mmaps * to the same address space, allowing us to unmap all vmas diff --git a/drivers/vfio/iommufd.c b/drivers/vfio/iommufd.c index 82eba6966fa5..a38d262c6028 100644 --- a/drivers/vfio/iommufd.c +++ b/drivers/vfio/iommufd.c @@ -7,8 +7,8 @@ #include "vfio.h" -MODULE_IMPORT_NS(IOMMUFD); -MODULE_IMPORT_NS(IOMMUFD_VFIO); +MODULE_IMPORT_NS("IOMMUFD"); +MODULE_IMPORT_NS("IOMMUFD_VFIO"); bool vfio_iommufd_device_has_compat_ioas(struct vfio_device *vdev, struct iommufd_ctx *ictx) @@ -25,6 +25,10 @@ int vfio_df_iommufd_bind(struct vfio_device_file *df) lockdep_assert_held(&vdev->dev_set->lock); + /* Returns 0 to permit device opening under noiommu mode */ + if (vfio_device_is_noiommu(vdev)) + return 0; + return vdev->ops->bind_iommufd(vdev, ictx, &df->devid); } @@ -119,16 +123,24 @@ int vfio_iommufd_physical_bind(struct vfio_device *vdev, if (IS_ERR(idev)) return PTR_ERR(idev); vdev->iommufd_device = idev; + ida_init(&vdev->pasids); return 0; } EXPORT_SYMBOL_GPL(vfio_iommufd_physical_bind); void vfio_iommufd_physical_unbind(struct vfio_device *vdev) { + int pasid; + lockdep_assert_held(&vdev->dev_set->lock); + while ((pasid = ida_find_first(&vdev->pasids)) >= 0) { + iommufd_device_detach(vdev->iommufd_device, pasid); + ida_free(&vdev->pasids, pasid); + } + if (vdev->iommufd_attached) { - iommufd_device_detach(vdev->iommufd_device); + iommufd_device_detach(vdev->iommufd_device, IOMMU_NO_PASID); vdev->iommufd_attached = false; } iommufd_device_unbind(vdev->iommufd_device); @@ -146,9 +158,11 @@ int vfio_iommufd_physical_attach_ioas(struct vfio_device *vdev, u32 *pt_id) return -EINVAL; if (vdev->iommufd_attached) - rc = iommufd_device_replace(vdev->iommufd_device, pt_id); + rc = iommufd_device_replace(vdev->iommufd_device, + IOMMU_NO_PASID, pt_id); else - rc = iommufd_device_attach(vdev->iommufd_device, pt_id); + rc = iommufd_device_attach(vdev->iommufd_device, + IOMMU_NO_PASID, pt_id); if (rc) return rc; vdev->iommufd_attached = true; @@ -163,11 +177,53 @@ void vfio_iommufd_physical_detach_ioas(struct vfio_device *vdev) if (WARN_ON(!vdev->iommufd_device) || !vdev->iommufd_attached) return; - iommufd_device_detach(vdev->iommufd_device); + iommufd_device_detach(vdev->iommufd_device, IOMMU_NO_PASID); vdev->iommufd_attached = false; } EXPORT_SYMBOL_GPL(vfio_iommufd_physical_detach_ioas); +int vfio_iommufd_physical_pasid_attach_ioas(struct vfio_device *vdev, + u32 pasid, u32 *pt_id) +{ + int rc; + + lockdep_assert_held(&vdev->dev_set->lock); + + if (WARN_ON(!vdev->iommufd_device)) + return -EINVAL; + + if (ida_exists(&vdev->pasids, pasid)) + return iommufd_device_replace(vdev->iommufd_device, + pasid, pt_id); + + rc = ida_alloc_range(&vdev->pasids, pasid, pasid, GFP_KERNEL); + if (rc < 0) + return rc; + + rc = iommufd_device_attach(vdev->iommufd_device, pasid, pt_id); + if (rc) + ida_free(&vdev->pasids, pasid); + + return rc; +} +EXPORT_SYMBOL_GPL(vfio_iommufd_physical_pasid_attach_ioas); + +void vfio_iommufd_physical_pasid_detach_ioas(struct vfio_device *vdev, + u32 pasid) +{ + lockdep_assert_held(&vdev->dev_set->lock); + + if (WARN_ON(!vdev->iommufd_device)) + return; + + if (!ida_exists(&vdev->pasids, pasid)) + return; + + iommufd_device_detach(vdev->iommufd_device, pasid); + ida_free(&vdev->pasids, pasid); +} +EXPORT_SYMBOL_GPL(vfio_iommufd_physical_pasid_detach_ioas); + /* * The emulated standard ops mean that vfio_device is going to use the * "mdev path" and will call 
vfio_pin_pages()/vfio_dma_rw(). Drivers using this diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c index ed4737de4528..f2e686f8f1ef 100644 --- a/drivers/vfio/mdev/mdev_core.c +++ b/drivers/vfio/mdev/mdev_core.c @@ -76,7 +76,7 @@ int mdev_register_parent(struct mdev_parent *parent, struct device *dev, if (ret) return ret; - ret = class_compat_create_link(mdev_bus_compat_class, dev, NULL); + ret = class_compat_create_link(mdev_bus_compat_class, dev); if (ret) dev_warn(dev, "Failed to create compatibility class link\n"); @@ -98,7 +98,7 @@ void mdev_unregister_parent(struct mdev_parent *parent) dev_info(parent->dev, "MDEV: Unregistering\n"); down_write(&parent->unreg_sem); - class_compat_remove_link(mdev_bus_compat_class, parent->dev, NULL); + class_compat_remove_link(mdev_bus_compat_class, parent->dev); device_for_each_child(parent->dev, NULL, mdev_device_remove_cb); parent_remove_sysfs_files(parent); up_write(&parent->unreg_sem); diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig index bf50ffa10bde..2b0172f54665 100644 --- a/drivers/vfio/pci/Kconfig +++ b/drivers/vfio/pci/Kconfig @@ -1,16 +1,12 @@ # SPDX-License-Identifier: GPL-2.0-only menu "VFIO support for PCI devices" - depends on PCI && MMU + depends on PCI config VFIO_PCI_CORE tristate select VFIO_VIRQFD select IRQ_BYPASS_MANAGER -config VFIO_PCI_MMAP - def_bool y if !S390 - depends on VFIO_PCI_CORE - config VFIO_PCI_INTX def_bool y if !S390 depends on VFIO_PCI_CORE diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index 451c639299eb..397f5e445136 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -190,9 +190,10 @@ static int qm_set_regs(struct hisi_qm *qm, struct acc_vf_data *vf_data) int ret; /* Check VF state */ - if (unlikely(hisi_qm_wait_mb_ready(qm))) { + ret = hisi_qm_wait_mb_ready(qm); + if (unlikely(ret)) { dev_err(&qm->pdev->dev, "QM device is not ready to write\n"); - return -EBUSY; + return ret; } ret = qm_write_regs(qm, QM_VF_AEQ_INT_MASK, &vf_data->aeq_int_mask, 1); @@ -325,13 +326,15 @@ static void qm_dev_cmd_init(struct hisi_qm *qm) static int vf_qm_cache_wb(struct hisi_qm *qm) { unsigned int val; + int ret; writel(0x1, qm->io_base + QM_CACHE_WB_START); - if (readl_relaxed_poll_timeout(qm->io_base + QM_CACHE_WB_DONE, + ret = readl_relaxed_poll_timeout(qm->io_base + QM_CACHE_WB_DONE, val, val & BIT(0), MB_POLL_PERIOD_US, - MB_POLL_TIMEOUT_US)) { + MB_POLL_TIMEOUT_US); + if (ret) { dev_err(&qm->pdev->dev, "vf QM writeback sqc cache fail\n"); - return -EINVAL; + return ret; } return 0; @@ -350,6 +353,32 @@ static int vf_qm_func_stop(struct hisi_qm *qm) return hisi_qm_mb(qm, QM_MB_CMD_PAUSE_QM, 0, 0, 0); } +static int vf_qm_version_check(struct acc_vf_data *vf_data, struct device *dev) +{ + switch (vf_data->acc_magic) { + case ACC_DEV_MAGIC_V2: + if (vf_data->major_ver != ACC_DRV_MAJOR_VER) { + dev_info(dev, "migration driver version<%u.%u> not match!\n", + vf_data->major_ver, vf_data->minor_ver); + return -EINVAL; + } + break; + case ACC_DEV_MAGIC_V1: + /* Correct dma address */ + vf_data->eqe_dma = vf_data->qm_eqc_dw[QM_XQC_ADDR_HIGH]; + vf_data->eqe_dma <<= QM_XQC_ADDR_OFFSET; + vf_data->eqe_dma |= vf_data->qm_eqc_dw[QM_XQC_ADDR_LOW]; + vf_data->aeqe_dma = vf_data->qm_aeqc_dw[QM_XQC_ADDR_HIGH]; + vf_data->aeqe_dma <<= QM_XQC_ADDR_OFFSET; + vf_data->aeqe_dma |= vf_data->qm_aeqc_dw[QM_XQC_ADDR_LOW]; + break; + default: + return -EINVAL; + } + + return 0; +} + 
static int vf_qm_check_match(struct hisi_acc_vf_core_device *hisi_acc_vdev, struct hisi_acc_vf_migration_file *migf) { @@ -363,9 +392,10 @@ static int vf_qm_check_match(struct hisi_acc_vf_core_device *hisi_acc_vdev, if (migf->total_length < QM_MATCH_SIZE || hisi_acc_vdev->match_done) return 0; - if (vf_data->acc_magic != ACC_DEV_MAGIC) { + ret = vf_qm_version_check(vf_data, dev); + if (ret) { dev_err(dev, "failed to match ACC_DEV_MAGIC\n"); - return -EINVAL; + return ret; } if (vf_data->dev_id != hisi_acc_vdev->vf_dev->device) { @@ -377,7 +407,7 @@ static int vf_qm_check_match(struct hisi_acc_vf_core_device *hisi_acc_vdev, ret = qm_get_vft(vf_qm, &vf_qm->qp_base); if (ret <= 0) { dev_err(dev, "failed to get vft qp nums\n"); - return -EINVAL; + return ret; } if (ret != vf_data->qp_num) { @@ -399,13 +429,6 @@ static int vf_qm_check_match(struct hisi_acc_vf_core_device *hisi_acc_vdev, return -EINVAL; } - ret = qm_write_regs(vf_qm, QM_VF_STATE, &vf_data->vf_qm_state, 1); - if (ret) { - dev_err(dev, "failed to write QM_VF_STATE\n"); - return ret; - } - - hisi_acc_vdev->vf_qm_state = vf_data->vf_qm_state; hisi_acc_vdev->match_done = true; return 0; } @@ -418,7 +441,9 @@ static int vf_qm_get_match_data(struct hisi_acc_vf_core_device *hisi_acc_vdev, int vf_id = hisi_acc_vdev->vf_id; int ret; - vf_data->acc_magic = ACC_DEV_MAGIC; + vf_data->acc_magic = ACC_DEV_MAGIC_V2; + vf_data->major_ver = ACC_DRV_MAJOR_VER; + vf_data->minor_ver = ACC_DRV_MINOR_VER; /* Save device id */ vf_data->dev_id = hisi_acc_vdev->vf_dev->device; @@ -441,6 +466,19 @@ static int vf_qm_get_match_data(struct hisi_acc_vf_core_device *hisi_acc_vdev, return 0; } +static void vf_qm_xeqc_save(struct hisi_qm *qm, + struct hisi_acc_vf_migration_file *migf) +{ + struct acc_vf_data *vf_data = &migf->vf_data; + u16 eq_head, aeq_head; + + eq_head = vf_data->qm_eqc_dw[0] & 0xFFFF; + qm_db(qm, 0, QM_DOORBELL_CMD_EQ, eq_head, 0); + + aeq_head = vf_data->qm_aeqc_dw[0] & 0xFFFF; + qm_db(qm, 0, QM_DOORBELL_CMD_AEQ, aeq_head, 0); +} + static int vf_qm_load_data(struct hisi_acc_vf_core_device *hisi_acc_vdev, struct hisi_acc_vf_migration_file *migf) { @@ -456,6 +494,20 @@ static int vf_qm_load_data(struct hisi_acc_vf_core_device *hisi_acc_vdev, if (migf->total_length < sizeof(struct acc_vf_data)) return -EINVAL; + if (!vf_data->eqe_dma || !vf_data->aeqe_dma || + !vf_data->sqc_dma || !vf_data->cqc_dma) { + dev_info(dev, "resume dma addr is NULL!\n"); + hisi_acc_vdev->vf_qm_state = QM_NOT_READY; + return 0; + } + + ret = qm_write_regs(qm, QM_VF_STATE, &vf_data->vf_qm_state, 1); + if (ret) { + dev_err(dev, "failed to write QM_VF_STATE\n"); + return ret; + } + hisi_acc_vdev->vf_qm_state = vf_data->vf_qm_state; + qm->eqe_dma = vf_data->eqe_dma; qm->aeqe_dma = vf_data->aeqe_dma; qm->sqc_dma = vf_data->sqc_dma; @@ -493,27 +545,27 @@ static int vf_qm_read_data(struct hisi_qm *vf_qm, struct acc_vf_data *vf_data) ret = qm_get_regs(vf_qm, vf_data); if (ret) - return -EINVAL; + return ret; /* Every reg is 32 bit, the dma address is 64 bit. 
*/ - vf_data->eqe_dma = vf_data->qm_eqc_dw[1]; + vf_data->eqe_dma = vf_data->qm_eqc_dw[QM_XQC_ADDR_HIGH]; vf_data->eqe_dma <<= QM_XQC_ADDR_OFFSET; - vf_data->eqe_dma |= vf_data->qm_eqc_dw[0]; - vf_data->aeqe_dma = vf_data->qm_aeqc_dw[1]; + vf_data->eqe_dma |= vf_data->qm_eqc_dw[QM_XQC_ADDR_LOW]; + vf_data->aeqe_dma = vf_data->qm_aeqc_dw[QM_XQC_ADDR_HIGH]; vf_data->aeqe_dma <<= QM_XQC_ADDR_OFFSET; - vf_data->aeqe_dma |= vf_data->qm_aeqc_dw[0]; + vf_data->aeqe_dma |= vf_data->qm_aeqc_dw[QM_XQC_ADDR_LOW]; /* Through SQC_BT/CQC_BT to get sqc and cqc address */ ret = qm_get_sqc(vf_qm, &vf_data->sqc_dma); if (ret) { dev_err(dev, "failed to read SQC addr!\n"); - return -EINVAL; + return ret; } ret = qm_get_cqc(vf_qm, &vf_data->cqc_dma); if (ret) { dev_err(dev, "failed to read CQC addr!\n"); - return -EINVAL; + return ret; } return 0; @@ -524,7 +576,6 @@ static int vf_qm_state_save(struct hisi_acc_vf_core_device *hisi_acc_vdev, { struct acc_vf_data *vf_data = &migf->vf_data; struct hisi_qm *vf_qm = &hisi_acc_vdev->vf_qm; - struct device *dev = &vf_qm->pdev->dev; int ret; if (unlikely(qm_wait_dev_not_ready(vf_qm))) { @@ -538,17 +589,14 @@ static int vf_qm_state_save(struct hisi_acc_vf_core_device *hisi_acc_vdev, vf_data->vf_qm_state = QM_READY; hisi_acc_vdev->vf_qm_state = vf_data->vf_qm_state; - ret = vf_qm_cache_wb(vf_qm); - if (ret) { - dev_err(dev, "failed to writeback QM Cache!\n"); - return ret; - } - ret = vf_qm_read_data(vf_qm, vf_data); if (ret) - return -EINVAL; + return ret; migf->total_length = sizeof(struct acc_vf_data); + /* Save eqc and aeqc interrupt information */ + vf_qm_xeqc_save(vf_qm, migf); + return 0; } @@ -967,6 +1015,13 @@ static int hisi_acc_vf_stop_device(struct hisi_acc_vf_core_device *hisi_acc_vdev dev_err(dev, "failed to check QM INT state!\n"); return ret; } + + ret = vf_qm_cache_wb(vf_qm); + if (ret) { + dev_err(dev, "failed to writeback QM cache!\n"); + return ret; + } + return 0; } @@ -1327,7 +1382,7 @@ static int hisi_acc_vf_debug_check(struct seq_file *seq, struct vfio_device *vde ret = qm_wait_dev_not_ready(vf_qm); if (ret) { seq_puts(seq, "VF device not ready!\n"); - return -EBUSY; + return ret; } return 0; @@ -1463,6 +1518,7 @@ static void hisi_acc_vfio_pci_close_device(struct vfio_device *core_vdev) struct hisi_acc_vf_core_device *hisi_acc_vdev = hisi_acc_get_vf_dev(core_vdev); struct hisi_qm *vf_qm = &hisi_acc_vdev->vf_qm; + hisi_acc_vf_disable_fds(hisi_acc_vdev); mutex_lock(&hisi_acc_vdev->open_mutex); hisi_acc_vdev->dev_opened = false; iounmap(vf_qm->io_base); @@ -1485,6 +1541,7 @@ static int hisi_acc_vfio_pci_migrn_init_dev(struct vfio_device *core_vdev) hisi_acc_vdev->vf_id = pci_iov_vf_id(pdev) + 1; hisi_acc_vdev->pf_qm = pf_qm; hisi_acc_vdev->vf_dev = pdev; + hisi_acc_vdev->vf_qm_state = QM_NOT_READY; mutex_init(&hisi_acc_vdev->state_mutex); mutex_init(&hisi_acc_vdev->open_mutex); @@ -1526,6 +1583,7 @@ static const struct vfio_device_ops hisi_acc_vfio_pci_ops = { .mmap = vfio_pci_core_mmap, .request = vfio_pci_core_request, .match = vfio_pci_core_match, + .match_token_uuid = vfio_pci_core_match_token_uuid, .bind_iommufd = vfio_iommufd_physical_bind, .unbind_iommufd = vfio_iommufd_physical_unbind, .attach_ioas = vfio_iommufd_physical_attach_ioas, diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h index 245d7537b2bc..91002ceeebc1 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h @@ -39,6 +39,9 @@ #define QM_REG_ADDR_OFFSET 0x0004 
#define QM_XQC_ADDR_OFFSET 32U +#define QM_XQC_ADDR_LOW 0x1 +#define QM_XQC_ADDR_HIGH 0x2 + #define QM_VF_AEQ_INT_MASK 0x0004 #define QM_VF_EQ_INT_MASK 0x000c #define QM_IFC_INT_SOURCE_V 0x0020 @@ -50,10 +53,15 @@ #define QM_EQC_DW0 0X8000 #define QM_AEQC_DW0 0X8020 +#define ACC_DRV_MAJOR_VER 1 +#define ACC_DRV_MINOR_VER 0 + +#define ACC_DEV_MAGIC_V1 0XCDCDCDCDFEEDAACC +#define ACC_DEV_MAGIC_V2 0xAACCFEEDDECADEDE + struct acc_vf_data { #define QM_MATCH_SIZE offsetofend(struct acc_vf_data, qm_rsv_state) /* QM match information */ -#define ACC_DEV_MAGIC 0XCDCDCDCDFEEDAACC u64 acc_magic; u32 qp_num; u32 dev_id; @@ -61,7 +69,9 @@ struct acc_vf_data { u32 qp_base; u32 vf_qm_state; /* QM reserved match information */ - u32 qm_rsv_state[3]; + u16 major_ver; + u16 minor_ver; + u32 qm_rsv_state[2]; /* QM RW regs */ u32 aeq_int_mask; diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index 7527e277c898..a92b095b90f6 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -313,40 +313,21 @@ err_exec: return ret; } -static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn, - struct mlx5_vhca_data_buffer *buf, - struct mlx5_vhca_recv_buf *recv_buf, - u32 *mkey) +static u32 *alloc_mkey_in(u32 npages, u32 pdn) { - size_t npages = buf ? DIV_ROUND_UP(buf->allocated_length, PAGE_SIZE) : - recv_buf->npages; - int err = 0, inlen; - __be64 *mtt; + int inlen; void *mkc; u32 *in; inlen = MLX5_ST_SZ_BYTES(create_mkey_in) + - sizeof(*mtt) * round_up(npages, 2); + sizeof(__be64) * round_up(npages, 2); - in = kvzalloc(inlen, GFP_KERNEL); + in = kvzalloc(inlen, GFP_KERNEL_ACCOUNT); if (!in) - return -ENOMEM; + return NULL; MLX5_SET(create_mkey_in, in, translations_octword_actual_size, DIV_ROUND_UP(npages, 2)); - mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt); - - if (buf) { - struct sg_dma_page_iter dma_iter; - - for_each_sgtable_dma_page(&buf->table.sgt, &dma_iter, 0) - *mtt++ = cpu_to_be64(sg_page_iter_dma_address(&dma_iter)); - } else { - int i; - - for (i = 0; i < npages; i++) - *mtt++ = cpu_to_be64(recv_buf->dma_addrs[i]); - } mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT); @@ -360,8 +341,81 @@ static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn, MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT); MLX5_SET(mkc, mkc, translations_octword_size, DIV_ROUND_UP(npages, 2)); MLX5_SET64(mkc, mkc, len, npages * PAGE_SIZE); - err = mlx5_core_create_mkey(mdev, mkey, in, inlen); - kvfree(in); + + return in; +} + +static int create_mkey(struct mlx5_core_dev *mdev, u32 npages, u32 *mkey_in, + u32 *mkey) +{ + int inlen = MLX5_ST_SZ_BYTES(create_mkey_in) + + sizeof(__be64) * round_up(npages, 2); + + return mlx5_core_create_mkey(mdev, mkey, mkey_in, inlen); +} + +static void unregister_dma_pages(struct mlx5_core_dev *mdev, u32 npages, + u32 *mkey_in, struct dma_iova_state *state, + enum dma_data_direction dir) +{ + dma_addr_t addr; + __be64 *mtt; + int i; + + if (dma_use_iova(state)) { + dma_iova_destroy(mdev->device, state, npages * PAGE_SIZE, dir, + 0); + } else { + mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, mkey_in, + klm_pas_mtt); + for (i = npages - 1; i >= 0; i--) { + addr = be64_to_cpu(mtt[i]); + dma_unmap_page(mdev->device, addr, PAGE_SIZE, dir); + } + } +} + +static int register_dma_pages(struct mlx5_core_dev *mdev, u32 npages, + struct page **page_list, u32 *mkey_in, + struct dma_iova_state *state, + enum dma_data_direction dir) +{ + dma_addr_t addr; + size_t mapped = 0; + 
__be64 *mtt; + int i, err; + + mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, mkey_in, klm_pas_mtt); + + if (dma_iova_try_alloc(mdev->device, state, 0, npages * PAGE_SIZE)) { + addr = state->addr; + for (i = 0; i < npages; i++) { + err = dma_iova_link(mdev->device, state, + page_to_phys(page_list[i]), mapped, + PAGE_SIZE, dir, 0); + if (err) + goto error; + *mtt++ = cpu_to_be64(addr); + addr += PAGE_SIZE; + mapped += PAGE_SIZE; + } + err = dma_iova_sync(mdev->device, state, 0, mapped); + if (err) + goto error; + } else { + for (i = 0; i < npages; i++) { + addr = dma_map_page(mdev->device, page_list[i], 0, + PAGE_SIZE, dir); + err = dma_mapping_error(mdev->device, addr); + if (err) + goto error; + *mtt++ = cpu_to_be64(addr); + } + } + return 0; + +error: + unregister_dma_pages(mdev, i, mkey_in, state, dir); return err; } @@ -375,97 +429,97 @@ static int mlx5vf_dma_data_buffer(struct mlx5_vhca_data_buffer *buf) if (mvdev->mdev_detach) return -ENOTCONN; - if (buf->dmaed || !buf->allocated_length) + if (buf->mkey_in || !buf->npages) return -EINVAL; - ret = dma_map_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0); - if (ret) - return ret; + buf->mkey_in = alloc_mkey_in(buf->npages, buf->migf->pdn); + if (!buf->mkey_in) + return -ENOMEM; - ret = _create_mkey(mdev, buf->migf->pdn, buf, NULL, &buf->mkey); + ret = register_dma_pages(mdev, buf->npages, buf->page_list, + buf->mkey_in, &buf->state, buf->dma_dir); if (ret) - goto err; + goto err_register_dma; - buf->dmaed = true; + ret = create_mkey(mdev, buf->npages, buf->mkey_in, &buf->mkey); + if (ret) + goto err_create_mkey; return 0; -err: - dma_unmap_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0); + +err_create_mkey: + unregister_dma_pages(mdev, buf->npages, buf->mkey_in, &buf->state, + buf->dma_dir); +err_register_dma: + kvfree(buf->mkey_in); + buf->mkey_in = NULL; return ret; } +static void free_page_list(u32 npages, struct page **page_list) +{ + int i; + + /* Undo alloc_pages_bulk() */ + for (i = npages - 1; i >= 0; i--) + __free_page(page_list[i]); + + kvfree(page_list); +} + void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf) { - struct mlx5_vf_migration_file *migf = buf->migf; - struct sg_page_iter sg_iter; + struct mlx5vf_pci_core_device *mvdev = buf->migf->mvdev; + struct mlx5_core_dev *mdev = mvdev->mdev; - lockdep_assert_held(&migf->mvdev->state_mutex); - WARN_ON(migf->mvdev->mdev_detach); + lockdep_assert_held(&mvdev->state_mutex); + WARN_ON(mvdev->mdev_detach); - if (buf->dmaed) { - mlx5_core_destroy_mkey(migf->mvdev->mdev, buf->mkey); - dma_unmap_sgtable(migf->mvdev->mdev->device, &buf->table.sgt, - buf->dma_dir, 0); + if (buf->mkey_in) { + mlx5_core_destroy_mkey(mdev, buf->mkey); + unregister_dma_pages(mdev, buf->npages, buf->mkey_in, + &buf->state, buf->dma_dir); + kvfree(buf->mkey_in); } - /* Undo alloc_pages_bulk_array() */ - for_each_sgtable_page(&buf->table.sgt, &sg_iter, 0) - __free_page(sg_page_iter_page(&sg_iter)); - sg_free_append_table(&buf->table); + free_page_list(buf->npages, buf->page_list); kfree(buf); } -static int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf, - unsigned int npages) +static int mlx5vf_add_pages(struct page ***page_list, unsigned int npages) { - unsigned int to_alloc = npages; - struct page **page_list; - unsigned long filled; - unsigned int to_fill; - int ret; + unsigned int filled, done = 0; int i; - to_fill = min_t(unsigned int, npages, PAGE_SIZE / sizeof(*page_list)); - page_list = kvzalloc(to_fill * sizeof(*page_list), GFP_KERNEL_ACCOUNT); - if 
(!page_list) + *page_list = + kvcalloc(npages, sizeof(struct page *), GFP_KERNEL_ACCOUNT); + if (!*page_list) return -ENOMEM; - do { - filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT, to_fill, - page_list); - if (!filled) { - ret = -ENOMEM; + for (;;) { + filled = alloc_pages_bulk(GFP_KERNEL_ACCOUNT, npages - done, + *page_list + done); + if (!filled) goto err; - } - to_alloc -= filled; - ret = sg_alloc_append_table_from_pages( - &buf->table, page_list, filled, 0, - filled << PAGE_SHIFT, UINT_MAX, SG_MAX_SINGLE_ALLOC, - GFP_KERNEL_ACCOUNT); - if (ret) - goto err_append; - buf->allocated_length += filled * PAGE_SIZE; - /* clean input for another bulk allocation */ - memset(page_list, 0, filled * sizeof(*page_list)); - to_fill = min_t(unsigned int, to_alloc, - PAGE_SIZE / sizeof(*page_list)); - } while (to_alloc > 0); + done += filled; + if (done == npages) + break; + } - kvfree(page_list); return 0; -err_append: - for (i = filled - 1; i >= 0; i--) - __free_page(page_list[i]); err: - kvfree(page_list); - return ret; + for (i = 0; i < done; i++) + __free_page(*page_list[i]); + + kvfree(*page_list); + *page_list = NULL; + return -ENOMEM; } struct mlx5_vhca_data_buffer * -mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf, - size_t length, +mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf, u32 npages, enum dma_data_direction dma_dir) { struct mlx5_vhca_data_buffer *buf; @@ -477,12 +531,13 @@ mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf, buf->dma_dir = dma_dir; buf->migf = migf; - if (length) { - ret = mlx5vf_add_migration_pages(buf, - DIV_ROUND_UP_ULL(length, PAGE_SIZE)); + if (npages) { + ret = mlx5vf_add_pages(&buf->page_list, npages); if (ret) goto end; + buf->npages = npages; + if (dma_dir != DMA_NONE) { ret = mlx5vf_dma_data_buffer(buf); if (ret) @@ -505,8 +560,8 @@ void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf) } struct mlx5_vhca_data_buffer * -mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf, - size_t length, enum dma_data_direction dma_dir) +mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf, u32 npages, + enum dma_data_direction dma_dir) { struct mlx5_vhca_data_buffer *buf, *temp_buf; struct list_head free_list; @@ -521,7 +576,7 @@ mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf, list_for_each_entry_safe(buf, temp_buf, &migf->avail_list, buf_elm) { if (buf->dma_dir == dma_dir) { list_del_init(&buf->buf_elm); - if (buf->allocated_length >= length) { + if (buf->npages >= npages) { spin_unlock_irq(&migf->list_lock); goto found; } @@ -535,7 +590,7 @@ mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf, } } spin_unlock_irq(&migf->list_lock); - buf = mlx5vf_alloc_data_buffer(migf, length, dma_dir); + buf = mlx5vf_alloc_data_buffer(migf, npages, dma_dir); found: while ((temp_buf = list_first_entry_or_null(&free_list, @@ -716,7 +771,7 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, MLX5_SET(save_vhca_state_in, in, op_mod, 0); MLX5_SET(save_vhca_state_in, in, vhca_id, mvdev->vhca_id); MLX5_SET(save_vhca_state_in, in, mkey, buf->mkey); - MLX5_SET(save_vhca_state_in, in, size, buf->allocated_length); + MLX5_SET(save_vhca_state_in, in, size, buf->npages * PAGE_SIZE); MLX5_SET(save_vhca_state_in, in, incremental, inc); MLX5_SET(save_vhca_state_in, in, set_track, track); @@ -738,8 +793,11 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, } if (!header_buf) { - header_buf = mlx5vf_get_data_buffer(migf, - sizeof(struct mlx5_vf_migration_header), DMA_NONE); + 
header_buf = mlx5vf_get_data_buffer( + migf, + DIV_ROUND_UP(sizeof(struct mlx5_vf_migration_header), + PAGE_SIZE), + DMA_NONE); if (IS_ERR(header_buf)) { err = PTR_ERR(header_buf); goto err_free; @@ -783,7 +841,7 @@ int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev, if (mvdev->mdev_detach) return -ENOTCONN; - if (!buf->dmaed) { + if (!buf->mkey_in) { err = mlx5vf_dma_data_buffer(buf); if (err) return err; @@ -1338,103 +1396,16 @@ static void mlx5vf_destroy_qp(struct mlx5_core_dev *mdev, kfree(qp); } -static void free_recv_pages(struct mlx5_vhca_recv_buf *recv_buf) -{ - int i; - - /* Undo alloc_pages_bulk_array() */ - for (i = 0; i < recv_buf->npages; i++) - __free_page(recv_buf->page_list[i]); - - kvfree(recv_buf->page_list); -} - -static int alloc_recv_pages(struct mlx5_vhca_recv_buf *recv_buf, - unsigned int npages) -{ - unsigned int filled = 0, done = 0; - int i; - - recv_buf->page_list = kvcalloc(npages, sizeof(*recv_buf->page_list), - GFP_KERNEL_ACCOUNT); - if (!recv_buf->page_list) - return -ENOMEM; - - for (;;) { - filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT, - npages - done, - recv_buf->page_list + done); - if (!filled) - goto err; - - done += filled; - if (done == npages) - break; - } - - recv_buf->npages = npages; - return 0; - -err: - for (i = 0; i < npages; i++) { - if (recv_buf->page_list[i]) - __free_page(recv_buf->page_list[i]); - } - - kvfree(recv_buf->page_list); - return -ENOMEM; -} - -static int register_dma_recv_pages(struct mlx5_core_dev *mdev, - struct mlx5_vhca_recv_buf *recv_buf) -{ - int i, j; - - recv_buf->dma_addrs = kvcalloc(recv_buf->npages, - sizeof(*recv_buf->dma_addrs), - GFP_KERNEL_ACCOUNT); - if (!recv_buf->dma_addrs) - return -ENOMEM; - - for (i = 0; i < recv_buf->npages; i++) { - recv_buf->dma_addrs[i] = dma_map_page(mdev->device, - recv_buf->page_list[i], - 0, PAGE_SIZE, - DMA_FROM_DEVICE); - if (dma_mapping_error(mdev->device, recv_buf->dma_addrs[i])) - goto error; - } - return 0; - -error: - for (j = 0; j < i; j++) - dma_unmap_single(mdev->device, recv_buf->dma_addrs[j], - PAGE_SIZE, DMA_FROM_DEVICE); - - kvfree(recv_buf->dma_addrs); - return -ENOMEM; -} - -static void unregister_dma_recv_pages(struct mlx5_core_dev *mdev, - struct mlx5_vhca_recv_buf *recv_buf) -{ - int i; - - for (i = 0; i < recv_buf->npages; i++) - dma_unmap_single(mdev->device, recv_buf->dma_addrs[i], - PAGE_SIZE, DMA_FROM_DEVICE); - - kvfree(recv_buf->dma_addrs); -} - static void mlx5vf_free_qp_recv_resources(struct mlx5_core_dev *mdev, struct mlx5_vhca_qp *qp) { struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf; mlx5_core_destroy_mkey(mdev, recv_buf->mkey); - unregister_dma_recv_pages(mdev, recv_buf); - free_recv_pages(&qp->recv_buf); + unregister_dma_pages(mdev, recv_buf->npages, recv_buf->mkey_in, + &recv_buf->state, DMA_FROM_DEVICE); + kvfree(recv_buf->mkey_in); + free_page_list(recv_buf->npages, recv_buf->page_list); } static int mlx5vf_alloc_qp_recv_resources(struct mlx5_core_dev *mdev, @@ -1445,24 +1416,38 @@ static int mlx5vf_alloc_qp_recv_resources(struct mlx5_core_dev *mdev, struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf; int err; - err = alloc_recv_pages(recv_buf, npages); - if (err < 0) + err = mlx5vf_add_pages(&recv_buf->page_list, npages); + if (err) return err; - err = register_dma_recv_pages(mdev, recv_buf); - if (err) + recv_buf->npages = npages; + + recv_buf->mkey_in = alloc_mkey_in(npages, pdn); + if (!recv_buf->mkey_in) { + err = -ENOMEM; goto end; + } + + err = register_dma_pages(mdev, npages, recv_buf->page_list, + 
recv_buf->mkey_in, &recv_buf->state, + DMA_FROM_DEVICE); + if (err) + goto err_register_dma; - err = _create_mkey(mdev, pdn, NULL, recv_buf, &recv_buf->mkey); + err = create_mkey(mdev, npages, recv_buf->mkey_in, &recv_buf->mkey); if (err) goto err_create_mkey; return 0; err_create_mkey: - unregister_dma_recv_pages(mdev, recv_buf); + unregister_dma_pages(mdev, npages, recv_buf->mkey_in, &recv_buf->state, + DMA_FROM_DEVICE); +err_register_dma: + kvfree(recv_buf->mkey_in); + recv_buf->mkey_in = NULL; end: - free_recv_pages(recv_buf); + free_page_list(npages, recv_buf->page_list); return err; } @@ -1517,7 +1502,8 @@ int mlx5vf_start_page_tracker(struct vfio_device *vdev, struct mlx5_vhca_qp *host_qp; struct mlx5_vhca_qp *fw_qp; struct mlx5_core_dev *mdev; - u32 max_msg_size = PAGE_SIZE; + u32 log_max_msg_size; + u32 max_msg_size; u64 rq_size = SZ_2M; u32 max_recv_wr; int err; @@ -1534,6 +1520,12 @@ int mlx5vf_start_page_tracker(struct vfio_device *vdev, } mdev = mvdev->mdev; + log_max_msg_size = MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_max_msg_size); + max_msg_size = (1ULL << log_max_msg_size); + /* The RQ must hold at least 4 WQEs/messages for successful QP creation */ + if (rq_size < 4ULL * max_msg_size) + rq_size = 4ULL * max_msg_size; + memset(tracker, 0, sizeof(*tracker)); tracker->uar = mlx5_get_uars_page(mdev); if (IS_ERR(tracker->uar)) { @@ -1623,25 +1615,41 @@ set_report_output(u32 size, int index, struct mlx5_vhca_qp *qp, { u32 entry_size = MLX5_ST_SZ_BYTES(page_track_report_entry); u32 nent = size / entry_size; + u32 nent_in_page; + u32 nent_to_set; struct page *page; + u32 page_offset; + u32 page_index; + u32 buf_offset; + void *kaddr; u64 addr; u64 *buf; int i; - if (WARN_ON(index >= qp->recv_buf.npages || + buf_offset = index * qp->max_msg_size; + if (WARN_ON(buf_offset + size >= qp->recv_buf.npages * PAGE_SIZE || (nent > qp->max_msg_size / entry_size))) return; - page = qp->recv_buf.page_list[index]; - buf = kmap_local_page(page); - for (i = 0; i < nent; i++) { - addr = MLX5_GET(page_track_report_entry, buf + i, - dirty_address_low); - addr |= (u64)MLX5_GET(page_track_report_entry, buf + i, - dirty_address_high) << 32; - iova_bitmap_set(dirty, addr, qp->tracked_page_size); - } - kunmap_local(buf); + do { + page_index = buf_offset / PAGE_SIZE; + page_offset = buf_offset % PAGE_SIZE; + nent_in_page = (PAGE_SIZE - page_offset) / entry_size; + page = qp->recv_buf.page_list[page_index]; + kaddr = kmap_local_page(page); + buf = kaddr + page_offset; + nent_to_set = min(nent, nent_in_page); + for (i = 0; i < nent_to_set; i++) { + addr = MLX5_GET(page_track_report_entry, buf + i, + dirty_address_low); + addr |= (u64)MLX5_GET(page_track_report_entry, buf + i, + dirty_address_high) << 32; + iova_bitmap_set(dirty, addr, qp->tracked_page_size); + } + kunmap_local(kaddr); + buf_offset += (nent_to_set * entry_size); + nent -= nent_to_set; + } while (nent); } static void diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index df421dc6de04..d7821b5ca772 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -53,20 +53,17 @@ struct mlx5_vf_migration_header { }; struct mlx5_vhca_data_buffer { - struct sg_append_table table; + struct page **page_list; + struct dma_iova_state state; loff_t start_pos; u64 length; - u64 allocated_length; + u32 npages; u32 mkey; + u32 *mkey_in; enum dma_data_direction dma_dir; - u8 dmaed:1; u8 stop_copy_chunk_num; struct list_head buf_elm; struct mlx5_vf_migration_file *migf; - /* Optimize mlx5vf_get_migration_page() 
for sequential access */ - struct scatterlist *last_offset_sg; - unsigned int sg_last_entry; - unsigned long last_offset; }; struct mlx5vf_async_data { @@ -133,8 +130,9 @@ struct mlx5_vhca_cq { struct mlx5_vhca_recv_buf { u32 npages; struct page **page_list; - dma_addr_t *dma_addrs; + struct dma_iova_state state; u32 next_rq_offset; + u32 *mkey_in; u32 mkey; }; @@ -217,15 +215,24 @@ int mlx5vf_cmd_alloc_pd(struct mlx5_vf_migration_file *migf); void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf); void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf); struct mlx5_vhca_data_buffer * -mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf, - size_t length, enum dma_data_direction dma_dir); +mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf, u32 npages, + enum dma_data_direction dma_dir); void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf); struct mlx5_vhca_data_buffer * -mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf, - size_t length, enum dma_data_direction dma_dir); +mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf, u32 npages, + enum dma_data_direction dma_dir); void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf); -struct page *mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf, - unsigned long offset); +static inline struct page * +mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf, + unsigned long offset) +{ + int page_entry = offset / PAGE_SIZE; + + if (page_entry >= buf->npages) + return NULL; + + return buf->page_list[page_entry]; +} void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev); void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev, enum mlx5_vf_migf_state *last_save_state); diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 8833e60d42f5..7ec47e736a8e 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -34,37 +34,6 @@ static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev) core_device); } -struct page * -mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf, - unsigned long offset) -{ - unsigned long cur_offset = 0; - struct scatterlist *sg; - unsigned int i; - - /* All accesses are sequential */ - if (offset < buf->last_offset || !buf->last_offset_sg) { - buf->last_offset = 0; - buf->last_offset_sg = buf->table.sgt.sgl; - buf->sg_last_entry = 0; - } - - cur_offset = buf->last_offset; - - for_each_sg(buf->last_offset_sg, sg, - buf->table.sgt.orig_nents - buf->sg_last_entry, i) { - if (offset < sg->length + cur_offset) { - buf->last_offset_sg = sg; - buf->sg_last_entry += i; - buf->last_offset = cur_offset; - return nth_page(sg_page(sg), - (offset - cur_offset) / PAGE_SIZE); - } - cur_offset += sg->length; - } - return NULL; -} - static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf) { mutex_lock(&migf->lock); @@ -308,6 +277,7 @@ static struct mlx5_vhca_data_buffer * mlx5vf_mig_file_get_stop_copy_buf(struct mlx5_vf_migration_file *migf, u8 index, size_t required_length) { + u32 npages = DIV_ROUND_UP(required_length, PAGE_SIZE); struct mlx5_vhca_data_buffer *buf = migf->buf[index]; u8 chunk_num; @@ -315,12 +285,11 @@ mlx5vf_mig_file_get_stop_copy_buf(struct mlx5_vf_migration_file *migf, chunk_num = buf->stop_copy_chunk_num; buf->migf->buf[index] = NULL; /* Checking whether the pre-allocated buffer can fit */ - if (buf->allocated_length >= required_length) + if (buf->npages >= npages) return buf; mlx5vf_put_data_buffer(buf); - buf = 
mlx5vf_get_data_buffer(buf->migf, required_length, - DMA_FROM_DEVICE); + buf = mlx5vf_get_data_buffer(buf->migf, npages, DMA_FROM_DEVICE); if (IS_ERR(buf)) return buf; @@ -373,7 +342,8 @@ static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf, u8 *to_buff; int ret; - header_buf = mlx5vf_get_data_buffer(migf, size, DMA_NONE); + header_buf = mlx5vf_get_data_buffer(migf, DIV_ROUND_UP(size, PAGE_SIZE), + DMA_NONE); if (IS_ERR(header_buf)) return PTR_ERR(header_buf); @@ -388,7 +358,7 @@ static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf, to_buff = kmap_local_page(page); memcpy(to_buff, &header, sizeof(header)); header_buf->length = sizeof(header); - data.stop_copy_size = cpu_to_le64(migf->buf[0]->allocated_length); + data.stop_copy_size = cpu_to_le64(migf->buf[0]->npages * PAGE_SIZE); memcpy(to_buff + sizeof(header), &data, sizeof(data)); header_buf->length += sizeof(data); kunmap_local(to_buff); @@ -437,15 +407,20 @@ static int mlx5vf_prep_stop_copy(struct mlx5vf_pci_core_device *mvdev, num_chunks = mvdev->chunk_mode ? MAX_NUM_CHUNKS : 1; for (i = 0; i < num_chunks; i++) { - buf = mlx5vf_get_data_buffer(migf, inc_state_size, DMA_FROM_DEVICE); + buf = mlx5vf_get_data_buffer( + migf, DIV_ROUND_UP(inc_state_size, PAGE_SIZE), + DMA_FROM_DEVICE); if (IS_ERR(buf)) { ret = PTR_ERR(buf); goto err; } migf->buf[i] = buf; - buf = mlx5vf_get_data_buffer(migf, - sizeof(struct mlx5_vf_migration_header), DMA_NONE); + buf = mlx5vf_get_data_buffer( + migf, + DIV_ROUND_UP(sizeof(struct mlx5_vf_migration_header), + PAGE_SIZE), + DMA_NONE); if (IS_ERR(buf)) { ret = PTR_ERR(buf); goto err; @@ -553,7 +528,8 @@ static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd, * We finished transferring the current state and the device has a * dirty state, save a new state to be ready for. 
*/ - buf = mlx5vf_get_data_buffer(migf, inc_length, DMA_FROM_DEVICE); + buf = mlx5vf_get_data_buffer(migf, DIV_ROUND_UP(inc_length, PAGE_SIZE), + DMA_FROM_DEVICE); if (IS_ERR(buf)) { ret = PTR_ERR(buf); mlx5vf_mark_err(migf); @@ -675,8 +651,8 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track) if (track) { /* leave the allocated buffer ready for the stop-copy phase */ - buf = mlx5vf_alloc_data_buffer(migf, - migf->buf[0]->allocated_length, DMA_FROM_DEVICE); + buf = mlx5vf_alloc_data_buffer(migf, migf->buf[0]->npages, + DMA_FROM_DEVICE); if (IS_ERR(buf)) { ret = PTR_ERR(buf); goto out_pd; @@ -917,11 +893,14 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf, goto out_unlock; break; case MLX5_VF_LOAD_STATE_PREP_HEADER_DATA: - if (vhca_buf_header->allocated_length < migf->record_size) { + { + u32 npages = DIV_ROUND_UP(migf->record_size, PAGE_SIZE); + + if (vhca_buf_header->npages < npages) { mlx5vf_free_data_buffer(vhca_buf_header); - migf->buf_header[0] = mlx5vf_alloc_data_buffer(migf, - migf->record_size, DMA_NONE); + migf->buf_header[0] = mlx5vf_alloc_data_buffer( + migf, npages, DMA_NONE); if (IS_ERR(migf->buf_header[0])) { ret = PTR_ERR(migf->buf_header[0]); migf->buf_header[0] = NULL; @@ -934,6 +913,7 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf, vhca_buf_header->start_pos = migf->max_pos; migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER_DATA; break; + } case MLX5_VF_LOAD_STATE_READ_HEADER_DATA: ret = mlx5vf_resume_read_header_data(migf, vhca_buf_header, &buf, &len, pos, &done); @@ -944,12 +924,13 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf, { u64 size = max(migf->record_size, migf->stop_copy_prep_size); + u32 npages = DIV_ROUND_UP(size, PAGE_SIZE); - if (vhca_buf->allocated_length < size) { + if (vhca_buf->npages < npages) { mlx5vf_free_data_buffer(vhca_buf); - migf->buf[0] = mlx5vf_alloc_data_buffer(migf, - size, DMA_TO_DEVICE); + migf->buf[0] = mlx5vf_alloc_data_buffer( + migf, npages, DMA_TO_DEVICE); if (IS_ERR(migf->buf[0])) { ret = PTR_ERR(migf->buf[0]); migf->buf[0] = NULL; @@ -1037,8 +1018,11 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev) } migf->buf[0] = buf; - buf = mlx5vf_alloc_data_buffer(migf, - sizeof(struct mlx5_vf_migration_header), DMA_NONE); + buf = mlx5vf_alloc_data_buffer( + migf, + DIV_ROUND_UP(sizeof(struct mlx5_vf_migration_header), + PAGE_SIZE), + DMA_NONE); if (IS_ERR(buf)) { ret = PTR_ERR(buf); goto out_buf; @@ -1148,7 +1132,8 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev, MLX5VF_QUERY_INC | MLX5VF_QUERY_CLEANUP); if (ret) return ERR_PTR(ret); - buf = mlx5vf_get_data_buffer(migf, size, DMA_FROM_DEVICE); + buf = mlx5vf_get_data_buffer(migf, + DIV_ROUND_UP(size, PAGE_SIZE), DMA_FROM_DEVICE); if (IS_ERR(buf)) return ERR_CAST(buf); /* pre_copy cleanup */ @@ -1387,6 +1372,7 @@ static const struct vfio_device_ops mlx5vf_pci_ops = { .mmap = vfio_pci_core_mmap, .request = vfio_pci_core_request, .match = vfio_pci_core_match, + .match_token_uuid = vfio_pci_core_match_token_uuid, .bind_iommufd = vfio_iommufd_physical_bind, .unbind_iommufd = vfio_iommufd_physical_unbind, .attach_ioas = vfio_iommufd_physical_attach_ioas, @@ -1446,7 +1432,7 @@ static struct pci_driver mlx5vf_pci_driver = { module_pci_driver(mlx5vf_pci_driver); -MODULE_IMPORT_NS(IOMMUFD); +MODULE_IMPORT_NS("IOMMUFD"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Max Gurtovoy <mgurtovoy@nvidia.com>"); MODULE_AUTHOR("Yishai Hadas 
<yishaih@nvidia.com>"); diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c index a467085038f0..d95761dcdd58 100644 --- a/drivers/vfio/pci/nvgrace-gpu/main.c +++ b/drivers/vfio/pci/nvgrace-gpu/main.c @@ -5,6 +5,8 @@ #include <linux/sizes.h> #include <linux/vfio_pci_core.h> +#include <linux/delay.h> +#include <linux/jiffies.h> /* * The device memory usable to the workloads running in the VM is cached @@ -17,12 +19,21 @@ #define RESMEM_REGION_INDEX VFIO_PCI_BAR2_REGION_INDEX #define USEMEM_REGION_INDEX VFIO_PCI_BAR4_REGION_INDEX -/* Memory size expected as non cached and reserved by the VM driver */ -#define RESMEM_SIZE SZ_1G - /* A hardwired and constant ABI value between the GPU FW and VFIO driver. */ #define MEMBLK_SIZE SZ_512M +#define DVSEC_BITMAP_OFFSET 0xA +#define MIG_SUPPORTED_WITH_CACHED_RESMEM BIT(0) + +#define GPU_CAP_DVSEC_REGISTER 3 + +#define C2C_LINK_BAR0_OFFSET 0x1498 +#define HBM_TRAINING_BAR0_OFFSET 0x200BC +#define STATUS_READY 0xFF + +#define POLL_QUANTUM_MS 1000 +#define POLL_TIMEOUT_MS (30 * 1000) + /* * The state of the two device memory region - resmem and usemem - is * saved as struct mem_region. @@ -46,6 +57,7 @@ struct nvgrace_gpu_pci_core_device { struct mem_region resmem; /* Lock to control device memory kernel mapping */ struct mutex remap_lock; + bool has_mig_hw_bug; }; static void nvgrace_gpu_init_fake_bar_emu_regs(struct vfio_device *core_vdev) @@ -66,7 +78,7 @@ nvgrace_gpu_memregion(int index, if (index == USEMEM_REGION_INDEX) return &nvdev->usemem; - if (index == RESMEM_REGION_INDEX) + if (nvdev->resmem.memlength && index == RESMEM_REGION_INDEX) return &nvdev->resmem; return NULL; @@ -684,6 +696,7 @@ static const struct vfio_device_ops nvgrace_gpu_pci_ops = { .mmap = nvgrace_gpu_mmap, .request = vfio_pci_core_request, .match = vfio_pci_core_match, + .match_token_uuid = vfio_pci_core_match_token_uuid, .bind_iommufd = vfio_iommufd_physical_bind, .unbind_iommufd = vfio_iommufd_physical_unbind, .attach_ioas = vfio_iommufd_physical_attach_ioas, @@ -703,6 +716,7 @@ static const struct vfio_device_ops nvgrace_gpu_pci_core_ops = { .mmap = vfio_pci_core_mmap, .request = vfio_pci_core_request, .match = vfio_pci_core_match, + .match_token_uuid = vfio_pci_core_match_token_uuid, .bind_iommufd = vfio_iommufd_physical_bind, .unbind_iommufd = vfio_iommufd_physical_unbind, .attach_ioas = vfio_iommufd_physical_attach_ioas, @@ -751,40 +765,67 @@ nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev, u64 memphys, u64 memlength) { int ret = 0; + u64 resmem_size = 0; /* - * The VM GPU device driver needs a non-cacheable region to support - * the MIG feature. Since the device memory is mapped as NORMAL cached, - * carve out a region from the end with a different NORMAL_NC - * property (called as reserved memory and represented as resmem). This - * region then is exposed as a 64b BAR (region 2 and 3) to the VM, while - * exposing the rest (termed as usable memory and represented using usemem) - * as cacheable 64b BAR (region 4 and 5). + * On Grace Hopper systems, the VM GPU device driver needs a non-cacheable + * region to support the MIG feature owing to a hardware bug. Since the + * device memory is mapped as NORMAL cached, carve out a region from the end + * with a different NORMAL_NC property (called as reserved memory and + * represented as resmem). This region then is exposed as a 64b BAR + * (region 2 and 3) to the VM, while exposing the rest (termed as usable + * memory and represented using usemem) as cacheable 64b BAR (region 4 and 5). 
* * devmem (memlength) * |-------------------------------------------------| * | | * usemem.memphys resmem.memphys + * + * This hardware bug is fixed on the Grace Blackwell platforms and the + * presence of the bug can be determined through nvdev->has_mig_hw_bug. + * Thus on systems with the hardware fix, there is no need to partition + * the GPU device memory and the entire memory is usable and mapped as + * NORMAL cached (i.e. resmem size is 0). */ + if (nvdev->has_mig_hw_bug) + resmem_size = SZ_1G; + nvdev->usemem.memphys = memphys; /* * The device memory exposed to the VM is added to the kernel by the - * VM driver module in chunks of memory block size. Only the usable - * memory (usemem) is added to the kernel for usage by the VM - * workloads. Make the usable memory size memblock aligned. + * VM driver module in chunks of memory block size. Note that only the + * usable memory (usemem) is added to the kernel for usage by the VM + * workloads. */ - if (check_sub_overflow(memlength, RESMEM_SIZE, + if (check_sub_overflow(memlength, resmem_size, &nvdev->usemem.memlength)) { ret = -EOVERFLOW; goto done; } /* - * The USEMEM part of the device memory has to be MEMBLK_SIZE - * aligned. This is a hardwired ABI value between the GPU FW and - * VFIO driver. The VM device driver is also aware of it and make - * use of the value for its calculation to determine USEMEM size. + * The usemem region is exposed as a 64B Bar composed of region 4 and 5. + * Calculate and save the BAR size for the region. + */ + nvdev->usemem.bar_size = roundup_pow_of_two(nvdev->usemem.memlength); + + /* + * If the hardware has the fix for MIG, there is no requirement + * for splitting the device memory to create RESMEM. The entire + * device memory is usable and will be USEMEM. Return here for + * such case. + */ + if (!nvdev->has_mig_hw_bug) + goto done; + + /* + * When the device memory is split to workaround the MIG bug on + * Grace Hopper, the USEMEM part of the device memory has to be + * MEMBLK_SIZE aligned. This is a hardwired ABI value between the + * GPU FW and VFIO driver. The VM device driver is also aware of it + * and make use of the value for its calculation to determine USEMEM + * size. Note that the device memory may not be 512M aligned. */ nvdev->usemem.memlength = round_down(nvdev->usemem.memlength, MEMBLK_SIZE); @@ -803,15 +844,93 @@ nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev, } /* - * The memory regions are exposed as BARs. Calculate and save - * the BAR size for them. + * The resmem region is exposed as a 64b BAR composed of region 2 and 3 + * for Grace Hopper. Calculate and save the BAR size for the region. */ - nvdev->usemem.bar_size = roundup_pow_of_two(nvdev->usemem.memlength); nvdev->resmem.bar_size = roundup_pow_of_two(nvdev->resmem.memlength); done: return ret; } +static bool nvgrace_gpu_has_mig_hw_bug(struct pci_dev *pdev) +{ + int pcie_dvsec; + u16 dvsec_ctrl16; + + pcie_dvsec = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_NVIDIA, + GPU_CAP_DVSEC_REGISTER); + + if (pcie_dvsec) { + pci_read_config_word(pdev, + pcie_dvsec + DVSEC_BITMAP_OFFSET, + &dvsec_ctrl16); + + if (dvsec_ctrl16 & MIG_SUPPORTED_WITH_CACHED_RESMEM) + return false; + } + + return true; +} + +/* + * To reduce the system bootup time, the HBM training has + * been moved out of the UEFI on the Grace-Blackwell systems. + * + * The onus of checking whether the HBM training has completed + * thus falls on the module. The HBM training status can be + * determined from a BAR0 register. 
+ * + * Similarly, another BAR0 register exposes the status of the + * CPU-GPU chip-to-chip (C2C) cache coherent interconnect. + * + * Poll these register and check for 30s. If the HBM training is + * not complete or if the C2C link is not ready, fail the probe. + * + * While the wait is not required on Grace Hopper systems, it + * is beneficial to make the check to ensure the device is in an + * expected state. + * + * Ensure that the BAR0 region is enabled before accessing the + * registers. + */ +static int nvgrace_gpu_wait_device_ready(struct pci_dev *pdev) +{ + unsigned long timeout = jiffies + msecs_to_jiffies(POLL_TIMEOUT_MS); + void __iomem *io; + int ret = -ETIME; + + ret = pci_enable_device(pdev); + if (ret) + return ret; + + ret = pci_request_selected_regions(pdev, 1 << 0, KBUILD_MODNAME); + if (ret) + goto request_region_exit; + + io = pci_iomap(pdev, 0, 0); + if (!io) { + ret = -ENOMEM; + goto iomap_exit; + } + + do { + if ((ioread32(io + C2C_LINK_BAR0_OFFSET) == STATUS_READY) && + (ioread32(io + HBM_TRAINING_BAR0_OFFSET) == STATUS_READY)) { + ret = 0; + goto reg_check_exit; + } + msleep(POLL_QUANTUM_MS); + } while (!time_after(jiffies, timeout)); + +reg_check_exit: + pci_iounmap(pdev, io); +iomap_exit: + pci_release_selected_regions(pdev, 1 << 0); +request_region_exit: + pci_disable_device(pdev); + return ret; +} + static int nvgrace_gpu_probe(struct pci_dev *pdev, const struct pci_device_id *id) { @@ -820,6 +939,10 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev, u64 memphys, memlength; int ret; + ret = nvgrace_gpu_wait_device_ready(pdev); + if (ret) + return ret; + ret = nvgrace_gpu_fetch_memory_property(pdev, &memphys, &memlength); if (!ret) ops = &nvgrace_gpu_pci_ops; @@ -832,6 +955,8 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev, dev_set_drvdata(&pdev->dev, &nvdev->core_device); if (ops == &nvgrace_gpu_pci_ops) { + nvdev->has_mig_hw_bug = nvgrace_gpu_has_mig_hw_bug(pdev); + /* * Device memory properties are identified in the host ACPI * table. Set the nvgrace_gpu_pci_core_device structure. 
@@ -868,6 +993,8 @@ static const struct pci_device_id nvgrace_gpu_vfio_pci_table[] = { { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2345) }, /* GH200 SKU */ { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2348) }, + /* GB200 SKU */ + { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2941) }, {} }; diff --git a/drivers/vfio/pci/pds/pci_drv.c b/drivers/vfio/pci/pds/pci_drv.c index 16e93b11ab1b..4923f1823126 100644 --- a/drivers/vfio/pci/pds/pci_drv.c +++ b/drivers/vfio/pci/pds/pci_drv.c @@ -187,7 +187,7 @@ static struct pci_driver pds_vfio_pci_driver = { module_pci_driver(pds_vfio_pci_driver); -MODULE_IMPORT_NS(IOMMUFD); +MODULE_IMPORT_NS("IOMMUFD"); MODULE_DESCRIPTION(PDS_VFIO_DRV_DESCRIPTION); MODULE_AUTHOR("Brett Creeley <brett.creeley@amd.com>"); MODULE_LICENSE("GPL"); diff --git a/drivers/vfio/pci/pds/vfio_dev.c b/drivers/vfio/pci/pds/vfio_dev.c index 76a80ae7087b..f3ccb0008f67 100644 --- a/drivers/vfio/pci/pds/vfio_dev.c +++ b/drivers/vfio/pci/pds/vfio_dev.c @@ -201,9 +201,11 @@ static const struct vfio_device_ops pds_vfio_ops = { .mmap = vfio_pci_core_mmap, .request = vfio_pci_core_request, .match = vfio_pci_core_match, + .match_token_uuid = vfio_pci_core_match_token_uuid, .bind_iommufd = vfio_iommufd_physical_bind, .unbind_iommufd = vfio_iommufd_physical_unbind, .attach_ioas = vfio_iommufd_physical_attach_ioas, + .detach_ioas = vfio_iommufd_physical_detach_ioas, }; const struct vfio_device_ops *pds_vfio_ops_info(void) diff --git a/drivers/vfio/pci/qat/main.c b/drivers/vfio/pci/qat/main.c index c78cb6de9390..a19b68043eb2 100644 --- a/drivers/vfio/pci/qat/main.c +++ b/drivers/vfio/pci/qat/main.c @@ -614,6 +614,7 @@ static const struct vfio_device_ops qat_vf_pci_ops = { .mmap = vfio_pci_core_mmap, .request = vfio_pci_core_request, .match = vfio_pci_core_match, + .match_token_uuid = vfio_pci_core_match_token_uuid, .bind_iommufd = vfio_iommufd_physical_bind, .unbind_iommufd = vfio_iommufd_physical_unbind, .attach_ioas = vfio_iommufd_physical_attach_ioas, @@ -675,6 +676,8 @@ static const struct pci_device_id qat_vf_vfio_pci_table[] = { { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_INTEL, 0x4941) }, { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_INTEL, 0x4943) }, { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_INTEL, 0x4945) }, + /* Intel QAT GEN6 6xxx VF device */ + { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_INTEL, 0x4949) }, {} }; MODULE_DEVICE_TABLE(pci, qat_vf_vfio_pci_table); @@ -696,5 +699,5 @@ module_pci_driver(qat_vf_vfio_pci_driver); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Xin Zeng <xin.zeng@intel.com>"); -MODULE_DESCRIPTION("QAT VFIO PCI - VFIO PCI driver with live migration support for Intel(R) QAT GEN4 device family"); -MODULE_IMPORT_NS(CRYPTO_QAT); +MODULE_DESCRIPTION("QAT VFIO PCI - VFIO PCI driver with live migration support for Intel(R) QAT device family"); +MODULE_IMPORT_NS("CRYPTO_QAT"); diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index e727941f589d..ac10f14417f2 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -111,9 +111,7 @@ static int vfio_pci_open_device(struct vfio_device *core_vdev) if (ret) return ret; - if (vfio_pci_is_vga(pdev) && - pdev->vendor == PCI_VENDOR_ID_INTEL && - IS_ENABLED(CONFIG_VFIO_PCI_IGD)) { + if (vfio_pci_is_intel_display(pdev)) { ret = vfio_pci_igd_init(vdev); if (ret && ret != -ENODEV) { pci_warn(pdev, "Failed to setup Intel IGD regions\n"); @@ -140,10 +138,13 @@ static const struct vfio_device_ops vfio_pci_ops = { .mmap = vfio_pci_core_mmap, .request = 
vfio_pci_core_request, .match = vfio_pci_core_match, + .match_token_uuid = vfio_pci_core_match_token_uuid, .bind_iommufd = vfio_iommufd_physical_bind, .unbind_iommufd = vfio_iommufd_physical_unbind, .attach_ioas = vfio_iommufd_physical_attach_ioas, .detach_ioas = vfio_iommufd_physical_detach_ioas, + .pasid_attach_ioas = vfio_iommufd_physical_pasid_attach_ioas, + .pasid_detach_ioas = vfio_iommufd_physical_pasid_detach_ioas, }; static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c index ea2745c1ac5e..8f02f236b5b4 100644 --- a/drivers/vfio/pci/vfio_pci_config.c +++ b/drivers/vfio/pci/vfio_pci_config.c @@ -511,13 +511,13 @@ static void vfio_bar_fixup(struct vfio_pci_core_device *vdev) mask = ~(pci_resource_len(pdev, PCI_ROM_RESOURCE) - 1); mask |= PCI_ROM_ADDRESS_ENABLE; *vbar &= cpu_to_le32((u32)mask); - } else if (pdev->resource[PCI_ROM_RESOURCE].flags & - IORESOURCE_ROM_SHADOW) { - mask = ~(0x20000 - 1); + } else if (pdev->rom && pdev->romlen) { + mask = ~(roundup_pow_of_two(pdev->romlen) - 1); mask |= PCI_ROM_ADDRESS_ENABLE; *vbar &= cpu_to_le32((u32)mask); - } else + } else { *vbar = 0; + } vdev->bardirty = false; } @@ -1389,11 +1389,12 @@ static int vfio_ext_cap_len(struct vfio_pci_core_device *vdev, u16 ecap, u16 epo switch (ecap) { case PCI_EXT_CAP_ID_VNDR: - ret = pci_read_config_dword(pdev, epos + PCI_VSEC_HDR, &dword); + ret = pci_read_config_dword(pdev, epos + PCI_VNDR_HEADER, + &dword); if (ret) return pcibios_err_to_errno(ret); - return dword >> PCI_VSEC_HDR_LEN_SHIFT; + return PCI_VNDR_HEADER_LEN(dword); case PCI_EXT_CAP_ID_VC: case PCI_EXT_CAP_ID_VC9: case PCI_EXT_CAP_ID_MFVC: @@ -1813,7 +1814,8 @@ int vfio_config_init(struct vfio_pci_core_device *vdev) cpu_to_le16(PCI_COMMAND_MEMORY); } - if (!IS_ENABLED(CONFIG_VFIO_PCI_INTX) || vdev->nointx) + if (!IS_ENABLED(CONFIG_VFIO_PCI_INTX) || vdev->nointx || + !vdev->pdev->irq || vdev->pdev->irq == IRQ_NOTCONNECTED) vconfig[PCI_INTERRUPT_PIN] = 0; ret = vfio_cap_init(vdev); diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 1ab58da9f38a..7dcf5439dedc 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -20,7 +20,6 @@ #include <linux/mutex.h> #include <linux/notifier.h> #include <linux/pci.h> -#include <linux/pfn_t.h> #include <linux/pm_runtime.h> #include <linux/slab.h> #include <linux/types.h> @@ -116,7 +115,7 @@ static void vfio_pci_probe_mmaps(struct vfio_pci_core_device *vdev) res = &vdev->pdev->resource[bar]; - if (!IS_ENABLED(CONFIG_VFIO_PCI_MMAP)) + if (vdev->pdev->non_mappable_bars) goto no_mmap; if (!(res->flags & IORESOURCE_MEM)) @@ -727,15 +726,7 @@ EXPORT_SYMBOL_GPL(vfio_pci_core_finish_enable); static int vfio_pci_get_irq_count(struct vfio_pci_core_device *vdev, int irq_type) { if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) { - u8 pin; - - if (!IS_ENABLED(CONFIG_VFIO_PCI_INTX) || - vdev->nointx || vdev->pdev->is_virtfn) - return 0; - - pci_read_config_byte(vdev->pdev, PCI_INTERRUPT_PIN, &pin); - - return pin ? 1 : 0; + return vdev->vconfig[PCI_INTERRUPT_PIN] ? 
1 : 0; } else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX) { u8 pos; u16 flags; @@ -1054,31 +1045,27 @@ static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev, info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); info.flags = 0; + info.size = 0; - /* Report the BAR size, not the ROM size */ - info.size = pci_resource_len(pdev, info.index); - if (!info.size) { - /* Shadow ROMs appear as PCI option ROMs */ - if (pdev->resource[PCI_ROM_RESOURCE].flags & - IORESOURCE_ROM_SHADOW) - info.size = 0x20000; - else - break; - } - - /* - * Is it really there? Enable memory decode for implicit access - * in pci_map_rom(). - */ - cmd = vfio_pci_memory_lock_and_enable(vdev); - io = pci_map_rom(pdev, &size); - if (io) { + if (pci_resource_start(pdev, PCI_ROM_RESOURCE)) { + /* + * Check ROM content is valid. Need to enable memory + * decode for ROM access in pci_map_rom(). + */ + cmd = vfio_pci_memory_lock_and_enable(vdev); + io = pci_map_rom(pdev, &size); + if (io) { + info.flags = VFIO_REGION_INFO_FLAG_READ; + /* Report the BAR size, not the ROM size. */ + info.size = pci_resource_len(pdev, PCI_ROM_RESOURCE); + pci_unmap_rom(pdev, io); + } + vfio_pci_memory_unlock_and_restore(vdev, cmd); + } else if (pdev->rom && pdev->romlen) { info.flags = VFIO_REGION_INFO_FLAG_READ; - pci_unmap_rom(pdev, io); - } else { - info.size = 0; + /* Report BAR size as power of two. */ + info.size = roundup_pow_of_two(pdev->romlen); } - vfio_pci_memory_unlock_and_restore(vdev, cmd); break; } @@ -1658,17 +1645,18 @@ static vm_fault_t vfio_pci_mmap_huge_fault(struct vm_fault *vmf, { struct vm_area_struct *vma = vmf->vma; struct vfio_pci_core_device *vdev = vma->vm_private_data; - unsigned long pfn, pgoff = vmf->pgoff - vma->vm_pgoff; + unsigned long addr = vmf->address & ~((PAGE_SIZE << order) - 1); + unsigned long pgoff = (addr - vma->vm_start) >> PAGE_SHIFT; + unsigned long pfn = vma_to_pfn(vma) + pgoff; vm_fault_t ret = VM_FAULT_SIGBUS; - if (order && (vmf->address & ((PAGE_SIZE << order) - 1) || - vmf->address + (PAGE_SIZE << order) > vma->vm_end)) { + if (order && (addr < vma->vm_start || + addr + (PAGE_SIZE << order) > vma->vm_end || + pfn & ((1 << order) - 1))) { ret = VM_FAULT_FALLBACK; goto out; } - pfn = vma_to_pfn(vma); - down_read(&vdev->memory_lock); if (vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev)) @@ -1676,18 +1664,16 @@ static vm_fault_t vfio_pci_mmap_huge_fault(struct vm_fault *vmf, switch (order) { case 0: - ret = vmf_insert_pfn(vma, vmf->address, pfn + pgoff); + ret = vmf_insert_pfn(vma, vmf->address, pfn); break; #ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP case PMD_ORDER: - ret = vmf_insert_pfn_pmd(vmf, __pfn_to_pfn_t(pfn + pgoff, - PFN_DEV), false); + ret = vmf_insert_pfn_pmd(vmf, pfn, false); break; #endif #ifdef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP case PUD_ORDER: - ret = vmf_insert_pfn_pud(vmf, __pfn_to_pfn_t(pfn + pgoff, - PFN_DEV), false); + ret = vmf_insert_pfn_pud(vmf, pfn, false); break; #endif default: @@ -1832,9 +1818,13 @@ void vfio_pci_core_request(struct vfio_device *core_vdev, unsigned int count) } EXPORT_SYMBOL_GPL(vfio_pci_core_request); -static int vfio_pci_validate_vf_token(struct vfio_pci_core_device *vdev, - bool vf_token, uuid_t *uuid) +int vfio_pci_core_match_token_uuid(struct vfio_device *core_vdev, + const uuid_t *uuid) + { + struct vfio_pci_core_device *vdev = + container_of(core_vdev, struct vfio_pci_core_device, vdev); + /* * There's always some degree of trust or collaboration between SR-IOV * PF and VFs, even if just that the PF hosts the SR-IOV capability 
and @@ -1865,7 +1855,7 @@ static int vfio_pci_validate_vf_token(struct vfio_pci_core_device *vdev, bool match; if (!pf_vdev) { - if (!vf_token) + if (!uuid) return 0; /* PF is not vfio-pci, no VF token */ pci_info_ratelimited(vdev->pdev, @@ -1873,7 +1863,7 @@ static int vfio_pci_validate_vf_token(struct vfio_pci_core_device *vdev, return -EINVAL; } - if (!vf_token) { + if (!uuid) { pci_info_ratelimited(vdev->pdev, "VF token required to access device\n"); return -EACCES; @@ -1891,7 +1881,7 @@ static int vfio_pci_validate_vf_token(struct vfio_pci_core_device *vdev, } else if (vdev->vf_token) { mutex_lock(&vdev->vf_token->lock); if (vdev->vf_token->users) { - if (!vf_token) { + if (!uuid) { mutex_unlock(&vdev->vf_token->lock); pci_info_ratelimited(vdev->pdev, "VF token required to access device\n"); @@ -1904,12 +1894,12 @@ static int vfio_pci_validate_vf_token(struct vfio_pci_core_device *vdev, "Incorrect VF token provided for device\n"); return -EACCES; } - } else if (vf_token) { + } else if (uuid) { uuid_copy(&vdev->vf_token->uuid, uuid); } mutex_unlock(&vdev->vf_token->lock); - } else if (vf_token) { + } else if (uuid) { pci_info_ratelimited(vdev->pdev, "VF token incorrectly provided, not a PF or VF\n"); return -EINVAL; @@ -1917,6 +1907,7 @@ static int vfio_pci_validate_vf_token(struct vfio_pci_core_device *vdev, return 0; } +EXPORT_SYMBOL_GPL(vfio_pci_core_match_token_uuid); #define VF_TOKEN_ARG "vf_token=" @@ -1963,7 +1954,8 @@ int vfio_pci_core_match(struct vfio_device *core_vdev, char *buf) } } - ret = vfio_pci_validate_vf_token(vdev, vf_token, &uuid); + ret = core_vdev->ops->match_token_uuid(core_vdev, + vf_token ? &uuid : NULL); if (ret) return ret; @@ -2160,7 +2152,7 @@ int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev) return -EBUSY; } - if (pci_is_root_bus(pdev->bus)) { + if (pci_is_root_bus(pdev->bus) || pdev->is_virtfn) { ret = vfio_assign_device_set(&vdev->vdev, vdev); } else if (!pci_probe_reset_slot(pdev->slot)) { ret = vfio_assign_device_set(&vdev->vdev, pdev->slot); diff --git a/drivers/vfio/pci/vfio_pci_igd.c b/drivers/vfio/pci/vfio_pci_igd.c index dd70e2431bd7..988b6919c2c3 100644 --- a/drivers/vfio/pci/vfio_pci_igd.c +++ b/drivers/vfio/pci/vfio_pci_igd.c @@ -435,6 +435,11 @@ static int vfio_pci_igd_cfg_init(struct vfio_pci_core_device *vdev) return 0; } +bool vfio_pci_is_intel_display(struct pci_dev *pdev) +{ + return (pdev->vendor == PCI_VENDOR_ID_INTEL) && pci_is_display(pdev); +} + int vfio_pci_igd_init(struct vfio_pci_core_device *vdev) { int ret; diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c index 8382c5834335..123298a4dc8f 100644 --- a/drivers/vfio/pci/vfio_pci_intrs.c +++ b/drivers/vfio/pci/vfio_pci_intrs.c @@ -259,7 +259,7 @@ static int vfio_intx_enable(struct vfio_pci_core_device *vdev, if (!is_irq_none(vdev)) return -EINVAL; - if (!pdev->irq) + if (!pdev->irq || pdev->irq == IRQ_NOTCONNECTED) return -ENODEV; name = kasprintf(GFP_KERNEL_ACCOUNT, "vfio-intx(%s)", pci_name(pdev)); @@ -505,15 +505,11 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_core_device *vdev, if (ret) goto out_put_eventfd_ctx; - ctx->producer.token = trigger; - ctx->producer.irq = irq; - ret = irq_bypass_register_producer(&ctx->producer); + ret = irq_bypass_register_producer(&ctx->producer, trigger, irq); if (unlikely(ret)) { dev_info(&pdev->dev, - "irq bypass producer (token %p) registration fails: %d\n", - ctx->producer.token, ret); - - ctx->producer.token = NULL; + "irq bypass producer (eventfd %p) registration fails: %d\n", 
+ trigger, ret); } ctx->trigger = trigger; diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h index 5e4fa69aee16..a9972eacb293 100644 --- a/drivers/vfio/pci/vfio_pci_priv.h +++ b/drivers/vfio/pci/vfio_pci_priv.h @@ -67,8 +67,14 @@ void vfio_pci_memory_unlock_and_restore(struct vfio_pci_core_device *vdev, u16 cmd); #ifdef CONFIG_VFIO_PCI_IGD +bool vfio_pci_is_intel_display(struct pci_dev *pdev); int vfio_pci_igd_init(struct vfio_pci_core_device *vdev); #else +static inline bool vfio_pci_is_intel_display(struct pci_dev *pdev) +{ + return false; +} + static inline int vfio_pci_igd_init(struct vfio_pci_core_device *vdev) { return -ENODEV; diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c index 66b72c289284..6192788c8ba3 100644 --- a/drivers/vfio/pci/vfio_pci_rdwr.c +++ b/drivers/vfio/pci/vfio_pci_rdwr.c @@ -16,6 +16,7 @@ #include <linux/io.h> #include <linux/vfio.h> #include <linux/vgaarb.h> +#include <linux/io-64-nonatomic-lo-hi.h> #include "vfio_pci_priv.h" @@ -61,9 +62,7 @@ EXPORT_SYMBOL_GPL(vfio_pci_core_iowrite##size); VFIO_IOWRITE(8) VFIO_IOWRITE(16) VFIO_IOWRITE(32) -#ifdef iowrite64 VFIO_IOWRITE(64) -#endif #define VFIO_IOREAD(size) \ int vfio_pci_core_ioread##size(struct vfio_pci_core_device *vdev, \ @@ -89,9 +88,7 @@ EXPORT_SYMBOL_GPL(vfio_pci_core_ioread##size); VFIO_IOREAD(8) VFIO_IOREAD(16) VFIO_IOREAD(32) -#ifdef ioread64 VFIO_IOREAD(64) -#endif #define VFIO_IORDWR(size) \ static int vfio_pci_iordwr##size(struct vfio_pci_core_device *vdev,\ @@ -127,9 +124,7 @@ static int vfio_pci_iordwr##size(struct vfio_pci_core_device *vdev,\ VFIO_IORDWR(8) VFIO_IORDWR(16) VFIO_IORDWR(32) -#if defined(ioread64) && defined(iowrite64) VFIO_IORDWR(64) -#endif /* * Read or write from an __iomem region (MMIO or I/O port) with an excluded @@ -155,7 +150,6 @@ ssize_t vfio_pci_core_do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem, else fillable = 0; -#if defined(ioread64) && defined(iowrite64) if (fillable >= 8 && !(off % 8)) { ret = vfio_pci_iordwr64(vdev, iswrite, test_mem, io, buf, off, &filled); @@ -163,7 +157,6 @@ ssize_t vfio_pci_core_do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem, return ret; } else -#endif if (fillable >= 4 && !(off % 4)) { ret = vfio_pci_iordwr32(vdev, iswrite, test_mem, io, buf, off, &filled); @@ -244,9 +237,8 @@ ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf, if (pci_resource_start(pdev, bar)) end = pci_resource_len(pdev, bar); - else if (bar == PCI_ROM_RESOURCE && - pdev->resource[bar].flags & IORESOURCE_ROM_SHADOW) - end = 0x20000; + else if (bar == PCI_ROM_RESOURCE && pdev->rom && pdev->romlen) + end = roundup_pow_of_two(pdev->romlen); else return -EINVAL; @@ -261,11 +253,14 @@ ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf, * excluded range at the end of the actual ROM. This makes * filling large ROM BARs much faster. 
*/ - io = pci_map_rom(pdev, &x_start); - if (!io) { - done = -ENOMEM; - goto out; + if (pci_resource_start(pdev, bar)) { + io = pci_map_rom(pdev, &x_start); + } else { + io = ioremap(pdev->rom, pdev->romlen); + x_start = pdev->romlen; } + if (!io) + return -ENOMEM; x_end = end; } else { int ret = vfio_pci_core_setup_barmap(vdev, bar); @@ -288,8 +283,13 @@ ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf, if (done >= 0) *ppos += done; - if (bar == PCI_ROM_RESOURCE) - pci_unmap_rom(pdev, io); + if (bar == PCI_ROM_RESOURCE) { + if (pci_resource_start(pdev, bar)) + pci_unmap_rom(pdev, io); + else + iounmap(io); + } + out: return done; } @@ -381,12 +381,10 @@ static void vfio_pci_ioeventfd_do_write(struct vfio_pci_ioeventfd *ioeventfd, vfio_pci_core_iowrite32(ioeventfd->vdev, test_mem, ioeventfd->data, ioeventfd->addr); break; -#ifdef iowrite64 case 8: vfio_pci_core_iowrite64(ioeventfd->vdev, test_mem, ioeventfd->data, ioeventfd->addr); break; -#endif } } @@ -440,10 +438,8 @@ int vfio_pci_ioeventfd(struct vfio_pci_core_device *vdev, loff_t offset, pos >= vdev->msix_offset + vdev->msix_size)) return -EINVAL; -#ifndef iowrite64 if (count == 8) return -EINVAL; -#endif ret = vfio_pci_core_setup_barmap(vdev, bar); if (ret) diff --git a/drivers/vfio/pci/virtio/Kconfig b/drivers/vfio/pci/virtio/Kconfig index 2770f7eb702c..33e04e65bec6 100644 --- a/drivers/vfio/pci/virtio/Kconfig +++ b/drivers/vfio/pci/virtio/Kconfig @@ -1,11 +1,11 @@ # SPDX-License-Identifier: GPL-2.0-only config VIRTIO_VFIO_PCI - tristate "VFIO support for VIRTIO NET PCI VF devices" + tristate "VFIO support for VIRTIO PCI VF devices" depends on VIRTIO_PCI select VFIO_PCI_CORE help - This provides migration support for VIRTIO NET PCI VF devices - using the VFIO framework. Migration support requires the + This provides migration support for VIRTIO NET and BLOCK PCI VF + devices using the VFIO framework. Migration support requires the SR-IOV PF device to support specific VIRTIO extensions, otherwise this driver provides no additional functionality beyond vfio-pci. 
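The vfio_pci_config.c and vfio_pci_rdwr.c hunks above stop hard-coding a 0x20000 shadow-ROM size and instead size the virtual ROM BAR from pdev->romlen, rounded up to a power of two so the BAR size mask stays valid. Below is a minimal user-space sketch of that sizing arithmetic only; roundup_pow_of_two() and PCI_ROM_ADDRESS_ENABLE are local stand-ins for the kernel definitions, not the kernel implementation itself.

#include <stdint.h>
#include <stdio.h>

#define PCI_ROM_ADDRESS_ENABLE 0x1u	/* stand-in for the PCI header bit */

/* Stand-in for the kernel's roundup_pow_of_two(); assumes n > 0. */
static uint64_t roundup_pow_of_two(uint64_t n)
{
	uint64_t p = 1;

	while (p < n)
		p <<= 1;
	return p;
}

/*
 * Model of the ROM-BAR fixup: mask a guest-written ROM BAR value to the
 * power-of-two region size while preserving the enable bit, as the
 * vfio_bar_fixup() hunk above does for pdev->rom/pdev->romlen devices.
 */
static uint32_t rom_bar_fixup(uint32_t guest_val, uint64_t romlen)
{
	uint64_t size = roundup_pow_of_two(romlen);
	uint32_t mask = ~((uint32_t)size - 1) | PCI_ROM_ADDRESS_ENABLE;

	return guest_val & mask;
}

int main(void)
{
	/* e.g. an 84 KB shadow ROM is reported as a 128 KB region */
	printf("size=%llu\n",
	       (unsigned long long)roundup_pow_of_two(84 * 1024));
	printf("bar=0x%08x\n",
	       (unsigned int)rom_bar_fixup(0xfffe0001, 84 * 1024));
	return 0;
}

In this model an 84 KB shadow ROM is exposed as a 128 KB BAR, and guest writes to the ROM BAR are masked to that size with the enable bit kept intact.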
diff --git a/drivers/vfio/pci/virtio/legacy_io.c b/drivers/vfio/pci/virtio/legacy_io.c index 20382ee15fac..832af5ba267c 100644 --- a/drivers/vfio/pci/virtio/legacy_io.c +++ b/drivers/vfio/pci/virtio/legacy_io.c @@ -382,7 +382,9 @@ static bool virtiovf_bar0_exists(struct pci_dev *pdev) bool virtiovf_support_legacy_io(struct pci_dev *pdev) { - return virtio_pci_admin_has_legacy_io(pdev) && !virtiovf_bar0_exists(pdev); + /* For now, the legacy IO functionality is supported only for virtio-net */ + return pdev->device == 0x1041 && virtio_pci_admin_has_legacy_io(pdev) && + !virtiovf_bar0_exists(pdev); } int virtiovf_init_legacy_io(struct virtiovf_pci_core_device *virtvdev) diff --git a/drivers/vfio/pci/virtio/main.c b/drivers/vfio/pci/virtio/main.c index d534d48c4163..8084f3e36a9f 100644 --- a/drivers/vfio/pci/virtio/main.c +++ b/drivers/vfio/pci/virtio/main.c @@ -94,6 +94,7 @@ static const struct vfio_device_ops virtiovf_vfio_pci_lm_ops = { .mmap = vfio_pci_core_mmap, .request = vfio_pci_core_request, .match = vfio_pci_core_match, + .match_token_uuid = vfio_pci_core_match_token_uuid, .bind_iommufd = vfio_iommufd_physical_bind, .unbind_iommufd = vfio_iommufd_physical_unbind, .attach_ioas = vfio_iommufd_physical_attach_ioas, @@ -114,6 +115,7 @@ static const struct vfio_device_ops virtiovf_vfio_pci_tran_lm_ops = { .mmap = vfio_pci_core_mmap, .request = vfio_pci_core_request, .match = vfio_pci_core_match, + .match_token_uuid = vfio_pci_core_match_token_uuid, .bind_iommufd = vfio_iommufd_physical_bind, .unbind_iommufd = vfio_iommufd_physical_unbind, .attach_ioas = vfio_iommufd_physical_attach_ioas, @@ -134,6 +136,7 @@ static const struct vfio_device_ops virtiovf_vfio_pci_ops = { .mmap = vfio_pci_core_mmap, .request = vfio_pci_core_request, .match = vfio_pci_core_match, + .match_token_uuid = vfio_pci_core_match_token_uuid, .bind_iommufd = vfio_iommufd_physical_bind, .unbind_iommufd = vfio_iommufd_physical_unbind, .attach_ioas = vfio_iommufd_physical_attach_ioas, @@ -187,8 +190,9 @@ static void virtiovf_pci_remove(struct pci_dev *pdev) } static const struct pci_device_id virtiovf_pci_table[] = { - /* Only virtio-net is supported/tested so far */ + /* Only virtio-net and virtio-block are supported/tested so far */ { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_REDHAT_QUMRANET, 0x1041) }, + { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_REDHAT_QUMRANET, 0x1042) }, {} }; @@ -221,4 +225,4 @@ module_pci_driver(virtiovf_pci_driver); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Yishai Hadas <yishaih@nvidia.com>"); MODULE_DESCRIPTION( - "VIRTIO VFIO PCI - User Level meta-driver for VIRTIO NET devices"); + "VIRTIO VFIO PCI - User Level meta-driver for VIRTIO NET and BLOCK devices"); diff --git a/drivers/vfio/pci/virtio/migrate.c b/drivers/vfio/pci/virtio/migrate.c index ee54f4c17857..ba92bb4e9af9 100644 --- a/drivers/vfio/pci/virtio/migrate.c +++ b/drivers/vfio/pci/virtio/migrate.c @@ -77,8 +77,8 @@ static int virtiovf_add_migration_pages(struct virtiovf_data_buffer *buf, return -ENOMEM; do { - filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT, to_fill, - page_list); + filled = alloc_pages_bulk(GFP_KERNEL_ACCOUNT, to_fill, + page_list); if (!filled) { ret = -ENOMEM; goto err; @@ -112,7 +112,7 @@ static void virtiovf_free_data_buffer(struct virtiovf_data_buffer *buf) { struct sg_page_iter sg_iter; - /* Undo alloc_pages_bulk_array() */ + /* Undo alloc_pages_bulk() */ for_each_sgtable_page(&buf->table.sgt, &sg_iter, 0) __free_page(sg_page_iter_page(&sg_iter)); sg_free_append_table(&buf->table); diff --git 
a/drivers/vfio/platform/vfio_platform.c b/drivers/vfio/platform/vfio_platform.c index 42d1462c5e19..512533501eb7 100644 --- a/drivers/vfio/platform/vfio_platform.c +++ b/drivers/vfio/platform/vfio_platform.c @@ -112,7 +112,7 @@ static const struct vfio_device_ops vfio_platform_ops = { static struct platform_driver vfio_platform_driver = { .probe = vfio_platform_probe, - .remove_new = vfio_platform_remove, + .remove = vfio_platform_remove, .driver = { .name = "vfio-platform", }, diff --git a/drivers/vfio/platform/vfio_platform_common.c b/drivers/vfio/platform/vfio_platform_common.c index e53757d1d095..3bf1043cd795 100644 --- a/drivers/vfio/platform/vfio_platform_common.c +++ b/drivers/vfio/platform/vfio_platform_common.c @@ -388,6 +388,11 @@ static ssize_t vfio_platform_read_mmio(struct vfio_platform_region *reg, { unsigned int done = 0; + if (off >= reg->size) + return -EINVAL; + + count = min_t(size_t, count, reg->size - off); + if (!reg->ioaddr) { reg->ioaddr = ioremap(reg->addr, reg->size); @@ -467,6 +472,11 @@ static ssize_t vfio_platform_write_mmio(struct vfio_platform_region *reg, { unsigned int done = 0; + if (off >= reg->size) + return -EINVAL; + + count = min_t(size_t, count, reg->size - off); + if (!reg->ioaddr) { reg->ioaddr = ioremap(reg->addr, reg->size); diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 50ebc9593c9d..f8d68fe77b41 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -80,7 +80,6 @@ struct vfio_domain { struct iommu_domain *domain; struct list_head next; struct list_head group_list; - bool fgsp : 1; /* Fine-grained super pages */ bool enforce_cache_coherency : 1; }; @@ -103,9 +102,9 @@ struct vfio_dma { struct vfio_batch { struct page **pages; /* for pin_user_pages_remote */ struct page *fallback_page; /* if pages alloc fails */ - int capacity; /* length of pages array */ - int size; /* of batch currently */ - int offset; /* of next entry in pages */ + unsigned int capacity; /* length of pages array */ + unsigned int size; /* of batch currently */ + unsigned int offset; /* of next entry in pages */ }; struct vfio_iommu_group { @@ -293,7 +292,7 @@ static int vfio_dma_bitmap_alloc_all(struct vfio_iommu *iommu, size_t pgsize) struct rb_node *p; for (p = rb_prev(n); p; p = rb_prev(p)) { - struct vfio_dma *dma = rb_entry(n, + struct vfio_dma *dma = rb_entry(p, struct vfio_dma, node); vfio_dma_bitmap_free(dma); @@ -471,12 +470,12 @@ static int put_pfn(unsigned long pfn, int prot) #define VFIO_BATCH_MAX_CAPACITY (PAGE_SIZE / sizeof(struct page *)) -static void vfio_batch_init(struct vfio_batch *batch) +static void __vfio_batch_init(struct vfio_batch *batch, bool single) { batch->size = 0; batch->offset = 0; - if (unlikely(disable_hugepages)) + if (single || unlikely(disable_hugepages)) goto fallback; batch->pages = (struct page **) __get_free_page(GFP_KERNEL); @@ -491,6 +490,16 @@ fallback: batch->capacity = 1; } +static void vfio_batch_init(struct vfio_batch *batch) +{ + __vfio_batch_init(batch, false); +} + +static void vfio_batch_init_single(struct vfio_batch *batch) +{ + __vfio_batch_init(batch, true); +} + static void vfio_batch_unpin(struct vfio_batch *batch, struct vfio_dma *dma) { while (batch->size) { @@ -510,7 +519,7 @@ static void vfio_batch_fini(struct vfio_batch *batch) static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm, unsigned long vaddr, unsigned long *pfn, - bool write_fault) + unsigned long *addr_mask, bool write_fault) { struct follow_pfnmap_args args = { 
.vma = vma, .address = vaddr }; int ret; @@ -534,10 +543,12 @@ static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm, return ret; } - if (write_fault && !args.writable) + if (write_fault && !args.writable) { ret = -EFAULT; - else + } else { *pfn = args.pfn; + *addr_mask = args.addr_mask; + } follow_pfnmap_end(&args); return ret; @@ -545,25 +556,33 @@ static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm, /* * Returns the positive number of pfns successfully obtained or a negative - * error code. + * error code. The initial pfn is stored in the pfn arg. For page-backed + * pfns, the provided batch is also updated to indicate the filled pages and + * initial offset. For VM_PFNMAP pfns, only the returned number of pfns and + * returned initial pfn are provided; subsequent pfns are contiguous. */ -static int vaddr_get_pfns(struct mm_struct *mm, unsigned long vaddr, - long npages, int prot, unsigned long *pfn, - struct page **pages) +static long vaddr_get_pfns(struct mm_struct *mm, unsigned long vaddr, + unsigned long npages, int prot, unsigned long *pfn, + struct vfio_batch *batch) { + unsigned long pin_pages = min_t(unsigned long, npages, batch->capacity); struct vm_area_struct *vma; unsigned int flags = 0; - int ret; + long ret; if (prot & IOMMU_WRITE) flags |= FOLL_WRITE; mmap_read_lock(mm); - ret = pin_user_pages_remote(mm, vaddr, npages, flags | FOLL_LONGTERM, - pages, NULL); + ret = pin_user_pages_remote(mm, vaddr, pin_pages, flags | FOLL_LONGTERM, + batch->pages, NULL); if (ret > 0) { - *pfn = page_to_pfn(pages[0]); + *pfn = page_to_pfn(batch->pages[0]); + batch->size = ret; + batch->offset = 0; goto done; + } else if (!ret) { + ret = -EFAULT; } vaddr = untagged_addr_remote(mm, vaddr); @@ -572,15 +591,22 @@ retry: vma = vma_lookup(mm, vaddr); if (vma && vma->vm_flags & VM_PFNMAP) { - ret = follow_fault_pfn(vma, mm, vaddr, pfn, prot & IOMMU_WRITE); + unsigned long addr_mask; + + ret = follow_fault_pfn(vma, mm, vaddr, pfn, &addr_mask, + prot & IOMMU_WRITE); if (ret == -EAGAIN) goto retry; if (!ret) { - if (is_invalid_reserved_pfn(*pfn)) - ret = 1; - else + if (is_invalid_reserved_pfn(*pfn)) { + unsigned long epfn; + + epfn = (*pfn | (~addr_mask >> PAGE_SHIFT)) + 1; + ret = min_t(long, npages, epfn - *pfn); + } else { ret = -EFAULT; + } } } done: @@ -594,7 +620,7 @@ done: * first page and all consecutive pages with the same locking. */ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr, - long npage, unsigned long *pfn_base, + unsigned long npage, unsigned long *pfn_base, unsigned long limit, struct vfio_batch *batch) { unsigned long pfn; @@ -616,32 +642,49 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr, *pfn_base = 0; } + if (unlikely(disable_hugepages)) + npage = 1; + while (npage) { if (!batch->size) { - /* Empty batch, so refill it. */ - long req_pages = min_t(long, npage, batch->capacity); + /* + * Large mappings may take a while to repeatedly refill + * the batch, so conditionally relinquish the CPU when + * needed to avoid stalls. + */ + cond_resched(); - ret = vaddr_get_pfns(mm, vaddr, req_pages, dma->prot, - &pfn, batch->pages); + /* Empty batch, so refill it. 
*/ + ret = vaddr_get_pfns(mm, vaddr, npage, dma->prot, + &pfn, batch); if (ret < 0) goto unpin_out; - batch->size = ret; - batch->offset = 0; - if (!*pfn_base) { *pfn_base = pfn; rsvd = is_invalid_reserved_pfn(*pfn_base); } + + /* Handle pfnmap */ + if (!batch->size) { + if (pfn != *pfn_base + pinned || !rsvd) + goto out; + + pinned += ret; + npage -= ret; + vaddr += (PAGE_SIZE * ret); + iova += (PAGE_SIZE * ret); + continue; + } } /* - * pfn is preset for the first iteration of this inner loop and - * updated at the end to handle a VM_PFNMAP pfn. In that case, - * batch->pages isn't valid (there's no struct page), so allow - * batch->pages to be touched only when there's more than one - * pfn to check, which guarantees the pfns are from a - * !VM_PFNMAP vma. + * pfn is preset for the first iteration of this inner loop + * due to the fact that vaddr_get_pfns() needs to provide the + * initial pfn for pfnmaps. Therefore to reduce redundancy, + * the next pfn is fetched at the end of the loop. + * A PageReserved() page could still qualify as page backed + * and rsvd here, and therefore continues to use the batch. */ while (true) { if (pfn != *pfn_base + pinned || @@ -676,21 +719,12 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr, pfn = page_to_pfn(batch->pages[batch->offset]); } - - if (unlikely(disable_hugepages)) - break; } out: ret = vfio_lock_acct(dma, lock_acct, false); unpin_out: - if (batch->size == 1 && !batch->offset) { - /* May be a VM_PFNMAP pfn, which the batch can't remember. */ - put_pfn(pfn, dma->prot); - batch->size = 0; - } - if (ret < 0) { if (pinned && !rsvd) { for (pfn = *pfn_base ; pinned ; pfn++, pinned--) @@ -705,7 +739,7 @@ unpin_out: } static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova, - unsigned long pfn, long npage, + unsigned long pfn, unsigned long npage, bool do_accounting) { long unlocked = 0, locked = 0; @@ -728,7 +762,7 @@ static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova, static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr, unsigned long *pfn_base, bool do_accounting) { - struct page *pages[1]; + struct vfio_batch batch; struct mm_struct *mm; int ret; @@ -736,7 +770,9 @@ static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr, if (!mmget_not_zero(mm)) return -ENODEV; - ret = vaddr_get_pfns(mm, vaddr, 1, dma->prot, pfn_base, pages); + vfio_batch_init_single(&batch); + + ret = vaddr_get_pfns(mm, vaddr, 1, dma->prot, pfn_base, &batch); if (ret != 1) goto out; @@ -755,6 +791,7 @@ static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr, } out: + vfio_batch_fini(&batch); mmput(mm); return ret; } @@ -1064,8 +1101,7 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma, * may require hardware cache flushing, try to find the * largest contiguous physical memory chunk to unmap. */ - for (len = PAGE_SIZE; - !domain->fgsp && iova + len < end; len += PAGE_SIZE) { + for (len = PAGE_SIZE; iova + len < end; len += PAGE_SIZE) { next = iommu_iova_to_phys(domain->domain, iova + len); if (next != phys + len) break; @@ -1802,49 +1838,6 @@ unwind: return ret; } -/* - * We change our unmap behavior slightly depending on whether the IOMMU - * supports fine-grained superpages. IOMMUs like AMD-Vi will use a superpage - * for practically any contiguous power-of-two mapping we give it. This means - * we don't need to look for contiguous chunks ourselves to make unmapping - * more efficient. 
On IOMMUs with coarse-grained super pages, like Intel VT-d - * with discrete 2M/1G/512G/1T superpages, identifying contiguous chunks - * significantly boosts non-hugetlbfs mappings and doesn't seem to hurt when - * hugetlbfs is in use. - */ -static void vfio_test_domain_fgsp(struct vfio_domain *domain, struct list_head *regions) -{ - int ret, order = get_order(PAGE_SIZE * 2); - struct vfio_iova *region; - struct page *pages; - dma_addr_t start; - - pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, order); - if (!pages) - return; - - list_for_each_entry(region, regions, list) { - start = ALIGN(region->start, PAGE_SIZE * 2); - if (start >= region->end || (region->end - start < PAGE_SIZE * 2)) - continue; - - ret = iommu_map(domain->domain, start, page_to_phys(pages), PAGE_SIZE * 2, - IOMMU_READ | IOMMU_WRITE | IOMMU_CACHE, - GFP_KERNEL_ACCOUNT); - if (!ret) { - size_t unmapped = iommu_unmap(domain->domain, start, PAGE_SIZE); - - if (unmapped == PAGE_SIZE) - iommu_unmap(domain->domain, start + PAGE_SIZE, PAGE_SIZE); - else - domain->fgsp = true; - } - break; - } - - __free_pages(pages, order); -} - static struct vfio_iommu_group *find_iommu_group(struct vfio_domain *domain, struct iommu_group *iommu_group) { @@ -2283,8 +2276,6 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, } } - vfio_test_domain_fgsp(domain, &iova_copy); - /* replay mappings on new domains */ ret = vfio_iommu_replay(iommu, domain); if (ret) diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index a5a62d9d963f..5046cae05222 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -583,7 +583,8 @@ void vfio_df_close(struct vfio_device_file *df) lockdep_assert_held(&device->dev_set->lock); - vfio_assert_device_open(device); + if (!vfio_assert_device_open(device)) + return; if (device->open_count == 1) vfio_df_device_last_close(df); device->open_count--; @@ -1751,7 +1752,7 @@ static void __exit vfio_cleanup(void) module_init(vfio_init); module_exit(vfio_cleanup); -MODULE_IMPORT_NS(IOMMUFD); +MODULE_IMPORT_NS("IOMMUFD"); MODULE_VERSION(DRIVER_VERSION); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR(DRIVER_AUTHOR); |
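The vfio_iommu_type1.c changes above let vaddr_get_pfns() return a whole physically contiguous run for a VM_PFNMAP vma in one call: follow_fault_pfn() now also reports the address mask of the mapping level, and the returned pfn count is clamped to the end of that aligned block, so vfio_pin_pages_remote() can advance by the returned count without touching batch->pages. The following is a minimal user-space sketch of that clamping arithmetic only; PAGE_SHIFT and the helper name here are illustrative stand-ins, not the kernel code.

#include <stdio.h>

#define PAGE_SHIFT 12

/*
 * Given the faulted pfn, the address mask of the mapping level
 * (e.g. ~(2MB - 1) for a PMD mapping) and the requested page count,
 * return how many contiguous pfns remain before the end of the
 * aligned block, capped at the request.  Mirrors the epfn computation
 * added to vaddr_get_pfns().
 */
static unsigned long contiguous_pfns(unsigned long pfn,
				     unsigned long addr_mask,
				     unsigned long npages)
{
	unsigned long epfn = (pfn | (~addr_mask >> PAGE_SHIFT)) + 1;
	unsigned long avail = epfn - pfn;

	return npages < avail ? npages : avail;
}

int main(void)
{
	unsigned long pmd_mask = ~((1UL << 21) - 1);	/* 2 MB mapping */

	/* pfn 0x10010 is 16 pages into a 512-page block: 496 pfns remain */
	printf("%lu\n", contiguous_pfns(0x10010, pmd_mask, 1024));
	/* a request smaller than the remainder is clamped to the request */
	printf("%lu\n", contiguous_pfns(0x10010, pmd_mask, 100));
	return 0;
}

For a 2 MB (PMD) mapping, a fault 16 pages into the block yields at most 496 contiguous pfns here; any remaining pages of the request are picked up on subsequent iterations of the pinning loop.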