summary refs log tree commit diff
path: root/drivers/vfio
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/vfio')
-rw-r--r-- drivers/vfio/cdx/main.c 2
-rw-r--r-- drivers/vfio/device_cdev.c 98
-rw-r--r-- drivers/vfio/group.c 23
-rw-r--r-- drivers/vfio/iommufd.c 68
-rw-r--r-- drivers/vfio/mdev/mdev_core.c 4
-rw-r--r-- drivers/vfio/pci/Kconfig 6
-rw-r--r-- drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c 122
-rw-r--r-- drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h 14
-rw-r--r-- drivers/vfio/pci/mlx5/cmd.c 418
-rw-r--r-- drivers/vfio/pci/mlx5/cmd.h 35
-rw-r--r-- drivers/vfio/pci/mlx5/main.c 90
-rw-r--r-- drivers/vfio/pci/nvgrace-gpu/main.c 171
-rw-r--r-- drivers/vfio/pci/pds/pci_drv.c 2
-rw-r--r-- drivers/vfio/pci/pds/vfio_dev.c 2
-rw-r--r-- drivers/vfio/pci/qat/main.c 7
-rw-r--r-- drivers/vfio/pci/vfio_pci.c 7
-rw-r--r-- drivers/vfio/pci/vfio_pci_config.c 16
-rw-r--r-- drivers/vfio/pci/vfio_pci_core.c 96
-rw-r--r-- drivers/vfio/pci/vfio_pci_igd.c 5
-rw-r--r-- drivers/vfio/pci/vfio_pci_intrs.c 12
-rw-r--r-- drivers/vfio/pci/vfio_pci_priv.h 6
-rw-r--r-- drivers/vfio/pci/vfio_pci_rdwr.c 38
-rw-r--r-- drivers/vfio/pci/virtio/Kconfig 6
-rw-r--r-- drivers/vfio/pci/virtio/legacy_io.c 4
-rw-r--r-- drivers/vfio/pci/virtio/main.c 8
-rw-r--r-- drivers/vfio/pci/virtio/migrate.c 6
-rw-r--r-- drivers/vfio/platform/vfio_platform.c 2
-rw-r--r-- drivers/vfio/platform/vfio_platform_common.c 10
-rw-r--r-- drivers/vfio/vfio_iommu_type1.c 181
-rw-r--r-- drivers/vfio/vfio_main.c 5
30 files changed, 893 insertions, 571 deletions
diff --git a/drivers/vfio/cdx/main.c b/drivers/vfio/cdx/main.c
index 67465fad5b4b..5dd5f5ad7686 100644
--- a/drivers/vfio/cdx/main.c
+++ b/drivers/vfio/cdx/main.c
@@ -347,4 +347,4 @@ module_driver(vfio_cdx_driver, cdx_driver_register, cdx_driver_unregister);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("VFIO for CDX devices - User Level meta-driver");
-MODULE_IMPORT_NS(CDX_BUS);
+MODULE_IMPORT_NS("CDX_BUS");
diff --git a/drivers/vfio/device_cdev.c b/drivers/vfio/device_cdev.c
index bb1817bd4ff3..480cac3a0c27 100644
--- a/drivers/vfio/device_cdev.c
+++ b/drivers/vfio/device_cdev.c
@@ -60,22 +60,50 @@ static void vfio_df_get_kvm_safe(struct vfio_device_file *df)
spin_unlock(&df->kvm_ref_lock);
}
+/*
+ * Check the token UUID (if any) carried by a BIND_IOMMUFD request.
+ *
+ * A driver without a match_token_uuid callback accepts only requests that do
+ * not set VFIO_DEVICE_BIND_FLAG_TOKEN. Otherwise the callback is invoked with
+ * the UUID copied from userspace, or with NULL when no token was supplied.
+ *
+ * Returns 0 or the callback's result, -EINVAL for a token the driver cannot
+ * check, and -EFAULT if the UUID cannot be read from userspace.
+ */
+static int vfio_df_check_token(struct vfio_device *device,
+			       const struct vfio_device_bind_iommufd *bind)
+{
+	const bool has_token = bind->flags & VFIO_DEVICE_BIND_FLAG_TOKEN;
+	uuid_t token;
+
+	/* No callback: a token is an error, its absence is fine. */
+	if (!device->ops->match_token_uuid)
+		return has_token ? -EINVAL : 0;
+
+	/* No token supplied: hand NULL to the callback and use its verdict. */
+	if (!has_token)
+		return device->ops->match_token_uuid(device, NULL);
+
+	if (copy_from_user(&token, u64_to_user_ptr(bind->token_uuid_ptr),
+			   sizeof(token)))
+		return -EFAULT;
+
+	return device->ops->match_token_uuid(device, &token);
+}
+
long vfio_df_ioctl_bind_iommufd(struct vfio_device_file *df,
struct vfio_device_bind_iommufd __user *arg)
{
+ const u32 VALID_FLAGS = VFIO_DEVICE_BIND_FLAG_TOKEN;
struct vfio_device *device = df->device;
struct vfio_device_bind_iommufd bind;
unsigned long minsz;
+ u32 user_size;
int ret;
static_assert(__same_type(arg->out_devid, df->devid));
minsz = offsetofend(struct vfio_device_bind_iommufd, out_devid);
- if (copy_from_user(&bind, arg, minsz))
- return -EFAULT;
+ ret = get_user(user_size, &arg->argsz);
+ if (ret)
+ return ret;
+ if (user_size < minsz)
+ return -EINVAL;
+ ret = copy_struct_from_user(&bind, minsz, arg, user_size);
+ if (ret)
+ return ret;
- if (bind.argsz < minsz || bind.flags || bind.iommufd < 0)
+ if (bind.iommufd < 0 || bind.flags & ~VALID_FLAGS)
return -EINVAL;
/* BIND_IOMMUFD only allowed for cdev fds */
@@ -93,6 +121,10 @@ long vfio_df_ioctl_bind_iommufd(struct vfio_device_file *df,
goto out_unlock;
}
+ ret = vfio_df_check_token(device, &bind);
+ if (ret)
+ goto out_unlock;
+
df->iommufd = iommufd_ctx_from_fd(bind.iommufd);
if (IS_ERR(df->iommufd)) {
ret = PTR_ERR(df->iommufd);
@@ -162,9 +194,9 @@ void vfio_df_unbind_iommufd(struct vfio_device_file *df)
int vfio_df_ioctl_attach_pt(struct vfio_device_file *df,
struct vfio_device_attach_iommufd_pt __user *arg)
{
- struct vfio_device *device = df->device;
struct vfio_device_attach_iommufd_pt attach;
- unsigned long minsz;
+ struct vfio_device *device = df->device;
+ unsigned long minsz, xend = 0;
int ret;
minsz = offsetofend(struct vfio_device_attach_iommufd_pt, pt_id);
@@ -172,11 +204,34 @@ int vfio_df_ioctl_attach_pt(struct vfio_device_file *df,
if (copy_from_user(&attach, arg, minsz))
return -EFAULT;
- if (attach.argsz < minsz || attach.flags)
+ if (attach.argsz < minsz)
+ return -EINVAL;
+
+ if (attach.flags & ~VFIO_DEVICE_ATTACH_PASID)
return -EINVAL;
+ if (attach.flags & VFIO_DEVICE_ATTACH_PASID) {
+ if (!device->ops->pasid_attach_ioas)
+ return -EOPNOTSUPP;
+ xend = offsetofend(struct vfio_device_attach_iommufd_pt, pasid);
+ }
+
+ if (xend) {
+ if (attach.argsz < xend)
+ return -EINVAL;
+
+ if (copy_from_user((void *)&attach + minsz,
+ (void __user *)arg + minsz, xend - minsz))
+ return -EFAULT;
+ }
+
mutex_lock(&device->dev_set->lock);
- ret = device->ops->attach_ioas(device, &attach.pt_id);
+ if (attach.flags & VFIO_DEVICE_ATTACH_PASID)
+ ret = device->ops->pasid_attach_ioas(device,
+ attach.pasid,
+ &attach.pt_id);
+ else
+ ret = device->ops->attach_ioas(device, &attach.pt_id);
if (ret)
goto out_unlock;
@@ -198,20 +253,41 @@ out_unlock:
int vfio_df_ioctl_detach_pt(struct vfio_device_file *df,
struct vfio_device_detach_iommufd_pt __user *arg)
{
- struct vfio_device *device = df->device;
struct vfio_device_detach_iommufd_pt detach;
- unsigned long minsz;
+ struct vfio_device *device = df->device;
+ unsigned long minsz, xend = 0;
minsz = offsetofend(struct vfio_device_detach_iommufd_pt, flags);
if (copy_from_user(&detach, arg, minsz))
return -EFAULT;
- if (detach.argsz < minsz || detach.flags)
+ if (detach.argsz < minsz)
+ return -EINVAL;
+
+ if (detach.flags & ~VFIO_DEVICE_DETACH_PASID)
return -EINVAL;
+ if (detach.flags & VFIO_DEVICE_DETACH_PASID) {
+ if (!device->ops->pasid_detach_ioas)
+ return -EOPNOTSUPP;
+ xend = offsetofend(struct vfio_device_detach_iommufd_pt, pasid);
+ }
+
+ if (xend) {
+ if (detach.argsz < xend)
+ return -EINVAL;
+
+ if (copy_from_user((void *)&detach + minsz,
+ (void __user *)arg + minsz, xend - minsz))
+ return -EFAULT;
+ }
+
mutex_lock(&device->dev_set->lock);
- device->ops->detach_ioas(device);
+ if (detach.flags & VFIO_DEVICE_DETACH_PASID)
+ device->ops->pasid_detach_ioas(device, detach.pasid);
+ else
+ device->ops->detach_ioas(device);
mutex_unlock(&device->dev_set->lock);
return 0;
diff --git a/drivers/vfio/group.c b/drivers/vfio/group.c
index 49559605177e..c376a6279de0 100644
--- a/drivers/vfio/group.c
+++ b/drivers/vfio/group.c
@@ -192,11 +192,10 @@ static int vfio_df_group_open(struct vfio_device_file *df)
* implies they expected translation to exist
*/
if (!capable(CAP_SYS_RAWIO) ||
- vfio_iommufd_device_has_compat_ioas(device, df->iommufd))
+ vfio_iommufd_device_has_compat_ioas(device, df->iommufd)) {
ret = -EPERM;
- else
- ret = 0;
- goto out_put_kvm;
+ goto out_put_kvm;
+ }
}
ret = vfio_df_open(df);
@@ -266,24 +265,12 @@ static struct file *vfio_device_open_file(struct vfio_device *device)
if (ret)
goto err_free;
- /*
- * We can't use anon_inode_getfd() because we need to modify
- * the f_mode flags directly to allow more than just ioctls
- */
- filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
- df, O_RDWR);
+ filep = anon_inode_getfile_fmode("[vfio-device]", &vfio_device_fops,
+ df, O_RDWR, FMODE_PREAD | FMODE_PWRITE);
if (IS_ERR(filep)) {
ret = PTR_ERR(filep);
goto err_close_device;
}
-
- /*
- * TODO: add an anon_inode interface to do this.
- * Appears to be missing by lack of need rather than
- * explicitly prevented. Now there's need.
- */
- filep->f_mode |= (FMODE_PREAD | FMODE_PWRITE);
-
/*
* Use the pseudo fs inode on the device to link all mmaps
* to the same address space, allowing us to unmap all vmas
diff --git a/drivers/vfio/iommufd.c b/drivers/vfio/iommufd.c
index 82eba6966fa5..a38d262c6028 100644
--- a/drivers/vfio/iommufd.c
+++ b/drivers/vfio/iommufd.c
@@ -7,8 +7,8 @@
#include "vfio.h"
-MODULE_IMPORT_NS(IOMMUFD);
-MODULE_IMPORT_NS(IOMMUFD_VFIO);
+MODULE_IMPORT_NS("IOMMUFD");
+MODULE_IMPORT_NS("IOMMUFD_VFIO");
bool vfio_iommufd_device_has_compat_ioas(struct vfio_device *vdev,
struct iommufd_ctx *ictx)
@@ -25,6 +25,10 @@ int vfio_df_iommufd_bind(struct vfio_device_file *df)
lockdep_assert_held(&vdev->dev_set->lock);
+ /* Returns 0 to permit device opening under noiommu mode */
+ if (vfio_device_is_noiommu(vdev))
+ return 0;
+
return vdev->ops->bind_iommufd(vdev, ictx, &df->devid);
}
@@ -119,16 +123,24 @@ int vfio_iommufd_physical_bind(struct vfio_device *vdev,
if (IS_ERR(idev))
return PTR_ERR(idev);
vdev->iommufd_device = idev;
+ ida_init(&vdev->pasids);
return 0;
}
EXPORT_SYMBOL_GPL(vfio_iommufd_physical_bind);
void vfio_iommufd_physical_unbind(struct vfio_device *vdev)
{
+ int pasid;
+
lockdep_assert_held(&vdev->dev_set->lock);
+ while ((pasid = ida_find_first(&vdev->pasids)) >= 0) {
+ iommufd_device_detach(vdev->iommufd_device, pasid);
+ ida_free(&vdev->pasids, pasid);
+ }
+
if (vdev->iommufd_attached) {
- iommufd_device_detach(vdev->iommufd_device);
+ iommufd_device_detach(vdev->iommufd_device, IOMMU_NO_PASID);
vdev->iommufd_attached = false;
}
iommufd_device_unbind(vdev->iommufd_device);
@@ -146,9 +158,11 @@ int vfio_iommufd_physical_attach_ioas(struct vfio_device *vdev, u32 *pt_id)
return -EINVAL;
if (vdev->iommufd_attached)
- rc = iommufd_device_replace(vdev->iommufd_device, pt_id);
+ rc = iommufd_device_replace(vdev->iommufd_device,
+ IOMMU_NO_PASID, pt_id);
else
- rc = iommufd_device_attach(vdev->iommufd_device, pt_id);
+ rc = iommufd_device_attach(vdev->iommufd_device,
+ IOMMU_NO_PASID, pt_id);
if (rc)
return rc;
vdev->iommufd_attached = true;
@@ -163,11 +177,53 @@ void vfio_iommufd_physical_detach_ioas(struct vfio_device *vdev)
if (WARN_ON(!vdev->iommufd_device) || !vdev->iommufd_attached)
return;
- iommufd_device_detach(vdev->iommufd_device);
+ iommufd_device_detach(vdev->iommufd_device, IOMMU_NO_PASID);
vdev->iommufd_attached = false;
}
EXPORT_SYMBOL_GPL(vfio_iommufd_physical_detach_ioas);
+/*
+ * Attach (or re-attach) @pasid of the device to the page table in @pt_id.
+ *
+ * A PASID already recorded in vdev->pasids is replaced in place; otherwise it
+ * is reserved in the IDA first and released again if the attach fails.
+ * Must be called with vdev->dev_set->lock held.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+int vfio_iommufd_physical_pasid_attach_ioas(struct vfio_device *vdev,
+					    u32 pasid, u32 *pt_id)
+{
+	int err;
+
+	lockdep_assert_held(&vdev->dev_set->lock);
+
+	if (WARN_ON(!vdev->iommufd_device))
+		return -EINVAL;
+
+	if (!ida_exists(&vdev->pasids, pasid)) {
+		/* Reserve exactly this PASID value before attaching. */
+		err = ida_alloc_range(&vdev->pasids, pasid, pasid, GFP_KERNEL);
+		if (err < 0)
+			return err;
+
+		err = iommufd_device_attach(vdev->iommufd_device, pasid, pt_id);
+		if (err)
+			ida_free(&vdev->pasids, pasid);
+
+		return err;
+	}
+
+	return iommufd_device_replace(vdev->iommufd_device, pasid, pt_id);
+}
+EXPORT_SYMBOL_GPL(vfio_iommufd_physical_pasid_attach_ioas);
+
+/*
+ * Detach @pasid of the device and drop it from vdev->pasids.
+ *
+ * Does nothing when the PASID is not recorded as attached.
+ * Must be called with vdev->dev_set->lock held.
+ */
+void vfio_iommufd_physical_pasid_detach_ioas(struct vfio_device *vdev,
+					     u32 pasid)
+{
+	lockdep_assert_held(&vdev->dev_set->lock);
+
+	if (WARN_ON(!vdev->iommufd_device))
+		return;
+
+	if (ida_exists(&vdev->pasids, pasid)) {
+		iommufd_device_detach(vdev->iommufd_device, pasid);
+		ida_free(&vdev->pasids, pasid);
+	}
+}
+EXPORT_SYMBOL_GPL(vfio_iommufd_physical_pasid_detach_ioas);
+
/*
* The emulated standard ops mean that vfio_device is going to use the
* "mdev path" and will call vfio_pin_pages()/vfio_dma_rw(). Drivers using this
diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c
index ed4737de4528..f2e686f8f1ef 100644
--- a/drivers/vfio/mdev/mdev_core.c
+++ b/drivers/vfio/mdev/mdev_core.c
@@ -76,7 +76,7 @@ int mdev_register_parent(struct mdev_parent *parent, struct device *dev,
if (ret)
return ret;
- ret = class_compat_create_link(mdev_bus_compat_class, dev, NULL);
+ ret = class_compat_create_link(mdev_bus_compat_class, dev);
if (ret)
dev_warn(dev, "Failed to create compatibility class link\n");
@@ -98,7 +98,7 @@ void mdev_unregister_parent(struct mdev_parent *parent)
dev_info(parent->dev, "MDEV: Unregistering\n");
down_write(&parent->unreg_sem);
- class_compat_remove_link(mdev_bus_compat_class, parent->dev, NULL);
+ class_compat_remove_link(mdev_bus_compat_class, parent->dev);
device_for_each_child(parent->dev, NULL, mdev_device_remove_cb);
parent_remove_sysfs_files(parent);
up_write(&parent->unreg_sem);
diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig
index bf50ffa10bde..2b0172f54665 100644
--- a/drivers/vfio/pci/Kconfig
+++ b/drivers/vfio/pci/Kconfig
@@ -1,16 +1,12 @@
# SPDX-License-Identifier: GPL-2.0-only
menu "VFIO support for PCI devices"
- depends on PCI && MMU
+ depends on PCI
config VFIO_PCI_CORE
tristate
select VFIO_VIRQFD
select IRQ_BYPASS_MANAGER
-config VFIO_PCI_MMAP
- def_bool y if !S390
- depends on VFIO_PCI_CORE
-
config VFIO_PCI_INTX
def_bool y if !S390
depends on VFIO_PCI_CORE
diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
index 451c639299eb..397f5e445136 100644
--- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
+++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
@@ -190,9 +190,10 @@ static int qm_set_regs(struct hisi_qm *qm, struct acc_vf_data *vf_data)
int ret;
/* Check VF state */
- if (unlikely(hisi_qm_wait_mb_ready(qm))) {
+ ret = hisi_qm_wait_mb_ready(qm);
+ if (unlikely(ret)) {
dev_err(&qm->pdev->dev, "QM device is not ready to write\n");
- return -EBUSY;
+ return ret;
}
ret = qm_write_regs(qm, QM_VF_AEQ_INT_MASK, &vf_data->aeq_int_mask, 1);
@@ -325,13 +326,15 @@ static void qm_dev_cmd_init(struct hisi_qm *qm)
static int vf_qm_cache_wb(struct hisi_qm *qm)
{
unsigned int val;
+ int ret;
writel(0x1, qm->io_base + QM_CACHE_WB_START);
- if (readl_relaxed_poll_timeout(qm->io_base + QM_CACHE_WB_DONE,
+ ret = readl_relaxed_poll_timeout(qm->io_base + QM_CACHE_WB_DONE,
val, val & BIT(0), MB_POLL_PERIOD_US,
- MB_POLL_TIMEOUT_US)) {
+ MB_POLL_TIMEOUT_US);
+ if (ret) {
dev_err(&qm->pdev->dev, "vf QM writeback sqc cache fail\n");
- return -EINVAL;
+ return ret;
}
return 0;
@@ -350,6 +353,32 @@ static int vf_qm_func_stop(struct hisi_qm *qm)
return hisi_qm_mb(qm, QM_MB_CMD_PAUSE_QM, 0, 0, 0);
}
+/*
+ * Validate the magic/version of incoming migration data.
+ *
+ * ACC_DEV_MAGIC_V2 data must carry the driver's major version.
+ * ACC_DEV_MAGIC_V1 data is accepted, and its eqe_dma/aeqe_dma fields are
+ * rebuilt from the HIGH/LOW dwords of qm_eqc_dw/qm_aeqc_dw.
+ * Any other magic value is rejected.
+ *
+ * Returns 0 when the data is acceptable, -EINVAL otherwise.
+ */
+static int vf_qm_version_check(struct acc_vf_data *vf_data, struct device *dev)
+{
+	if (vf_data->acc_magic == ACC_DEV_MAGIC_V2) {
+		if (vf_data->major_ver == ACC_DRV_MAJOR_VER)
+			return 0;
+
+		dev_info(dev, "migration driver version<%u.%u> not match!\n",
+			 vf_data->major_ver, vf_data->minor_ver);
+		return -EINVAL;
+	}
+
+	if (vf_data->acc_magic != ACC_DEV_MAGIC_V1)
+		return -EINVAL;
+
+	/* Correct dma address */
+	vf_data->eqe_dma = vf_data->qm_eqc_dw[QM_XQC_ADDR_HIGH];
+	vf_data->eqe_dma <<= QM_XQC_ADDR_OFFSET;
+	vf_data->eqe_dma |= vf_data->qm_eqc_dw[QM_XQC_ADDR_LOW];
+
+	vf_data->aeqe_dma = vf_data->qm_aeqc_dw[QM_XQC_ADDR_HIGH];
+	vf_data->aeqe_dma <<= QM_XQC_ADDR_OFFSET;
+	vf_data->aeqe_dma |= vf_data->qm_aeqc_dw[QM_XQC_ADDR_LOW];
+
+	return 0;
+}
+
static int vf_qm_check_match(struct hisi_acc_vf_core_device *hisi_acc_vdev,
struct hisi_acc_vf_migration_file *migf)
{
@@ -363,9 +392,10 @@ static int vf_qm_check_match(struct hisi_acc_vf_core_device *hisi_acc_vdev,
if (migf->total_length < QM_MATCH_SIZE || hisi_acc_vdev->match_done)
return 0;
- if (vf_data->acc_magic != ACC_DEV_MAGIC) {
+ ret = vf_qm_version_check(vf_data, dev);
+ if (ret) {
dev_err(dev, "failed to match ACC_DEV_MAGIC\n");
- return -EINVAL;
+ return ret;
}
if (vf_data->dev_id != hisi_acc_vdev->vf_dev->device) {
@@ -377,7 +407,7 @@ static int vf_qm_check_match(struct hisi_acc_vf_core_device *hisi_acc_vdev,
ret = qm_get_vft(vf_qm, &vf_qm->qp_base);
if (ret <= 0) {
dev_err(dev, "failed to get vft qp nums\n");
- return -EINVAL;
+ return ret;
}
if (ret != vf_data->qp_num) {
@@ -399,13 +429,6 @@ static int vf_qm_check_match(struct hisi_acc_vf_core_device *hisi_acc_vdev,
return -EINVAL;
}
- ret = qm_write_regs(vf_qm, QM_VF_STATE, &vf_data->vf_qm_state, 1);
- if (ret) {
- dev_err(dev, "failed to write QM_VF_STATE\n");
- return ret;
- }
-
- hisi_acc_vdev->vf_qm_state = vf_data->vf_qm_state;
hisi_acc_vdev->match_done = true;
return 0;
}
@@ -418,7 +441,9 @@ static int vf_qm_get_match_data(struct hisi_acc_vf_core_device *hisi_acc_vdev,
int vf_id = hisi_acc_vdev->vf_id;
int ret;
- vf_data->acc_magic = ACC_DEV_MAGIC;
+ vf_data->acc_magic = ACC_DEV_MAGIC_V2;
+ vf_data->major_ver = ACC_DRV_MAJOR_VER;
+ vf_data->minor_ver = ACC_DRV_MINOR_VER;
/* Save device id */
vf_data->dev_id = hisi_acc_vdev->vf_dev->device;
@@ -441,6 +466,19 @@ static int vf_qm_get_match_data(struct hisi_acc_vf_core_device *hisi_acc_vdev,
return 0;
}
+/*
+ * Ring the EQ and then the AEQ doorbell, each with the low 16 bits of the
+ * first saved EQC/AEQC dword held in the migration file's vf_data.
+ */
+static void vf_qm_xeqc_save(struct hisi_qm *qm,
+			    struct hisi_acc_vf_migration_file *migf)
+{
+	struct acc_vf_data *saved = &migf->vf_data;
+
+	qm_db(qm, 0, QM_DOORBELL_CMD_EQ,
+	      (u16)(saved->qm_eqc_dw[0] & 0xFFFF), 0);
+
+	qm_db(qm, 0, QM_DOORBELL_CMD_AEQ,
+	      (u16)(saved->qm_aeqc_dw[0] & 0xFFFF), 0);
+}
+
static int vf_qm_load_data(struct hisi_acc_vf_core_device *hisi_acc_vdev,
struct hisi_acc_vf_migration_file *migf)
{
@@ -456,6 +494,20 @@ static int vf_qm_load_data(struct hisi_acc_vf_core_device *hisi_acc_vdev,
if (migf->total_length < sizeof(struct acc_vf_data))
return -EINVAL;
+ if (!vf_data->eqe_dma || !vf_data->aeqe_dma ||
+ !vf_data->sqc_dma || !vf_data->cqc_dma) {
+ dev_info(dev, "resume dma addr is NULL!\n");
+ hisi_acc_vdev->vf_qm_state = QM_NOT_READY;
+ return 0;
+ }
+
+ ret = qm_write_regs(qm, QM_VF_STATE, &vf_data->vf_qm_state, 1);
+ if (ret) {
+ dev_err(dev, "failed to write QM_VF_STATE\n");
+ return ret;
+ }
+ hisi_acc_vdev->vf_qm_state = vf_data->vf_qm_state;
+
qm->eqe_dma = vf_data->eqe_dma;
qm->aeqe_dma = vf_data->aeqe_dma;
qm->sqc_dma = vf_data->sqc_dma;
@@ -493,27 +545,27 @@ static int vf_qm_read_data(struct hisi_qm *vf_qm, struct acc_vf_data *vf_data)
ret = qm_get_regs(vf_qm, vf_data);
if (ret)
- return -EINVAL;
+ return ret;
/* Every reg is 32 bit, the dma address is 64 bit. */
- vf_data->eqe_dma = vf_data->qm_eqc_dw[1];
+ vf_data->eqe_dma = vf_data->qm_eqc_dw[QM_XQC_ADDR_HIGH];
vf_data->eqe_dma <<= QM_XQC_ADDR_OFFSET;
- vf_data->eqe_dma |= vf_data->qm_eqc_dw[0];
- vf_data->aeqe_dma = vf_data->qm_aeqc_dw[1];
+ vf_data->eqe_dma |= vf_data->qm_eqc_dw[QM_XQC_ADDR_LOW];
+ vf_data->aeqe_dma = vf_data->qm_aeqc_dw[QM_XQC_ADDR_HIGH];
vf_data->aeqe_dma <<= QM_XQC_ADDR_OFFSET;
- vf_data->aeqe_dma |= vf_data->qm_aeqc_dw[0];
+ vf_data->aeqe_dma |= vf_data->qm_aeqc_dw[QM_XQC_ADDR_LOW];
/* Through SQC_BT/CQC_BT to get sqc and cqc address */
ret = qm_get_sqc(vf_qm, &vf_data->sqc_dma);
if (ret) {
dev_err(dev, "failed to read SQC addr!\n");
- return -EINVAL;
+ return ret;
}
ret = qm_get_cqc(vf_qm, &vf_data->cqc_dma);
if (ret) {
dev_err(dev, "failed to read CQC addr!\n");
- return -EINVAL;
+ return ret;
}
return 0;
@@ -524,7 +576,6 @@ static int vf_qm_state_save(struct hisi_acc_vf_core_device *hisi_acc_vdev,
{
struct acc_vf_data *vf_data = &migf->vf_data;
struct hisi_qm *vf_qm = &hisi_acc_vdev->vf_qm;
- struct device *dev = &vf_qm->pdev->dev;
int ret;
if (unlikely(qm_wait_dev_not_ready(vf_qm))) {
@@ -538,17 +589,14 @@ static int vf_qm_state_save(struct hisi_acc_vf_core_device *hisi_acc_vdev,
vf_data->vf_qm_state = QM_READY;
hisi_acc_vdev->vf_qm_state = vf_data->vf_qm_state;
- ret = vf_qm_cache_wb(vf_qm);
- if (ret) {
- dev_err(dev, "failed to writeback QM Cache!\n");
- return ret;
- }
-
ret = vf_qm_read_data(vf_qm, vf_data);
if (ret)
- return -EINVAL;
+ return ret;
migf->total_length = sizeof(struct acc_vf_data);
+ /* Save eqc and aeqc interrupt information */
+ vf_qm_xeqc_save(vf_qm, migf);
+
return 0;
}
@@ -967,6 +1015,13 @@ static int hisi_acc_vf_stop_device(struct hisi_acc_vf_core_device *hisi_acc_vdev
dev_err(dev, "failed to check QM INT state!\n");
return ret;
}
+
+ ret = vf_qm_cache_wb(vf_qm);
+ if (ret) {
+ dev_err(dev, "failed to writeback QM cache!\n");
+ return ret;
+ }
+
return 0;
}
@@ -1327,7 +1382,7 @@ static int hisi_acc_vf_debug_check(struct seq_file *seq, struct vfio_device *vde
ret = qm_wait_dev_not_ready(vf_qm);
if (ret) {
seq_puts(seq, "VF device not ready!\n");
- return -EBUSY;
+ return ret;
}
return 0;
@@ -1463,6 +1518,7 @@ static void hisi_acc_vfio_pci_close_device(struct vfio_device *core_vdev)
struct hisi_acc_vf_core_device *hisi_acc_vdev = hisi_acc_get_vf_dev(core_vdev);
struct hisi_qm *vf_qm = &hisi_acc_vdev->vf_qm;
+ hisi_acc_vf_disable_fds(hisi_acc_vdev);
mutex_lock(&hisi_acc_vdev->open_mutex);
hisi_acc_vdev->dev_opened = false;
iounmap(vf_qm->io_base);
@@ -1485,6 +1541,7 @@ static int hisi_acc_vfio_pci_migrn_init_dev(struct vfio_device *core_vdev)
hisi_acc_vdev->vf_id = pci_iov_vf_id(pdev) + 1;
hisi_acc_vdev->pf_qm = pf_qm;
hisi_acc_vdev->vf_dev = pdev;
+ hisi_acc_vdev->vf_qm_state = QM_NOT_READY;
mutex_init(&hisi_acc_vdev->state_mutex);
mutex_init(&hisi_acc_vdev->open_mutex);
@@ -1526,6 +1583,7 @@ static const struct vfio_device_ops hisi_acc_vfio_pci_ops = {
.mmap = vfio_pci_core_mmap,
.request = vfio_pci_core_request,
.match = vfio_pci_core_match,
+ .match_token_uuid = vfio_pci_core_match_token_uuid,
.bind_iommufd = vfio_iommufd_physical_bind,
.unbind_iommufd = vfio_iommufd_physical_unbind,
.attach_ioas = vfio_iommufd_physical_attach_ioas,
diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h
index 245d7537b2bc..91002ceeebc1 100644
--- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h
+++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h
@@ -39,6 +39,9 @@
#define QM_REG_ADDR_OFFSET 0x0004
#define QM_XQC_ADDR_OFFSET 32U
+#define QM_XQC_ADDR_LOW 0x1
+#define QM_XQC_ADDR_HIGH 0x2
+
#define QM_VF_AEQ_INT_MASK 0x0004
#define QM_VF_EQ_INT_MASK 0x000c
#define QM_IFC_INT_SOURCE_V 0x0020
@@ -50,10 +53,15 @@
#define QM_EQC_DW0 0X8000
#define QM_AEQC_DW0 0X8020
+#define ACC_DRV_MAJOR_VER 1
+#define ACC_DRV_MINOR_VER 0
+
+#define ACC_DEV_MAGIC_V1 0XCDCDCDCDFEEDAACC
+#define ACC_DEV_MAGIC_V2 0xAACCFEEDDECADEDE
+
struct acc_vf_data {
#define QM_MATCH_SIZE offsetofend(struct acc_vf_data, qm_rsv_state)
/* QM match information */
-#define ACC_DEV_MAGIC 0XCDCDCDCDFEEDAACC
u64 acc_magic;
u32 qp_num;
u32 dev_id;
@@ -61,7 +69,9 @@ struct acc_vf_data {
u32 qp_base;
u32 vf_qm_state;
/* QM reserved match information */
- u32 qm_rsv_state[3];
+ u16 major_ver;
+ u16 minor_ver;
+ u32 qm_rsv_state[2];
/* QM RW regs */
u32 aeq_int_mask;
diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c
index 7527e277c898..a92b095b90f6 100644
--- a/drivers/vfio/pci/mlx5/cmd.c
+++ b/drivers/vfio/pci/mlx5/cmd.c
@@ -313,40 +313,21 @@ err_exec:
return ret;
}
-static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn,
- struct mlx5_vhca_data_buffer *buf,
- struct mlx5_vhca_recv_buf *recv_buf,
- u32 *mkey)
+static u32 *alloc_mkey_in(u32 npages, u32 pdn)
{
- size_t npages = buf ? DIV_ROUND_UP(buf->allocated_length, PAGE_SIZE) :
- recv_buf->npages;
- int err = 0, inlen;
- __be64 *mtt;
+ int inlen;
void *mkc;
u32 *in;
inlen = MLX5_ST_SZ_BYTES(create_mkey_in) +
- sizeof(*mtt) * round_up(npages, 2);
+ sizeof(__be64) * round_up(npages, 2);
- in = kvzalloc(inlen, GFP_KERNEL);
+ in = kvzalloc(inlen, GFP_KERNEL_ACCOUNT);
if (!in)
- return -ENOMEM;
+ return NULL;
MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
DIV_ROUND_UP(npages, 2));
- mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
-
- if (buf) {
- struct sg_dma_page_iter dma_iter;
-
- for_each_sgtable_dma_page(&buf->table.sgt, &dma_iter, 0)
- *mtt++ = cpu_to_be64(sg_page_iter_dma_address(&dma_iter));
- } else {
- int i;
-
- for (i = 0; i < npages; i++)
- *mtt++ = cpu_to_be64(recv_buf->dma_addrs[i]);
- }
mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT);
@@ -360,8 +341,81 @@ static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn,
MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT);
MLX5_SET(mkc, mkc, translations_octword_size, DIV_ROUND_UP(npages, 2));
MLX5_SET64(mkc, mkc, len, npages * PAGE_SIZE);
- err = mlx5_core_create_mkey(mdev, mkey, in, inlen);
- kvfree(in);
+
+ return in;
+}
+
+/*
+ * Issue the create-mkey command for a prepared @mkey_in buffer.
+ *
+ * The command length is the fixed create_mkey_in layout plus one __be64
+ * entry per page, with the page count rounded up to an even number.
+ *
+ * Returns the result of mlx5_core_create_mkey().
+ */
+static int create_mkey(struct mlx5_core_dev *mdev, u32 npages, u32 *mkey_in,
+		       u32 *mkey)
+{
+	int inlen;
+
+	inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
+	inlen += sizeof(__be64) * round_up(npages, 2);
+
+	return mlx5_core_create_mkey(mdev, mkey, mkey_in, inlen);
+}
+
+/*
+ * Undo the DMA registration of the first @npages pages described in @mkey_in.
+ *
+ * When @state uses an IOVA range, the whole range is destroyed in one call.
+ * Otherwise each page is unmapped individually, last to first, using the
+ * big-endian DMA addresses stored in the klm_pas_mtt area of @mkey_in.
+ */
+static void unregister_dma_pages(struct mlx5_core_dev *mdev, u32 npages,
+				 u32 *mkey_in, struct dma_iova_state *state,
+				 enum dma_data_direction dir)
+{
+	__be64 *entries;
+	int idx;
+
+	if (dma_use_iova(state)) {
+		dma_iova_destroy(mdev->device, state, npages * PAGE_SIZE, dir,
+				 0);
+		return;
+	}
+
+	entries = (__be64 *)MLX5_ADDR_OF(create_mkey_in, mkey_in, klm_pas_mtt);
+	for (idx = npages - 1; idx >= 0; idx--)
+		dma_unmap_page(mdev->device, be64_to_cpu(entries[idx]),
+			       PAGE_SIZE, dir);
+}
+
+/*
+ * DMA-map @npages pages from @page_list and record their DMA addresses, as
+ * big-endian 64-bit values, in the klm_pas_mtt area of @mkey_in.
+ *
+ * If an IOVA range can be allocated, every page is linked into it at
+ * consecutive PAGE_SIZE offsets and the range is synced afterwards; otherwise
+ * each page is mapped on its own with dma_map_page().
+ *
+ * Returns 0 on success. On failure the pages handled so far are unregistered
+ * and the error code is returned.
+ */
+static int register_dma_pages(struct mlx5_core_dev *mdev, u32 npages,
+ struct page **page_list, u32 *mkey_in,
+ struct dma_iova_state *state,
+ enum dma_data_direction dir)
+{
+ dma_addr_t addr;
+ size_t mapped = 0;
+ __be64 *mtt;
+ int i, err;
+
+ mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, mkey_in, klm_pas_mtt);
+
+ if (dma_iova_try_alloc(mdev->device, state, 0, npages * PAGE_SIZE)) {
+ /* addr tracks the DMA address of the page being linked */
+ addr = state->addr;
+ for (i = 0; i < npages; i++) {
+ err = dma_iova_link(mdev->device, state,
+ page_to_phys(page_list[i]), mapped,
+ PAGE_SIZE, dir, 0);
+ if (err)
+ goto error;
+ *mtt++ = cpu_to_be64(addr);
+ addr += PAGE_SIZE;
+ mapped += PAGE_SIZE;
+ }
+ err = dma_iova_sync(mdev->device, state, 0, mapped);
+ if (err)
+ goto error;
+ } else {
+ for (i = 0; i < npages; i++) {
+ addr = dma_map_page(mdev->device, page_list[i], 0,
+ PAGE_SIZE, dir);
+ err = dma_mapping_error(mdev->device, addr);
+ if (err)
+ goto error;
+ *mtt++ = cpu_to_be64(addr);
+ }
+ }
+ return 0;
+
+error:
+ /* i is the number of pages fully registered before the failure */
+ unregister_dma_pages(mdev, i, mkey_in, state, dir);
return err;
}
@@ -375,97 +429,97 @@ static int mlx5vf_dma_data_buffer(struct mlx5_vhca_data_buffer *buf)
if (mvdev->mdev_detach)
return -ENOTCONN;
- if (buf->dmaed || !buf->allocated_length)
+ if (buf->mkey_in || !buf->npages)
return -EINVAL;
- ret = dma_map_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0);
- if (ret)
- return ret;
+ buf->mkey_in = alloc_mkey_in(buf->npages, buf->migf->pdn);
+ if (!buf->mkey_in)
+ return -ENOMEM;
- ret = _create_mkey(mdev, buf->migf->pdn, buf, NULL, &buf->mkey);
+ ret = register_dma_pages(mdev, buf->npages, buf->page_list,
+ buf->mkey_in, &buf->state, buf->dma_dir);
if (ret)
- goto err;
+ goto err_register_dma;
- buf->dmaed = true;
+ ret = create_mkey(mdev, buf->npages, buf->mkey_in, &buf->mkey);
+ if (ret)
+ goto err_create_mkey;
return 0;
-err:
- dma_unmap_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0);
+
+err_create_mkey:
+ unregister_dma_pages(mdev, buf->npages, buf->mkey_in, &buf->state,
+ buf->dma_dir);
+err_register_dma:
+ kvfree(buf->mkey_in);
+ buf->mkey_in = NULL;
return ret;
}
+/*
+ * Free @npages pages referenced by @page_list, last to first, and then the
+ * pointer array itself.
+ */
+static void free_page_list(u32 npages, struct page **page_list)
+{
+	int idx = npages - 1;
+
+	/* Undo alloc_pages_bulk() */
+	while (idx >= 0) {
+		__free_page(page_list[idx]);
+		idx--;
+	}
+
+	kvfree(page_list);
+}
+
+/*
+ * Release a data buffer and everything it owns.
+ *
+ * If the buffer has an mkey_in (i.e. it was DMA-registered), the mkey is
+ * destroyed, the pages are unregistered from DMA and mkey_in is freed. The
+ * page list and the buffer itself are always freed.
+ *
+ * The caller must hold the owning device's state_mutex.
+ */
void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf)
{
- struct mlx5_vf_migration_file *migf = buf->migf;
- struct sg_page_iter sg_iter;
+ struct mlx5vf_pci_core_device *mvdev = buf->migf->mvdev;
+ struct mlx5_core_dev *mdev = mvdev->mdev;
- lockdep_assert_held(&migf->mvdev->state_mutex);
- WARN_ON(migf->mvdev->mdev_detach);
+ lockdep_assert_held(&mvdev->state_mutex);
+ WARN_ON(mvdev->mdev_detach);
- if (buf->dmaed) {
- mlx5_core_destroy_mkey(migf->mvdev->mdev, buf->mkey);
- dma_unmap_sgtable(migf->mvdev->mdev->device, &buf->table.sgt,
- buf->dma_dir, 0);
+ /* A non-NULL mkey_in marks a buffer that was DMA-registered */
+ if (buf->mkey_in) {
+ mlx5_core_destroy_mkey(mdev, buf->mkey);
+ unregister_dma_pages(mdev, buf->npages, buf->mkey_in,
+ &buf->state, buf->dma_dir);
+ kvfree(buf->mkey_in);
}
- /* Undo alloc_pages_bulk_array() */
- for_each_sgtable_page(&buf->table.sgt, &sg_iter, 0)
- __free_page(sg_page_iter_page(&sg_iter));
- sg_free_append_table(&buf->table);
+ free_page_list(buf->npages, buf->page_list);
kfree(buf);
}
-static int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf,
- unsigned int npages)
+/*
+ * Allocate an array of @npages page pointers and fill it with pages from
+ * alloc_pages_bulk(), calling it repeatedly until every slot is populated.
+ *
+ * On success *@page_list points to the filled array and 0 is returned.
+ * On failure the pages obtained so far and the array are freed, *@page_list
+ * is set to NULL and -ENOMEM is returned.
+ */
+static int mlx5vf_add_pages(struct page ***page_list, unsigned int npages)
{
- unsigned int to_alloc = npages;
- struct page **page_list;
- unsigned long filled;
- unsigned int to_fill;
- int ret;
+ unsigned int filled, done = 0;
int i;
- to_fill = min_t(unsigned int, npages, PAGE_SIZE / sizeof(*page_list));
- page_list = kvzalloc(to_fill * sizeof(*page_list), GFP_KERNEL_ACCOUNT);
- if (!page_list)
+ *page_list =
+ kvcalloc(npages, sizeof(struct page *), GFP_KERNEL_ACCOUNT);
+ if (!*page_list)
return -ENOMEM;
- do {
- filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT, to_fill,
- page_list);
- if (!filled) {
- ret = -ENOMEM;
+ for (;;) {
+ filled = alloc_pages_bulk(GFP_KERNEL_ACCOUNT, npages - done,
+ *page_list + done);
+ if (!filled)
goto err;
- }
- to_alloc -= filled;
- ret = sg_alloc_append_table_from_pages(
- &buf->table, page_list, filled, 0,
- filled << PAGE_SHIFT, UINT_MAX, SG_MAX_SINGLE_ALLOC,
- GFP_KERNEL_ACCOUNT);
- if (ret)
- goto err_append;
- buf->allocated_length += filled * PAGE_SIZE;
- /* clean input for another bulk allocation */
- memset(page_list, 0, filled * sizeof(*page_list));
- to_fill = min_t(unsigned int, to_alloc,
- PAGE_SIZE / sizeof(*page_list));
- } while (to_alloc > 0);
+ done += filled;
+ if (done == npages)
+ break;
+ }
- kvfree(page_list);
return 0;
-err_append:
- for (i = filled - 1; i >= 0; i--)
- __free_page(page_list[i]);
err:
- kvfree(page_list);
- return ret;
+ /*
+ * page_list points at the caller's array pointer, so the array must be
+ * dereferenced before it is indexed: (*page_list)[i], not *page_list[i],
+ * which would index past the caller's pointer for i > 0.
+ */
+ for (i = 0; i < done; i++)
+ __free_page((*page_list)[i]);
+
+ kvfree(*page_list);
+ *page_list = NULL;
+ return -ENOMEM;
}
struct mlx5_vhca_data_buffer *
-mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf,
- size_t length,
+mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf, u32 npages,
enum dma_data_direction dma_dir)
{
struct mlx5_vhca_data_buffer *buf;
@@ -477,12 +531,13 @@ mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf,
buf->dma_dir = dma_dir;
buf->migf = migf;
- if (length) {
- ret = mlx5vf_add_migration_pages(buf,
- DIV_ROUND_UP_ULL(length, PAGE_SIZE));
+ if (npages) {
+ ret = mlx5vf_add_pages(&buf->page_list, npages);
if (ret)
goto end;
+ buf->npages = npages;
+
if (dma_dir != DMA_NONE) {
ret = mlx5vf_dma_data_buffer(buf);
if (ret)
@@ -505,8 +560,8 @@ void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf)
}
struct mlx5_vhca_data_buffer *
-mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf,
- size_t length, enum dma_data_direction dma_dir)
+mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf, u32 npages,
+ enum dma_data_direction dma_dir)
{
struct mlx5_vhca_data_buffer *buf, *temp_buf;
struct list_head free_list;
@@ -521,7 +576,7 @@ mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf,
list_for_each_entry_safe(buf, temp_buf, &migf->avail_list, buf_elm) {
if (buf->dma_dir == dma_dir) {
list_del_init(&buf->buf_elm);
- if (buf->allocated_length >= length) {
+ if (buf->npages >= npages) {
spin_unlock_irq(&migf->list_lock);
goto found;
}
@@ -535,7 +590,7 @@ mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf,
}
}
spin_unlock_irq(&migf->list_lock);
- buf = mlx5vf_alloc_data_buffer(migf, length, dma_dir);
+ buf = mlx5vf_alloc_data_buffer(migf, npages, dma_dir);
found:
while ((temp_buf = list_first_entry_or_null(&free_list,
@@ -716,7 +771,7 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
MLX5_SET(save_vhca_state_in, in, op_mod, 0);
MLX5_SET(save_vhca_state_in, in, vhca_id, mvdev->vhca_id);
MLX5_SET(save_vhca_state_in, in, mkey, buf->mkey);
- MLX5_SET(save_vhca_state_in, in, size, buf->allocated_length);
+ MLX5_SET(save_vhca_state_in, in, size, buf->npages * PAGE_SIZE);
MLX5_SET(save_vhca_state_in, in, incremental, inc);
MLX5_SET(save_vhca_state_in, in, set_track, track);
@@ -738,8 +793,11 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
}
if (!header_buf) {
- header_buf = mlx5vf_get_data_buffer(migf,
- sizeof(struct mlx5_vf_migration_header), DMA_NONE);
+ header_buf = mlx5vf_get_data_buffer(
+ migf,
+ DIV_ROUND_UP(sizeof(struct mlx5_vf_migration_header),
+ PAGE_SIZE),
+ DMA_NONE);
if (IS_ERR(header_buf)) {
err = PTR_ERR(header_buf);
goto err_free;
@@ -783,7 +841,7 @@ int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev,
if (mvdev->mdev_detach)
return -ENOTCONN;
- if (!buf->dmaed) {
+ if (!buf->mkey_in) {
err = mlx5vf_dma_data_buffer(buf);
if (err)
return err;
@@ -1338,103 +1396,16 @@ static void mlx5vf_destroy_qp(struct mlx5_core_dev *mdev,
kfree(qp);
}
-static void free_recv_pages(struct mlx5_vhca_recv_buf *recv_buf)
-{
- int i;
-
- /* Undo alloc_pages_bulk_array() */
- for (i = 0; i < recv_buf->npages; i++)
- __free_page(recv_buf->page_list[i]);
-
- kvfree(recv_buf->page_list);
-}
-
-static int alloc_recv_pages(struct mlx5_vhca_recv_buf *recv_buf,
- unsigned int npages)
-{
- unsigned int filled = 0, done = 0;
- int i;
-
- recv_buf->page_list = kvcalloc(npages, sizeof(*recv_buf->page_list),
- GFP_KERNEL_ACCOUNT);
- if (!recv_buf->page_list)
- return -ENOMEM;
-
- for (;;) {
- filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT,
- npages - done,
- recv_buf->page_list + done);
- if (!filled)
- goto err;
-
- done += filled;
- if (done == npages)
- break;
- }
-
- recv_buf->npages = npages;
- return 0;
-
-err:
- for (i = 0; i < npages; i++) {
- if (recv_buf->page_list[i])
- __free_page(recv_buf->page_list[i]);
- }
-
- kvfree(recv_buf->page_list);
- return -ENOMEM;
-}
-
-static int register_dma_recv_pages(struct mlx5_core_dev *mdev,
- struct mlx5_vhca_recv_buf *recv_buf)
-{
- int i, j;
-
- recv_buf->dma_addrs = kvcalloc(recv_buf->npages,
- sizeof(*recv_buf->dma_addrs),
- GFP_KERNEL_ACCOUNT);
- if (!recv_buf->dma_addrs)
- return -ENOMEM;
-
- for (i = 0; i < recv_buf->npages; i++) {
- recv_buf->dma_addrs[i] = dma_map_page(mdev->device,
- recv_buf->page_list[i],
- 0, PAGE_SIZE,
- DMA_FROM_DEVICE);
- if (dma_mapping_error(mdev->device, recv_buf->dma_addrs[i]))
- goto error;
- }
- return 0;
-
-error:
- for (j = 0; j < i; j++)
- dma_unmap_single(mdev->device, recv_buf->dma_addrs[j],
- PAGE_SIZE, DMA_FROM_DEVICE);
-
- kvfree(recv_buf->dma_addrs);
- return -ENOMEM;
-}
-
-static void unregister_dma_recv_pages(struct mlx5_core_dev *mdev,
- struct mlx5_vhca_recv_buf *recv_buf)
-{
- int i;
-
- for (i = 0; i < recv_buf->npages; i++)
- dma_unmap_single(mdev->device, recv_buf->dma_addrs[i],
- PAGE_SIZE, DMA_FROM_DEVICE);
-
- kvfree(recv_buf->dma_addrs);
-}
-
static void mlx5vf_free_qp_recv_resources(struct mlx5_core_dev *mdev,
struct mlx5_vhca_qp *qp)
{
struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf;
mlx5_core_destroy_mkey(mdev, recv_buf->mkey);
- unregister_dma_recv_pages(mdev, recv_buf);
- free_recv_pages(&qp->recv_buf);
+ unregister_dma_pages(mdev, recv_buf->npages, recv_buf->mkey_in,
+ &recv_buf->state, DMA_FROM_DEVICE);
+ kvfree(recv_buf->mkey_in);
+ free_page_list(recv_buf->npages, recv_buf->page_list);
}
static int mlx5vf_alloc_qp_recv_resources(struct mlx5_core_dev *mdev,
@@ -1445,24 +1416,38 @@ static int mlx5vf_alloc_qp_recv_resources(struct mlx5_core_dev *mdev,
struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf;
int err;
- err = alloc_recv_pages(recv_buf, npages);
- if (err < 0)
+ err = mlx5vf_add_pages(&recv_buf->page_list, npages);
+ if (err)
return err;
- err = register_dma_recv_pages(mdev, recv_buf);
- if (err)
+ recv_buf->npages = npages;
+
+ recv_buf->mkey_in = alloc_mkey_in(npages, pdn);
+ if (!recv_buf->mkey_in) {
+ err = -ENOMEM;
goto end;
+ }
+
+ err = register_dma_pages(mdev, npages, recv_buf->page_list,
+ recv_buf->mkey_in, &recv_buf->state,
+ DMA_FROM_DEVICE);
+ if (err)
+ goto err_register_dma;
- err = _create_mkey(mdev, pdn, NULL, recv_buf, &recv_buf->mkey);
+ err = create_mkey(mdev, npages, recv_buf->mkey_in, &recv_buf->mkey);
if (err)
goto err_create_mkey;
return 0;
err_create_mkey:
- unregister_dma_recv_pages(mdev, recv_buf);
+ unregister_dma_pages(mdev, npages, recv_buf->mkey_in, &recv_buf->state,
+ DMA_FROM_DEVICE);
+err_register_dma:
+ kvfree(recv_buf->mkey_in);
+ recv_buf->mkey_in = NULL;
end:
- free_recv_pages(recv_buf);
+ free_page_list(npages, recv_buf->page_list);
return err;
}
@@ -1517,7 +1502,8 @@ int mlx5vf_start_page_tracker(struct vfio_device *vdev,
struct mlx5_vhca_qp *host_qp;
struct mlx5_vhca_qp *fw_qp;
struct mlx5_core_dev *mdev;
- u32 max_msg_size = PAGE_SIZE;
+ u32 log_max_msg_size;
+ u32 max_msg_size;
u64 rq_size = SZ_2M;
u32 max_recv_wr;
int err;
@@ -1534,6 +1520,12 @@ int mlx5vf_start_page_tracker(struct vfio_device *vdev,
}
mdev = mvdev->mdev;
+ log_max_msg_size = MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_max_msg_size);
+ max_msg_size = (1ULL << log_max_msg_size);
+ /* The RQ must hold at least 4 WQEs/messages for successful QP creation */
+ if (rq_size < 4ULL * max_msg_size)
+ rq_size = 4ULL * max_msg_size;
+
memset(tracker, 0, sizeof(*tracker));
tracker->uar = mlx5_get_uars_page(mdev);
if (IS_ERR(tracker->uar)) {
@@ -1623,25 +1615,41 @@ set_report_output(u32 size, int index, struct mlx5_vhca_qp *qp,
{
u32 entry_size = MLX5_ST_SZ_BYTES(page_track_report_entry);
u32 nent = size / entry_size;
+ u32 nent_in_page;
+ u32 nent_to_set;
struct page *page;
+ u32 page_offset;
+ u32 page_index;
+ u32 buf_offset;
+ void *kaddr;
u64 addr;
u64 *buf;
int i;
- if (WARN_ON(index >= qp->recv_buf.npages ||
+ buf_offset = index * qp->max_msg_size;
+ if (WARN_ON(buf_offset + size >= qp->recv_buf.npages * PAGE_SIZE ||
(nent > qp->max_msg_size / entry_size)))
return;
- page = qp->recv_buf.page_list[index];
- buf = kmap_local_page(page);
- for (i = 0; i < nent; i++) {
- addr = MLX5_GET(page_track_report_entry, buf + i,
- dirty_address_low);
- addr |= (u64)MLX5_GET(page_track_report_entry, buf + i,
- dirty_address_high) << 32;
- iova_bitmap_set(dirty, addr, qp->tracked_page_size);
- }
- kunmap_local(buf);
+ do {
+ page_index = buf_offset / PAGE_SIZE;
+ page_offset = buf_offset % PAGE_SIZE;
+ nent_in_page = (PAGE_SIZE - page_offset) / entry_size;
+ page = qp->recv_buf.page_list[page_index];
+ kaddr = kmap_local_page(page);
+ buf = kaddr + page_offset;
+ nent_to_set = min(nent, nent_in_page);
+ for (i = 0; i < nent_to_set; i++) {
+ addr = MLX5_GET(page_track_report_entry, buf + i,
+ dirty_address_low);
+ addr |= (u64)MLX5_GET(page_track_report_entry, buf + i,
+ dirty_address_high) << 32;
+ iova_bitmap_set(dirty, addr, qp->tracked_page_size);
+ }
+ kunmap_local(kaddr);
+ buf_offset += (nent_to_set * entry_size);
+ nent -= nent_to_set;
+ } while (nent);
}
static void
diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h
index df421dc6de04..d7821b5ca772 100644
--- a/drivers/vfio/pci/mlx5/cmd.h
+++ b/drivers/vfio/pci/mlx5/cmd.h
@@ -53,20 +53,17 @@ struct mlx5_vf_migration_header {
};
struct mlx5_vhca_data_buffer {
- struct sg_append_table table;
+ struct page **page_list;
+ struct dma_iova_state state;
loff_t start_pos;
u64 length;
- u64 allocated_length;
+ u32 npages;
u32 mkey;
+ u32 *mkey_in;
enum dma_data_direction dma_dir;
- u8 dmaed:1;
u8 stop_copy_chunk_num;
struct list_head buf_elm;
struct mlx5_vf_migration_file *migf;
- /* Optimize mlx5vf_get_migration_page() for sequential access */
- struct scatterlist *last_offset_sg;
- unsigned int sg_last_entry;
- unsigned long last_offset;
};
struct mlx5vf_async_data {
@@ -133,8 +130,9 @@ struct mlx5_vhca_cq {
struct mlx5_vhca_recv_buf {
u32 npages;
struct page **page_list;
- dma_addr_t *dma_addrs;
+ struct dma_iova_state state;
u32 next_rq_offset;
+ u32 *mkey_in;
u32 mkey;
};
@@ -217,15 +215,24 @@ int mlx5vf_cmd_alloc_pd(struct mlx5_vf_migration_file *migf);
void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf);
void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf);
struct mlx5_vhca_data_buffer *
-mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf,
- size_t length, enum dma_data_direction dma_dir);
+mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf, u32 npages,
+ enum dma_data_direction dma_dir);
void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf);
struct mlx5_vhca_data_buffer *
-mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf,
- size_t length, enum dma_data_direction dma_dir);
+mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf, u32 npages,
+ enum dma_data_direction dma_dir);
void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf);
-struct page *mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf,
- unsigned long offset);
+/*
+ * Return the page of @buf that backs byte @offset, or NULL when @offset
+ * falls beyond the buf->npages entries of buf->page_list.
+ */
+static inline struct page *
+mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf,
+			  unsigned long offset)
+{
+	/*
+	 * Keep the index as wide as @offset: narrowing it to int could
+	 * truncate a large offset into an in-range index and defeat the
+	 * bounds check below.
+	 */
+	unsigned long page_entry = offset / PAGE_SIZE;
+
+	if (page_entry >= buf->npages)
+		return NULL;
+
+	return buf->page_list[page_entry];
+}
void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev);
void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev,
enum mlx5_vf_migf_state *last_save_state);
diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c
index 8833e60d42f5..7ec47e736a8e 100644
--- a/drivers/vfio/pci/mlx5/main.c
+++ b/drivers/vfio/pci/mlx5/main.c
@@ -34,37 +34,6 @@ static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev)
core_device);
}
-struct page *
-mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf,
- unsigned long offset)
-{
- unsigned long cur_offset = 0;
- struct scatterlist *sg;
- unsigned int i;
-
- /* All accesses are sequential */
- if (offset < buf->last_offset || !buf->last_offset_sg) {
- buf->last_offset = 0;
- buf->last_offset_sg = buf->table.sgt.sgl;
- buf->sg_last_entry = 0;
- }
-
- cur_offset = buf->last_offset;
-
- for_each_sg(buf->last_offset_sg, sg,
- buf->table.sgt.orig_nents - buf->sg_last_entry, i) {
- if (offset < sg->length + cur_offset) {
- buf->last_offset_sg = sg;
- buf->sg_last_entry += i;
- buf->last_offset = cur_offset;
- return nth_page(sg_page(sg),
- (offset - cur_offset) / PAGE_SIZE);
- }
- cur_offset += sg->length;
- }
- return NULL;
-}
-
static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf)
{
mutex_lock(&migf->lock);
@@ -308,6 +277,7 @@ static struct mlx5_vhca_data_buffer *
mlx5vf_mig_file_get_stop_copy_buf(struct mlx5_vf_migration_file *migf,
u8 index, size_t required_length)
{
+ u32 npages = DIV_ROUND_UP(required_length, PAGE_SIZE);
struct mlx5_vhca_data_buffer *buf = migf->buf[index];
u8 chunk_num;
@@ -315,12 +285,11 @@ mlx5vf_mig_file_get_stop_copy_buf(struct mlx5_vf_migration_file *migf,
chunk_num = buf->stop_copy_chunk_num;
buf->migf->buf[index] = NULL;
/* Checking whether the pre-allocated buffer can fit */
- if (buf->allocated_length >= required_length)
+ if (buf->npages >= npages)
return buf;
mlx5vf_put_data_buffer(buf);
- buf = mlx5vf_get_data_buffer(buf->migf, required_length,
- DMA_FROM_DEVICE);
+ buf = mlx5vf_get_data_buffer(buf->migf, npages, DMA_FROM_DEVICE);
if (IS_ERR(buf))
return buf;
@@ -373,7 +342,8 @@ static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf,
u8 *to_buff;
int ret;
- header_buf = mlx5vf_get_data_buffer(migf, size, DMA_NONE);
+ header_buf = mlx5vf_get_data_buffer(migf, DIV_ROUND_UP(size, PAGE_SIZE),
+ DMA_NONE);
if (IS_ERR(header_buf))
return PTR_ERR(header_buf);
@@ -388,7 +358,7 @@ static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf,
to_buff = kmap_local_page(page);
memcpy(to_buff, &header, sizeof(header));
header_buf->length = sizeof(header);
- data.stop_copy_size = cpu_to_le64(migf->buf[0]->allocated_length);
+ data.stop_copy_size = cpu_to_le64(migf->buf[0]->npages * PAGE_SIZE);
memcpy(to_buff + sizeof(header), &data, sizeof(data));
header_buf->length += sizeof(data);
kunmap_local(to_buff);
@@ -437,15 +407,20 @@ static int mlx5vf_prep_stop_copy(struct mlx5vf_pci_core_device *mvdev,
num_chunks = mvdev->chunk_mode ? MAX_NUM_CHUNKS : 1;
for (i = 0; i < num_chunks; i++) {
- buf = mlx5vf_get_data_buffer(migf, inc_state_size, DMA_FROM_DEVICE);
+ buf = mlx5vf_get_data_buffer(
+ migf, DIV_ROUND_UP(inc_state_size, PAGE_SIZE),
+ DMA_FROM_DEVICE);
if (IS_ERR(buf)) {
ret = PTR_ERR(buf);
goto err;
}
migf->buf[i] = buf;
- buf = mlx5vf_get_data_buffer(migf,
- sizeof(struct mlx5_vf_migration_header), DMA_NONE);
+ buf = mlx5vf_get_data_buffer(
+ migf,
+ DIV_ROUND_UP(sizeof(struct mlx5_vf_migration_header),
+ PAGE_SIZE),
+ DMA_NONE);
if (IS_ERR(buf)) {
ret = PTR_ERR(buf);
goto err;
@@ -553,7 +528,8 @@ static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd,
* We finished transferring the current state and the device has a
* dirty state, save a new state to be ready for.
*/
- buf = mlx5vf_get_data_buffer(migf, inc_length, DMA_FROM_DEVICE);
+ buf = mlx5vf_get_data_buffer(migf, DIV_ROUND_UP(inc_length, PAGE_SIZE),
+ DMA_FROM_DEVICE);
if (IS_ERR(buf)) {
ret = PTR_ERR(buf);
mlx5vf_mark_err(migf);
@@ -675,8 +651,8 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track)
if (track) {
/* leave the allocated buffer ready for the stop-copy phase */
- buf = mlx5vf_alloc_data_buffer(migf,
- migf->buf[0]->allocated_length, DMA_FROM_DEVICE);
+ buf = mlx5vf_alloc_data_buffer(migf, migf->buf[0]->npages,
+ DMA_FROM_DEVICE);
if (IS_ERR(buf)) {
ret = PTR_ERR(buf);
goto out_pd;
@@ -917,11 +893,14 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
goto out_unlock;
break;
case MLX5_VF_LOAD_STATE_PREP_HEADER_DATA:
- if (vhca_buf_header->allocated_length < migf->record_size) {
+ {
+ u32 npages = DIV_ROUND_UP(migf->record_size, PAGE_SIZE);
+
+ if (vhca_buf_header->npages < npages) {
mlx5vf_free_data_buffer(vhca_buf_header);
- migf->buf_header[0] = mlx5vf_alloc_data_buffer(migf,
- migf->record_size, DMA_NONE);
+ migf->buf_header[0] = mlx5vf_alloc_data_buffer(
+ migf, npages, DMA_NONE);
if (IS_ERR(migf->buf_header[0])) {
ret = PTR_ERR(migf->buf_header[0]);
migf->buf_header[0] = NULL;
@@ -934,6 +913,7 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
vhca_buf_header->start_pos = migf->max_pos;
migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER_DATA;
break;
+ }
case MLX5_VF_LOAD_STATE_READ_HEADER_DATA:
ret = mlx5vf_resume_read_header_data(migf, vhca_buf_header,
&buf, &len, pos, &done);
@@ -944,12 +924,13 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
{
u64 size = max(migf->record_size,
migf->stop_copy_prep_size);
+ u32 npages = DIV_ROUND_UP(size, PAGE_SIZE);
- if (vhca_buf->allocated_length < size) {
+ if (vhca_buf->npages < npages) {
mlx5vf_free_data_buffer(vhca_buf);
- migf->buf[0] = mlx5vf_alloc_data_buffer(migf,
- size, DMA_TO_DEVICE);
+ migf->buf[0] = mlx5vf_alloc_data_buffer(
+ migf, npages, DMA_TO_DEVICE);
if (IS_ERR(migf->buf[0])) {
ret = PTR_ERR(migf->buf[0]);
migf->buf[0] = NULL;
@@ -1037,8 +1018,11 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev)
}
migf->buf[0] = buf;
- buf = mlx5vf_alloc_data_buffer(migf,
- sizeof(struct mlx5_vf_migration_header), DMA_NONE);
+ buf = mlx5vf_alloc_data_buffer(
+ migf,
+ DIV_ROUND_UP(sizeof(struct mlx5_vf_migration_header),
+ PAGE_SIZE),
+ DMA_NONE);
if (IS_ERR(buf)) {
ret = PTR_ERR(buf);
goto out_buf;
@@ -1148,7 +1132,8 @@ mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
MLX5VF_QUERY_INC | MLX5VF_QUERY_CLEANUP);
if (ret)
return ERR_PTR(ret);
- buf = mlx5vf_get_data_buffer(migf, size, DMA_FROM_DEVICE);
+ buf = mlx5vf_get_data_buffer(migf,
+ DIV_ROUND_UP(size, PAGE_SIZE), DMA_FROM_DEVICE);
if (IS_ERR(buf))
return ERR_CAST(buf);
/* pre_copy cleanup */
@@ -1387,6 +1372,7 @@ static const struct vfio_device_ops mlx5vf_pci_ops = {
.mmap = vfio_pci_core_mmap,
.request = vfio_pci_core_request,
.match = vfio_pci_core_match,
+ .match_token_uuid = vfio_pci_core_match_token_uuid,
.bind_iommufd = vfio_iommufd_physical_bind,
.unbind_iommufd = vfio_iommufd_physical_unbind,
.attach_ioas = vfio_iommufd_physical_attach_ioas,
@@ -1446,7 +1432,7 @@ static struct pci_driver mlx5vf_pci_driver = {
module_pci_driver(mlx5vf_pci_driver);
-MODULE_IMPORT_NS(IOMMUFD);
+MODULE_IMPORT_NS("IOMMUFD");
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Max Gurtovoy <mgurtovoy@nvidia.com>");
MODULE_AUTHOR("Yishai Hadas <yishaih@nvidia.com>");
diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c
index a467085038f0..d95761dcdd58 100644
--- a/drivers/vfio/pci/nvgrace-gpu/main.c
+++ b/drivers/vfio/pci/nvgrace-gpu/main.c
@@ -5,6 +5,8 @@
#include <linux/sizes.h>
#include <linux/vfio_pci_core.h>
+#include <linux/delay.h>
+#include <linux/jiffies.h>
/*
* The device memory usable to the workloads running in the VM is cached
@@ -17,12 +19,21 @@
#define RESMEM_REGION_INDEX VFIO_PCI_BAR2_REGION_INDEX
#define USEMEM_REGION_INDEX VFIO_PCI_BAR4_REGION_INDEX
-/* Memory size expected as non cached and reserved by the VM driver */
-#define RESMEM_SIZE SZ_1G
-
/* A hardwired and constant ABI value between the GPU FW and VFIO driver. */
#define MEMBLK_SIZE SZ_512M
+#define DVSEC_BITMAP_OFFSET 0xA
+#define MIG_SUPPORTED_WITH_CACHED_RESMEM BIT(0)
+
+#define GPU_CAP_DVSEC_REGISTER 3
+
+#define C2C_LINK_BAR0_OFFSET 0x1498
+#define HBM_TRAINING_BAR0_OFFSET 0x200BC
+#define STATUS_READY 0xFF
+
+#define POLL_QUANTUM_MS 1000
+#define POLL_TIMEOUT_MS (30 * 1000)
+
/*
* The state of the two device memory region - resmem and usemem - is
* saved as struct mem_region.
@@ -46,6 +57,7 @@ struct nvgrace_gpu_pci_core_device {
struct mem_region resmem;
/* Lock to control device memory kernel mapping */
struct mutex remap_lock;
+ bool has_mig_hw_bug;
};
static void nvgrace_gpu_init_fake_bar_emu_regs(struct vfio_device *core_vdev)
@@ -66,7 +78,7 @@ nvgrace_gpu_memregion(int index,
if (index == USEMEM_REGION_INDEX)
return &nvdev->usemem;
- if (index == RESMEM_REGION_INDEX)
+ if (nvdev->resmem.memlength && index == RESMEM_REGION_INDEX)
return &nvdev->resmem;
return NULL;
@@ -684,6 +696,7 @@ static const struct vfio_device_ops nvgrace_gpu_pci_ops = {
.mmap = nvgrace_gpu_mmap,
.request = vfio_pci_core_request,
.match = vfio_pci_core_match,
+ .match_token_uuid = vfio_pci_core_match_token_uuid,
.bind_iommufd = vfio_iommufd_physical_bind,
.unbind_iommufd = vfio_iommufd_physical_unbind,
.attach_ioas = vfio_iommufd_physical_attach_ioas,
@@ -703,6 +716,7 @@ static const struct vfio_device_ops nvgrace_gpu_pci_core_ops = {
.mmap = vfio_pci_core_mmap,
.request = vfio_pci_core_request,
.match = vfio_pci_core_match,
+ .match_token_uuid = vfio_pci_core_match_token_uuid,
.bind_iommufd = vfio_iommufd_physical_bind,
.unbind_iommufd = vfio_iommufd_physical_unbind,
.attach_ioas = vfio_iommufd_physical_attach_ioas,
@@ -751,40 +765,67 @@ nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev,
u64 memphys, u64 memlength)
{
int ret = 0;
+ u64 resmem_size = 0;
/*
- * The VM GPU device driver needs a non-cacheable region to support
- * the MIG feature. Since the device memory is mapped as NORMAL cached,
- * carve out a region from the end with a different NORMAL_NC
- * property (called as reserved memory and represented as resmem). This
- * region then is exposed as a 64b BAR (region 2 and 3) to the VM, while
- * exposing the rest (termed as usable memory and represented using usemem)
- * as cacheable 64b BAR (region 4 and 5).
+ * On Grace Hopper systems, the VM GPU device driver needs a non-cacheable
+ * region to support the MIG feature owing to a hardware bug. Since the
+ * device memory is mapped as NORMAL cached, carve out a region from the end
+ * with a different NORMAL_NC property (called as reserved memory and
+ * represented as resmem). This region then is exposed as a 64b BAR
+ * (region 2 and 3) to the VM, while exposing the rest (termed as usable
+ * memory and represented using usemem) as cacheable 64b BAR (region 4 and 5).
*
* devmem (memlength)
* |-------------------------------------------------|
* | |
* usemem.memphys resmem.memphys
+ *
+ * This hardware bug is fixed on the Grace Blackwell platforms and the
+ * presence of the bug can be determined through nvdev->has_mig_hw_bug.
+ * Thus on systems with the hardware fix, there is no need to partition
+ * the GPU device memory and the entire memory is usable and mapped as
+ * NORMAL cached (i.e. resmem size is 0).
*/
+ if (nvdev->has_mig_hw_bug)
+ resmem_size = SZ_1G;
+
nvdev->usemem.memphys = memphys;
/*
* The device memory exposed to the VM is added to the kernel by the
- * VM driver module in chunks of memory block size. Only the usable
- * memory (usemem) is added to the kernel for usage by the VM
- * workloads. Make the usable memory size memblock aligned.
+ * VM driver module in chunks of memory block size. Note that only the
+ * usable memory (usemem) is added to the kernel for usage by the VM
+ * workloads.
*/
- if (check_sub_overflow(memlength, RESMEM_SIZE,
+ if (check_sub_overflow(memlength, resmem_size,
&nvdev->usemem.memlength)) {
ret = -EOVERFLOW;
goto done;
}
/*
- * The USEMEM part of the device memory has to be MEMBLK_SIZE
- * aligned. This is a hardwired ABI value between the GPU FW and
- * VFIO driver. The VM device driver is also aware of it and make
- * use of the value for its calculation to determine USEMEM size.
+ * The usemem region is exposed as a 64b BAR composed of regions 4 and 5.
+ * Calculate and save the BAR size for the region.
+ */
+ nvdev->usemem.bar_size = roundup_pow_of_two(nvdev->usemem.memlength);
+
+ /*
+ * If the hardware has the fix for MIG, there is no requirement
+ * for splitting the device memory to create RESMEM. The entire
+ * device memory is usable and will be USEMEM. Return here for
+ * such case.
+ */
+ if (!nvdev->has_mig_hw_bug)
+ goto done;
+
+ /*
+ * When the device memory is split to work around the MIG bug on
+ * Grace Hopper, the USEMEM part of the device memory has to be
+ * MEMBLK_SIZE aligned. This is a hardwired ABI value between the
+ * GPU FW and VFIO driver. The VM device driver is also aware of it
+ * and makes use of the value for its calculation to determine USEMEM
+ * size. Note that the device memory may not be 512M aligned.
*/
nvdev->usemem.memlength = round_down(nvdev->usemem.memlength,
MEMBLK_SIZE);
@@ -803,15 +844,93 @@ nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev,
}
/*
- * The memory regions are exposed as BARs. Calculate and save
- * the BAR size for them.
+ * The resmem region is exposed as a 64b BAR composed of region 2 and 3
+ * for Grace Hopper. Calculate and save the BAR size for the region.
*/
- nvdev->usemem.bar_size = roundup_pow_of_two(nvdev->usemem.memlength);
nvdev->resmem.bar_size = roundup_pow_of_two(nvdev->resmem.memlength);
done:
return ret;
}
+/*
+ * Report whether the device should be treated as having the MIG hardware bug.
+ *
+ * Returns false only when the NVIDIA DVSEC capability is found, its bitmap
+ * word is read successfully and MIG_SUPPORTED_WITH_CACHED_RESMEM is set in
+ * it. Returns true in every other case, including a failed config read.
+ */
+static bool nvgrace_gpu_has_mig_hw_bug(struct pci_dev *pdev)
+{
+	int pcie_dvsec;
+	u16 dvsec_ctrl16 = 0;
+
+	pcie_dvsec = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_NVIDIA,
+					       GPU_CAP_DVSEC_REGISTER);
+	if (!pcie_dvsec)
+		return true;
+
+	/*
+	 * The value is not meaningful when the read fails, so do not let it
+	 * be mistaken for the "fix present" bit; assume the bug instead.
+	 */
+	if (pci_read_config_word(pdev, pcie_dvsec + DVSEC_BITMAP_OFFSET,
+				 &dvsec_ctrl16))
+		return true;
+
+	return !(dvsec_ctrl16 & MIG_SUPPORTED_WITH_CACHED_RESMEM);
+}
+
+/*
+ * To reduce the system bootup time, the HBM training has
+ * been moved out of the UEFI on the Grace-Blackwell systems.
+ *
+ * The onus of checking whether the HBM training has completed
+ * thus falls on the module. The HBM training status can be
+ * determined from a BAR0 register.
+ *
+ * Similarly, another BAR0 register exposes the status of the
+ * CPU-GPU chip-to-chip (C2C) cache coherent interconnect.
+ *
+ * Poll these registers and check for 30s. If the HBM training is
+ * not complete or if the C2C link is not ready, fail the probe.
+ *
+ * While the wait is not required on Grace Hopper systems, it
+ * is beneficial to make the check to ensure the device is in an
+ * expected state.
+ *
+ * Ensure that the BAR0 region is enabled before accessing the
+ * registers.
+ *
+ * Returns 0 once both registers read STATUS_READY, -ETIME if they do
+ * not do so before the timeout expires, -ENOMEM if BAR0 cannot be
+ * mapped, or the error returned by pci_enable_device() or
+ * pci_request_selected_regions().
+ */
+static int nvgrace_gpu_wait_device_ready(struct pci_dev *pdev)
+{
+	unsigned long timeout = jiffies + msecs_to_jiffies(POLL_TIMEOUT_MS);
+	void __iomem *io;
+	int ret;
+
+	ret = pci_enable_device(pdev);
+	if (ret)
+		return ret;
+
+	ret = pci_request_selected_regions(pdev, 1 << 0, KBUILD_MODNAME);
+	if (ret)
+		goto request_region_exit;
+
+	io = pci_iomap(pdev, 0, 0);
+	if (!io) {
+		ret = -ENOMEM;
+		goto iomap_exit;
+	}
+
+	/*
+	 * The setup calls above left ret at 0. Arm the timeout error here so
+	 * that falling out of the poll loop is reported as a failure rather
+	 * than as success.
+	 */
+	ret = -ETIME;
+	do {
+		if ((ioread32(io + C2C_LINK_BAR0_OFFSET) == STATUS_READY) &&
+		    (ioread32(io + HBM_TRAINING_BAR0_OFFSET) == STATUS_READY)) {
+			ret = 0;
+			break;
+		}
+		msleep(POLL_QUANTUM_MS);
+	} while (!time_after(jiffies, timeout));
+
+	pci_iounmap(pdev, io);
+iomap_exit:
+	pci_release_selected_regions(pdev, 1 << 0);
+request_region_exit:
+	pci_disable_device(pdev);
+	return ret;
+}
+
static int nvgrace_gpu_probe(struct pci_dev *pdev,
const struct pci_device_id *id)
{
@@ -820,6 +939,10 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev,
u64 memphys, memlength;
int ret;
+ ret = nvgrace_gpu_wait_device_ready(pdev);
+ if (ret)
+ return ret;
+
ret = nvgrace_gpu_fetch_memory_property(pdev, &memphys, &memlength);
if (!ret)
ops = &nvgrace_gpu_pci_ops;
@@ -832,6 +955,8 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev,
dev_set_drvdata(&pdev->dev, &nvdev->core_device);
if (ops == &nvgrace_gpu_pci_ops) {
+ nvdev->has_mig_hw_bug = nvgrace_gpu_has_mig_hw_bug(pdev);
+
/*
* Device memory properties are identified in the host ACPI
* table. Set the nvgrace_gpu_pci_core_device structure.
@@ -868,6 +993,8 @@ static const struct pci_device_id nvgrace_gpu_vfio_pci_table[] = {
{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2345) },
/* GH200 SKU */
{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2348) },
+ /* GB200 SKU */
+ { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2941) },
{}
};
diff --git a/drivers/vfio/pci/pds/pci_drv.c b/drivers/vfio/pci/pds/pci_drv.c
index 16e93b11ab1b..4923f1823126 100644
--- a/drivers/vfio/pci/pds/pci_drv.c
+++ b/drivers/vfio/pci/pds/pci_drv.c
@@ -187,7 +187,7 @@ static struct pci_driver pds_vfio_pci_driver = {
module_pci_driver(pds_vfio_pci_driver);
-MODULE_IMPORT_NS(IOMMUFD);
+MODULE_IMPORT_NS("IOMMUFD");
MODULE_DESCRIPTION(PDS_VFIO_DRV_DESCRIPTION);
MODULE_AUTHOR("Brett Creeley <brett.creeley@amd.com>");
MODULE_LICENSE("GPL");
diff --git a/drivers/vfio/pci/pds/vfio_dev.c b/drivers/vfio/pci/pds/vfio_dev.c
index 76a80ae7087b..f3ccb0008f67 100644
--- a/drivers/vfio/pci/pds/vfio_dev.c
+++ b/drivers/vfio/pci/pds/vfio_dev.c
@@ -201,9 +201,11 @@ static const struct vfio_device_ops pds_vfio_ops = {
.mmap = vfio_pci_core_mmap,
.request = vfio_pci_core_request,
.match = vfio_pci_core_match,
+ .match_token_uuid = vfio_pci_core_match_token_uuid,
.bind_iommufd = vfio_iommufd_physical_bind,
.unbind_iommufd = vfio_iommufd_physical_unbind,
.attach_ioas = vfio_iommufd_physical_attach_ioas,
+ .detach_ioas = vfio_iommufd_physical_detach_ioas,
};
const struct vfio_device_ops *pds_vfio_ops_info(void)
diff --git a/drivers/vfio/pci/qat/main.c b/drivers/vfio/pci/qat/main.c
index c78cb6de9390..a19b68043eb2 100644
--- a/drivers/vfio/pci/qat/main.c
+++ b/drivers/vfio/pci/qat/main.c
@@ -614,6 +614,7 @@ static const struct vfio_device_ops qat_vf_pci_ops = {
.mmap = vfio_pci_core_mmap,
.request = vfio_pci_core_request,
.match = vfio_pci_core_match,
+ .match_token_uuid = vfio_pci_core_match_token_uuid,
.bind_iommufd = vfio_iommufd_physical_bind,
.unbind_iommufd = vfio_iommufd_physical_unbind,
.attach_ioas = vfio_iommufd_physical_attach_ioas,
@@ -675,6 +676,8 @@ static const struct pci_device_id qat_vf_vfio_pci_table[] = {
{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_INTEL, 0x4941) },
{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_INTEL, 0x4943) },
{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_INTEL, 0x4945) },
+ /* Intel QAT GEN6 6xxx VF device */
+ { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_INTEL, 0x4949) },
{}
};
MODULE_DEVICE_TABLE(pci, qat_vf_vfio_pci_table);
@@ -696,5 +699,5 @@ module_pci_driver(qat_vf_vfio_pci_driver);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Xin Zeng <xin.zeng@intel.com>");
-MODULE_DESCRIPTION("QAT VFIO PCI - VFIO PCI driver with live migration support for Intel(R) QAT GEN4 device family");
-MODULE_IMPORT_NS(CRYPTO_QAT);
+MODULE_DESCRIPTION("QAT VFIO PCI - VFIO PCI driver with live migration support for Intel(R) QAT device family");
+MODULE_IMPORT_NS("CRYPTO_QAT");
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index e727941f589d..ac10f14417f2 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -111,9 +111,7 @@ static int vfio_pci_open_device(struct vfio_device *core_vdev)
if (ret)
return ret;
- if (vfio_pci_is_vga(pdev) &&
- pdev->vendor == PCI_VENDOR_ID_INTEL &&
- IS_ENABLED(CONFIG_VFIO_PCI_IGD)) {
+ if (vfio_pci_is_intel_display(pdev)) {
ret = vfio_pci_igd_init(vdev);
if (ret && ret != -ENODEV) {
pci_warn(pdev, "Failed to setup Intel IGD regions\n");
@@ -140,10 +138,13 @@ static const struct vfio_device_ops vfio_pci_ops = {
.mmap = vfio_pci_core_mmap,
.request = vfio_pci_core_request,
.match = vfio_pci_core_match,
+ .match_token_uuid = vfio_pci_core_match_token_uuid,
.bind_iommufd = vfio_iommufd_physical_bind,
.unbind_iommufd = vfio_iommufd_physical_unbind,
.attach_ioas = vfio_iommufd_physical_attach_ioas,
.detach_ioas = vfio_iommufd_physical_detach_ioas,
+ .pasid_attach_ioas = vfio_iommufd_physical_pasid_attach_ioas,
+ .pasid_detach_ioas = vfio_iommufd_physical_pasid_detach_ioas,
};
static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c
index ea2745c1ac5e..8f02f236b5b4 100644
--- a/drivers/vfio/pci/vfio_pci_config.c
+++ b/drivers/vfio/pci/vfio_pci_config.c
@@ -511,13 +511,13 @@ static void vfio_bar_fixup(struct vfio_pci_core_device *vdev)
mask = ~(pci_resource_len(pdev, PCI_ROM_RESOURCE) - 1);
mask |= PCI_ROM_ADDRESS_ENABLE;
*vbar &= cpu_to_le32((u32)mask);
- } else if (pdev->resource[PCI_ROM_RESOURCE].flags &
- IORESOURCE_ROM_SHADOW) {
- mask = ~(0x20000 - 1);
+ } else if (pdev->rom && pdev->romlen) {
+ mask = ~(roundup_pow_of_two(pdev->romlen) - 1);
mask |= PCI_ROM_ADDRESS_ENABLE;
*vbar &= cpu_to_le32((u32)mask);
- } else
+ } else {
*vbar = 0;
+ }
vdev->bardirty = false;
}
@@ -1389,11 +1389,12 @@ static int vfio_ext_cap_len(struct vfio_pci_core_device *vdev, u16 ecap, u16 epo
switch (ecap) {
case PCI_EXT_CAP_ID_VNDR:
- ret = pci_read_config_dword(pdev, epos + PCI_VSEC_HDR, &dword);
+ ret = pci_read_config_dword(pdev, epos + PCI_VNDR_HEADER,
+ &dword);
if (ret)
return pcibios_err_to_errno(ret);
- return dword >> PCI_VSEC_HDR_LEN_SHIFT;
+ return PCI_VNDR_HEADER_LEN(dword);
case PCI_EXT_CAP_ID_VC:
case PCI_EXT_CAP_ID_VC9:
case PCI_EXT_CAP_ID_MFVC:
@@ -1813,7 +1814,8 @@ int vfio_config_init(struct vfio_pci_core_device *vdev)
cpu_to_le16(PCI_COMMAND_MEMORY);
}
- if (!IS_ENABLED(CONFIG_VFIO_PCI_INTX) || vdev->nointx)
+ if (!IS_ENABLED(CONFIG_VFIO_PCI_INTX) || vdev->nointx ||
+ !vdev->pdev->irq || vdev->pdev->irq == IRQ_NOTCONNECTED)
vconfig[PCI_INTERRUPT_PIN] = 0;
ret = vfio_cap_init(vdev);
diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index 1ab58da9f38a..7dcf5439dedc 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -20,7 +20,6 @@
#include <linux/mutex.h>
#include <linux/notifier.h>
#include <linux/pci.h>
-#include <linux/pfn_t.h>
#include <linux/pm_runtime.h>
#include <linux/slab.h>
#include <linux/types.h>
@@ -116,7 +115,7 @@ static void vfio_pci_probe_mmaps(struct vfio_pci_core_device *vdev)
res = &vdev->pdev->resource[bar];
- if (!IS_ENABLED(CONFIG_VFIO_PCI_MMAP))
+ if (vdev->pdev->non_mappable_bars)
goto no_mmap;
if (!(res->flags & IORESOURCE_MEM))
@@ -727,15 +726,7 @@ EXPORT_SYMBOL_GPL(vfio_pci_core_finish_enable);
static int vfio_pci_get_irq_count(struct vfio_pci_core_device *vdev, int irq_type)
{
if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) {
- u8 pin;
-
- if (!IS_ENABLED(CONFIG_VFIO_PCI_INTX) ||
- vdev->nointx || vdev->pdev->is_virtfn)
- return 0;
-
- pci_read_config_byte(vdev->pdev, PCI_INTERRUPT_PIN, &pin);
-
- return pin ? 1 : 0;
+ return vdev->vconfig[PCI_INTERRUPT_PIN] ? 1 : 0;
} else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX) {
u8 pos;
u16 flags;
@@ -1054,31 +1045,27 @@ static int vfio_pci_ioctl_get_region_info(struct vfio_pci_core_device *vdev,
info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
info.flags = 0;
+ info.size = 0;
- /* Report the BAR size, not the ROM size */
- info.size = pci_resource_len(pdev, info.index);
- if (!info.size) {
- /* Shadow ROMs appear as PCI option ROMs */
- if (pdev->resource[PCI_ROM_RESOURCE].flags &
- IORESOURCE_ROM_SHADOW)
- info.size = 0x20000;
- else
- break;
- }
-
- /*
- * Is it really there? Enable memory decode for implicit access
- * in pci_map_rom().
- */
- cmd = vfio_pci_memory_lock_and_enable(vdev);
- io = pci_map_rom(pdev, &size);
- if (io) {
+ if (pci_resource_start(pdev, PCI_ROM_RESOURCE)) {
+ /*
+ * Check ROM content is valid. Need to enable memory
+ * decode for ROM access in pci_map_rom().
+ */
+ cmd = vfio_pci_memory_lock_and_enable(vdev);
+ io = pci_map_rom(pdev, &size);
+ if (io) {
+ info.flags = VFIO_REGION_INFO_FLAG_READ;
+ /* Report the BAR size, not the ROM size. */
+ info.size = pci_resource_len(pdev, PCI_ROM_RESOURCE);
+ pci_unmap_rom(pdev, io);
+ }
+ vfio_pci_memory_unlock_and_restore(vdev, cmd);
+ } else if (pdev->rom && pdev->romlen) {
info.flags = VFIO_REGION_INFO_FLAG_READ;
- pci_unmap_rom(pdev, io);
- } else {
- info.size = 0;
+ /* Report BAR size as power of two. */
+ info.size = roundup_pow_of_two(pdev->romlen);
}
- vfio_pci_memory_unlock_and_restore(vdev, cmd);
break;
}
@@ -1658,17 +1645,18 @@ static vm_fault_t vfio_pci_mmap_huge_fault(struct vm_fault *vmf,
{
struct vm_area_struct *vma = vmf->vma;
struct vfio_pci_core_device *vdev = vma->vm_private_data;
- unsigned long pfn, pgoff = vmf->pgoff - vma->vm_pgoff;
+ unsigned long addr = vmf->address & ~((PAGE_SIZE << order) - 1);
+ unsigned long pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
+ unsigned long pfn = vma_to_pfn(vma) + pgoff;
vm_fault_t ret = VM_FAULT_SIGBUS;
- if (order && (vmf->address & ((PAGE_SIZE << order) - 1) ||
- vmf->address + (PAGE_SIZE << order) > vma->vm_end)) {
+ if (order && (addr < vma->vm_start ||
+ addr + (PAGE_SIZE << order) > vma->vm_end ||
+ pfn & ((1 << order) - 1))) {
ret = VM_FAULT_FALLBACK;
goto out;
}
- pfn = vma_to_pfn(vma);
-
down_read(&vdev->memory_lock);
if (vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev))
@@ -1676,18 +1664,16 @@ static vm_fault_t vfio_pci_mmap_huge_fault(struct vm_fault *vmf,
switch (order) {
case 0:
- ret = vmf_insert_pfn(vma, vmf->address, pfn + pgoff);
+ ret = vmf_insert_pfn(vma, vmf->address, pfn);
break;
#ifdef CONFIG_ARCH_SUPPORTS_PMD_PFNMAP
case PMD_ORDER:
- ret = vmf_insert_pfn_pmd(vmf, __pfn_to_pfn_t(pfn + pgoff,
- PFN_DEV), false);
+ ret = vmf_insert_pfn_pmd(vmf, pfn, false);
break;
#endif
#ifdef CONFIG_ARCH_SUPPORTS_PUD_PFNMAP
case PUD_ORDER:
- ret = vmf_insert_pfn_pud(vmf, __pfn_to_pfn_t(pfn + pgoff,
- PFN_DEV), false);
+ ret = vmf_insert_pfn_pud(vmf, pfn, false);
break;
#endif
default:
@@ -1832,9 +1818,13 @@ void vfio_pci_core_request(struct vfio_device *core_vdev, unsigned int count)
}
EXPORT_SYMBOL_GPL(vfio_pci_core_request);
-static int vfio_pci_validate_vf_token(struct vfio_pci_core_device *vdev,
- bool vf_token, uuid_t *uuid)
+int vfio_pci_core_match_token_uuid(struct vfio_device *core_vdev,
+ const uuid_t *uuid)
+
{
+ struct vfio_pci_core_device *vdev =
+ container_of(core_vdev, struct vfio_pci_core_device, vdev);
+
/*
* There's always some degree of trust or collaboration between SR-IOV
* PF and VFs, even if just that the PF hosts the SR-IOV capability and
@@ -1865,7 +1855,7 @@ static int vfio_pci_validate_vf_token(struct vfio_pci_core_device *vdev,
bool match;
if (!pf_vdev) {
- if (!vf_token)
+ if (!uuid)
return 0; /* PF is not vfio-pci, no VF token */
pci_info_ratelimited(vdev->pdev,
@@ -1873,7 +1863,7 @@ static int vfio_pci_validate_vf_token(struct vfio_pci_core_device *vdev,
return -EINVAL;
}
- if (!vf_token) {
+ if (!uuid) {
pci_info_ratelimited(vdev->pdev,
"VF token required to access device\n");
return -EACCES;
@@ -1891,7 +1881,7 @@ static int vfio_pci_validate_vf_token(struct vfio_pci_core_device *vdev,
} else if (vdev->vf_token) {
mutex_lock(&vdev->vf_token->lock);
if (vdev->vf_token->users) {
- if (!vf_token) {
+ if (!uuid) {
mutex_unlock(&vdev->vf_token->lock);
pci_info_ratelimited(vdev->pdev,
"VF token required to access device\n");
@@ -1904,12 +1894,12 @@ static int vfio_pci_validate_vf_token(struct vfio_pci_core_device *vdev,
"Incorrect VF token provided for device\n");
return -EACCES;
}
- } else if (vf_token) {
+ } else if (uuid) {
uuid_copy(&vdev->vf_token->uuid, uuid);
}
mutex_unlock(&vdev->vf_token->lock);
- } else if (vf_token) {
+ } else if (uuid) {
pci_info_ratelimited(vdev->pdev,
"VF token incorrectly provided, not a PF or VF\n");
return -EINVAL;
@@ -1917,6 +1907,7 @@ static int vfio_pci_validate_vf_token(struct vfio_pci_core_device *vdev,
return 0;
}
+EXPORT_SYMBOL_GPL(vfio_pci_core_match_token_uuid);
#define VF_TOKEN_ARG "vf_token="
@@ -1963,7 +1954,8 @@ int vfio_pci_core_match(struct vfio_device *core_vdev, char *buf)
}
}
- ret = vfio_pci_validate_vf_token(vdev, vf_token, &uuid);
+ ret = core_vdev->ops->match_token_uuid(core_vdev,
+ vf_token ? &uuid : NULL);
if (ret)
return ret;
@@ -2160,7 +2152,7 @@ int vfio_pci_core_register_device(struct vfio_pci_core_device *vdev)
return -EBUSY;
}
- if (pci_is_root_bus(pdev->bus)) {
+ if (pci_is_root_bus(pdev->bus) || pdev->is_virtfn) {
ret = vfio_assign_device_set(&vdev->vdev, vdev);
} else if (!pci_probe_reset_slot(pdev->slot)) {
ret = vfio_assign_device_set(&vdev->vdev, pdev->slot);
diff --git a/drivers/vfio/pci/vfio_pci_igd.c b/drivers/vfio/pci/vfio_pci_igd.c
index dd70e2431bd7..988b6919c2c3 100644
--- a/drivers/vfio/pci/vfio_pci_igd.c
+++ b/drivers/vfio/pci/vfio_pci_igd.c
@@ -435,6 +435,11 @@ static int vfio_pci_igd_cfg_init(struct vfio_pci_core_device *vdev)
return 0;
}
+bool vfio_pci_is_intel_display(struct pci_dev *pdev)
+{
+ return (pdev->vendor == PCI_VENDOR_ID_INTEL) && pci_is_display(pdev);
+}
+
int vfio_pci_igd_init(struct vfio_pci_core_device *vdev)
{
int ret;
diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c
index 8382c5834335..123298a4dc8f 100644
--- a/drivers/vfio/pci/vfio_pci_intrs.c
+++ b/drivers/vfio/pci/vfio_pci_intrs.c
@@ -259,7 +259,7 @@ static int vfio_intx_enable(struct vfio_pci_core_device *vdev,
if (!is_irq_none(vdev))
return -EINVAL;
- if (!pdev->irq)
+ if (!pdev->irq || pdev->irq == IRQ_NOTCONNECTED)
return -ENODEV;
name = kasprintf(GFP_KERNEL_ACCOUNT, "vfio-intx(%s)", pci_name(pdev));
@@ -505,15 +505,11 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_core_device *vdev,
if (ret)
goto out_put_eventfd_ctx;
- ctx->producer.token = trigger;
- ctx->producer.irq = irq;
- ret = irq_bypass_register_producer(&ctx->producer);
+ ret = irq_bypass_register_producer(&ctx->producer, trigger, irq);
if (unlikely(ret)) {
dev_info(&pdev->dev,
- "irq bypass producer (token %p) registration fails: %d\n",
- ctx->producer.token, ret);
-
- ctx->producer.token = NULL;
+ "irq bypass producer (eventfd %p) registration fails: %d\n",
+ trigger, ret);
}
ctx->trigger = trigger;
diff --git a/drivers/vfio/pci/vfio_pci_priv.h b/drivers/vfio/pci/vfio_pci_priv.h
index 5e4fa69aee16..a9972eacb293 100644
--- a/drivers/vfio/pci/vfio_pci_priv.h
+++ b/drivers/vfio/pci/vfio_pci_priv.h
@@ -67,8 +67,14 @@ void vfio_pci_memory_unlock_and_restore(struct vfio_pci_core_device *vdev,
u16 cmd);
#ifdef CONFIG_VFIO_PCI_IGD
+bool vfio_pci_is_intel_display(struct pci_dev *pdev);
int vfio_pci_igd_init(struct vfio_pci_core_device *vdev);
#else
+static inline bool vfio_pci_is_intel_display(struct pci_dev *pdev)
+{
+ return false;
+}
+
static inline int vfio_pci_igd_init(struct vfio_pci_core_device *vdev)
{
return -ENODEV;
diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c
index 66b72c289284..6192788c8ba3 100644
--- a/drivers/vfio/pci/vfio_pci_rdwr.c
+++ b/drivers/vfio/pci/vfio_pci_rdwr.c
@@ -16,6 +16,7 @@
#include <linux/io.h>
#include <linux/vfio.h>
#include <linux/vgaarb.h>
+#include <linux/io-64-nonatomic-lo-hi.h>
#include "vfio_pci_priv.h"
@@ -61,9 +62,7 @@ EXPORT_SYMBOL_GPL(vfio_pci_core_iowrite##size);
VFIO_IOWRITE(8)
VFIO_IOWRITE(16)
VFIO_IOWRITE(32)
-#ifdef iowrite64
VFIO_IOWRITE(64)
-#endif
#define VFIO_IOREAD(size) \
int vfio_pci_core_ioread##size(struct vfio_pci_core_device *vdev, \
@@ -89,9 +88,7 @@ EXPORT_SYMBOL_GPL(vfio_pci_core_ioread##size);
VFIO_IOREAD(8)
VFIO_IOREAD(16)
VFIO_IOREAD(32)
-#ifdef ioread64
VFIO_IOREAD(64)
-#endif
#define VFIO_IORDWR(size) \
static int vfio_pci_iordwr##size(struct vfio_pci_core_device *vdev,\
@@ -127,9 +124,7 @@ static int vfio_pci_iordwr##size(struct vfio_pci_core_device *vdev,\
VFIO_IORDWR(8)
VFIO_IORDWR(16)
VFIO_IORDWR(32)
-#if defined(ioread64) && defined(iowrite64)
VFIO_IORDWR(64)
-#endif
/*
* Read or write from an __iomem region (MMIO or I/O port) with an excluded
@@ -155,7 +150,6 @@ ssize_t vfio_pci_core_do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem,
else
fillable = 0;
-#if defined(ioread64) && defined(iowrite64)
if (fillable >= 8 && !(off % 8)) {
ret = vfio_pci_iordwr64(vdev, iswrite, test_mem,
io, buf, off, &filled);
@@ -163,7 +157,6 @@ ssize_t vfio_pci_core_do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem,
return ret;
} else
-#endif
if (fillable >= 4 && !(off % 4)) {
ret = vfio_pci_iordwr32(vdev, iswrite, test_mem,
io, buf, off, &filled);
@@ -244,9 +237,8 @@ ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf,
if (pci_resource_start(pdev, bar))
end = pci_resource_len(pdev, bar);
- else if (bar == PCI_ROM_RESOURCE &&
- pdev->resource[bar].flags & IORESOURCE_ROM_SHADOW)
- end = 0x20000;
+ else if (bar == PCI_ROM_RESOURCE && pdev->rom && pdev->romlen)
+ end = roundup_pow_of_two(pdev->romlen);
else
return -EINVAL;
@@ -261,11 +253,14 @@ ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf,
* excluded range at the end of the actual ROM. This makes
* filling large ROM BARs much faster.
*/
- io = pci_map_rom(pdev, &x_start);
- if (!io) {
- done = -ENOMEM;
- goto out;
+ if (pci_resource_start(pdev, bar)) {
+ io = pci_map_rom(pdev, &x_start);
+ } else {
+ io = ioremap(pdev->rom, pdev->romlen);
+ x_start = pdev->romlen;
}
+ if (!io)
+ return -ENOMEM;
x_end = end;
} else {
int ret = vfio_pci_core_setup_barmap(vdev, bar);
@@ -288,8 +283,13 @@ ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf,
if (done >= 0)
*ppos += done;
- if (bar == PCI_ROM_RESOURCE)
- pci_unmap_rom(pdev, io);
+ if (bar == PCI_ROM_RESOURCE) {
+ if (pci_resource_start(pdev, bar))
+ pci_unmap_rom(pdev, io);
+ else
+ iounmap(io);
+ }
+
out:
return done;
}
@@ -381,12 +381,10 @@ static void vfio_pci_ioeventfd_do_write(struct vfio_pci_ioeventfd *ioeventfd,
vfio_pci_core_iowrite32(ioeventfd->vdev, test_mem,
ioeventfd->data, ioeventfd->addr);
break;
-#ifdef iowrite64
case 8:
vfio_pci_core_iowrite64(ioeventfd->vdev, test_mem,
ioeventfd->data, ioeventfd->addr);
break;
-#endif
}
}
@@ -440,10 +438,8 @@ int vfio_pci_ioeventfd(struct vfio_pci_core_device *vdev, loff_t offset,
pos >= vdev->msix_offset + vdev->msix_size))
return -EINVAL;
-#ifndef iowrite64
if (count == 8)
return -EINVAL;
-#endif
ret = vfio_pci_core_setup_barmap(vdev, bar);
if (ret)
diff --git a/drivers/vfio/pci/virtio/Kconfig b/drivers/vfio/pci/virtio/Kconfig
index 2770f7eb702c..33e04e65bec6 100644
--- a/drivers/vfio/pci/virtio/Kconfig
+++ b/drivers/vfio/pci/virtio/Kconfig
@@ -1,11 +1,11 @@
# SPDX-License-Identifier: GPL-2.0-only
config VIRTIO_VFIO_PCI
- tristate "VFIO support for VIRTIO NET PCI VF devices"
+ tristate "VFIO support for VIRTIO PCI VF devices"
depends on VIRTIO_PCI
select VFIO_PCI_CORE
help
- This provides migration support for VIRTIO NET PCI VF devices
- using the VFIO framework. Migration support requires the
+ This provides migration support for VIRTIO NET and BLOCK PCI VF
+ devices using the VFIO framework. Migration support requires the
SR-IOV PF device to support specific VIRTIO extensions,
otherwise this driver provides no additional functionality
beyond vfio-pci.
diff --git a/drivers/vfio/pci/virtio/legacy_io.c b/drivers/vfio/pci/virtio/legacy_io.c
index 20382ee15fac..832af5ba267c 100644
--- a/drivers/vfio/pci/virtio/legacy_io.c
+++ b/drivers/vfio/pci/virtio/legacy_io.c
@@ -382,7 +382,9 @@ static bool virtiovf_bar0_exists(struct pci_dev *pdev)
bool virtiovf_support_legacy_io(struct pci_dev *pdev)
{
- return virtio_pci_admin_has_legacy_io(pdev) && !virtiovf_bar0_exists(pdev);
+ /* For now, the legacy IO functionality is supported only for virtio-net */
+ return pdev->device == 0x1041 && virtio_pci_admin_has_legacy_io(pdev) &&
+ !virtiovf_bar0_exists(pdev);
}
int virtiovf_init_legacy_io(struct virtiovf_pci_core_device *virtvdev)
diff --git a/drivers/vfio/pci/virtio/main.c b/drivers/vfio/pci/virtio/main.c
index d534d48c4163..8084f3e36a9f 100644
--- a/drivers/vfio/pci/virtio/main.c
+++ b/drivers/vfio/pci/virtio/main.c
@@ -94,6 +94,7 @@ static const struct vfio_device_ops virtiovf_vfio_pci_lm_ops = {
.mmap = vfio_pci_core_mmap,
.request = vfio_pci_core_request,
.match = vfio_pci_core_match,
+ .match_token_uuid = vfio_pci_core_match_token_uuid,
.bind_iommufd = vfio_iommufd_physical_bind,
.unbind_iommufd = vfio_iommufd_physical_unbind,
.attach_ioas = vfio_iommufd_physical_attach_ioas,
@@ -114,6 +115,7 @@ static const struct vfio_device_ops virtiovf_vfio_pci_tran_lm_ops = {
.mmap = vfio_pci_core_mmap,
.request = vfio_pci_core_request,
.match = vfio_pci_core_match,
+ .match_token_uuid = vfio_pci_core_match_token_uuid,
.bind_iommufd = vfio_iommufd_physical_bind,
.unbind_iommufd = vfio_iommufd_physical_unbind,
.attach_ioas = vfio_iommufd_physical_attach_ioas,
@@ -134,6 +136,7 @@ static const struct vfio_device_ops virtiovf_vfio_pci_ops = {
.mmap = vfio_pci_core_mmap,
.request = vfio_pci_core_request,
.match = vfio_pci_core_match,
+ .match_token_uuid = vfio_pci_core_match_token_uuid,
.bind_iommufd = vfio_iommufd_physical_bind,
.unbind_iommufd = vfio_iommufd_physical_unbind,
.attach_ioas = vfio_iommufd_physical_attach_ioas,
@@ -187,8 +190,9 @@ static void virtiovf_pci_remove(struct pci_dev *pdev)
}
static const struct pci_device_id virtiovf_pci_table[] = {
- /* Only virtio-net is supported/tested so far */
+ /* Only virtio-net and virtio-block are supported/tested so far */
{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_REDHAT_QUMRANET, 0x1041) },
+ { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_REDHAT_QUMRANET, 0x1042) },
{}
};
@@ -221,4 +225,4 @@ module_pci_driver(virtiovf_pci_driver);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Yishai Hadas <yishaih@nvidia.com>");
MODULE_DESCRIPTION(
- "VIRTIO VFIO PCI - User Level meta-driver for VIRTIO NET devices");
+ "VIRTIO VFIO PCI - User Level meta-driver for VIRTIO NET and BLOCK devices");
diff --git a/drivers/vfio/pci/virtio/migrate.c b/drivers/vfio/pci/virtio/migrate.c
index ee54f4c17857..ba92bb4e9af9 100644
--- a/drivers/vfio/pci/virtio/migrate.c
+++ b/drivers/vfio/pci/virtio/migrate.c
@@ -77,8 +77,8 @@ static int virtiovf_add_migration_pages(struct virtiovf_data_buffer *buf,
return -ENOMEM;
do {
- filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT, to_fill,
- page_list);
+ filled = alloc_pages_bulk(GFP_KERNEL_ACCOUNT, to_fill,
+ page_list);
if (!filled) {
ret = -ENOMEM;
goto err;
@@ -112,7 +112,7 @@ static void virtiovf_free_data_buffer(struct virtiovf_data_buffer *buf)
{
struct sg_page_iter sg_iter;
- /* Undo alloc_pages_bulk_array() */
+ /* Undo alloc_pages_bulk() */
for_each_sgtable_page(&buf->table.sgt, &sg_iter, 0)
__free_page(sg_page_iter_page(&sg_iter));
sg_free_append_table(&buf->table);
diff --git a/drivers/vfio/platform/vfio_platform.c b/drivers/vfio/platform/vfio_platform.c
index 42d1462c5e19..512533501eb7 100644
--- a/drivers/vfio/platform/vfio_platform.c
+++ b/drivers/vfio/platform/vfio_platform.c
@@ -112,7 +112,7 @@ static const struct vfio_device_ops vfio_platform_ops = {
static struct platform_driver vfio_platform_driver = {
.probe = vfio_platform_probe,
- .remove_new = vfio_platform_remove,
+ .remove = vfio_platform_remove,
.driver = {
.name = "vfio-platform",
},
diff --git a/drivers/vfio/platform/vfio_platform_common.c b/drivers/vfio/platform/vfio_platform_common.c
index e53757d1d095..3bf1043cd795 100644
--- a/drivers/vfio/platform/vfio_platform_common.c
+++ b/drivers/vfio/platform/vfio_platform_common.c
@@ -388,6 +388,11 @@ static ssize_t vfio_platform_read_mmio(struct vfio_platform_region *reg,
{
unsigned int done = 0;
+ if (off >= reg->size)
+ return -EINVAL;
+
+ count = min_t(size_t, count, reg->size - off);
+
if (!reg->ioaddr) {
reg->ioaddr =
ioremap(reg->addr, reg->size);
@@ -467,6 +472,11 @@ static ssize_t vfio_platform_write_mmio(struct vfio_platform_region *reg,
{
unsigned int done = 0;
+ if (off >= reg->size)
+ return -EINVAL;
+
+ count = min_t(size_t, count, reg->size - off);
+
if (!reg->ioaddr) {
reg->ioaddr =
ioremap(reg->addr, reg->size);
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 50ebc9593c9d..f8d68fe77b41 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -80,7 +80,6 @@ struct vfio_domain {
struct iommu_domain *domain;
struct list_head next;
struct list_head group_list;
- bool fgsp : 1; /* Fine-grained super pages */
bool enforce_cache_coherency : 1;
};
@@ -103,9 +102,9 @@ struct vfio_dma {
struct vfio_batch {
struct page **pages; /* for pin_user_pages_remote */
struct page *fallback_page; /* if pages alloc fails */
- int capacity; /* length of pages array */
- int size; /* of batch currently */
- int offset; /* of next entry in pages */
+ unsigned int capacity; /* length of pages array */
+ unsigned int size; /* of batch currently */
+ unsigned int offset; /* of next entry in pages */
};
struct vfio_iommu_group {
@@ -293,7 +292,7 @@ static int vfio_dma_bitmap_alloc_all(struct vfio_iommu *iommu, size_t pgsize)
struct rb_node *p;
for (p = rb_prev(n); p; p = rb_prev(p)) {
- struct vfio_dma *dma = rb_entry(n,
+ struct vfio_dma *dma = rb_entry(p,
struct vfio_dma, node);
vfio_dma_bitmap_free(dma);
@@ -471,12 +470,12 @@ static int put_pfn(unsigned long pfn, int prot)
#define VFIO_BATCH_MAX_CAPACITY (PAGE_SIZE / sizeof(struct page *))
-static void vfio_batch_init(struct vfio_batch *batch)
+static void __vfio_batch_init(struct vfio_batch *batch, bool single)
{
batch->size = 0;
batch->offset = 0;
- if (unlikely(disable_hugepages))
+ if (single || unlikely(disable_hugepages))
goto fallback;
batch->pages = (struct page **) __get_free_page(GFP_KERNEL);
@@ -491,6 +490,16 @@ fallback:
batch->capacity = 1;
}
+static void vfio_batch_init(struct vfio_batch *batch)
+{
+ __vfio_batch_init(batch, false);
+}
+
+static void vfio_batch_init_single(struct vfio_batch *batch)
+{
+ __vfio_batch_init(batch, true);
+}
+
static void vfio_batch_unpin(struct vfio_batch *batch, struct vfio_dma *dma)
{
while (batch->size) {
@@ -510,7 +519,7 @@ static void vfio_batch_fini(struct vfio_batch *batch)
static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm,
unsigned long vaddr, unsigned long *pfn,
- bool write_fault)
+ unsigned long *addr_mask, bool write_fault)
{
struct follow_pfnmap_args args = { .vma = vma, .address = vaddr };
int ret;
@@ -534,10 +543,12 @@ static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm,
return ret;
}
- if (write_fault && !args.writable)
+ if (write_fault && !args.writable) {
ret = -EFAULT;
- else
+ } else {
*pfn = args.pfn;
+ *addr_mask = args.addr_mask;
+ }
follow_pfnmap_end(&args);
return ret;
@@ -545,25 +556,33 @@ static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm,
/*
* Returns the positive number of pfns successfully obtained or a negative
- * error code.
+ * error code. The initial pfn is stored in the pfn arg. For page-backed
+ * pfns, the provided batch is also updated to indicate the filled pages and
+ * initial offset. For VM_PFNMAP pfns, only the returned number of pfns and
+ * returned initial pfn are provided; subsequent pfns are contiguous.
*/
-static int vaddr_get_pfns(struct mm_struct *mm, unsigned long vaddr,
- long npages, int prot, unsigned long *pfn,
- struct page **pages)
+static long vaddr_get_pfns(struct mm_struct *mm, unsigned long vaddr,
+ unsigned long npages, int prot, unsigned long *pfn,
+ struct vfio_batch *batch)
{
+ unsigned long pin_pages = min_t(unsigned long, npages, batch->capacity);
struct vm_area_struct *vma;
unsigned int flags = 0;
- int ret;
+ long ret;
if (prot & IOMMU_WRITE)
flags |= FOLL_WRITE;
mmap_read_lock(mm);
- ret = pin_user_pages_remote(mm, vaddr, npages, flags | FOLL_LONGTERM,
- pages, NULL);
+ ret = pin_user_pages_remote(mm, vaddr, pin_pages, flags | FOLL_LONGTERM,
+ batch->pages, NULL);
if (ret > 0) {
- *pfn = page_to_pfn(pages[0]);
+ *pfn = page_to_pfn(batch->pages[0]);
+ batch->size = ret;
+ batch->offset = 0;
goto done;
+ } else if (!ret) {
+ ret = -EFAULT;
}
vaddr = untagged_addr_remote(mm, vaddr);
@@ -572,15 +591,22 @@ retry:
vma = vma_lookup(mm, vaddr);
if (vma && vma->vm_flags & VM_PFNMAP) {
- ret = follow_fault_pfn(vma, mm, vaddr, pfn, prot & IOMMU_WRITE);
+ unsigned long addr_mask;
+
+ ret = follow_fault_pfn(vma, mm, vaddr, pfn, &addr_mask,
+ prot & IOMMU_WRITE);
if (ret == -EAGAIN)
goto retry;
if (!ret) {
- if (is_invalid_reserved_pfn(*pfn))
- ret = 1;
- else
+ if (is_invalid_reserved_pfn(*pfn)) {
+ unsigned long epfn;
+
+ epfn = (*pfn | (~addr_mask >> PAGE_SHIFT)) + 1;
+ ret = min_t(long, npages, epfn - *pfn);
+ } else {
ret = -EFAULT;
+ }
}
}
done:
@@ -594,7 +620,7 @@ done:
* first page and all consecutive pages with the same locking.
*/
static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
- long npage, unsigned long *pfn_base,
+ unsigned long npage, unsigned long *pfn_base,
unsigned long limit, struct vfio_batch *batch)
{
unsigned long pfn;
@@ -616,32 +642,49 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
*pfn_base = 0;
}
+ if (unlikely(disable_hugepages))
+ npage = 1;
+
while (npage) {
if (!batch->size) {
- /* Empty batch, so refill it. */
- long req_pages = min_t(long, npage, batch->capacity);
+ /*
+ * Large mappings may take a while to repeatedly refill
+ * the batch, so conditionally relinquish the CPU when
+ * needed to avoid stalls.
+ */
+ cond_resched();
- ret = vaddr_get_pfns(mm, vaddr, req_pages, dma->prot,
- &pfn, batch->pages);
+ /* Empty batch, so refill it. */
+ ret = vaddr_get_pfns(mm, vaddr, npage, dma->prot,
+ &pfn, batch);
if (ret < 0)
goto unpin_out;
- batch->size = ret;
- batch->offset = 0;
-
if (!*pfn_base) {
*pfn_base = pfn;
rsvd = is_invalid_reserved_pfn(*pfn_base);
}
+
+ /* Handle pfnmap */
+ if (!batch->size) {
+ if (pfn != *pfn_base + pinned || !rsvd)
+ goto out;
+
+ pinned += ret;
+ npage -= ret;
+ vaddr += (PAGE_SIZE * ret);
+ iova += (PAGE_SIZE * ret);
+ continue;
+ }
}
/*
- * pfn is preset for the first iteration of this inner loop and
- * updated at the end to handle a VM_PFNMAP pfn. In that case,
- * batch->pages isn't valid (there's no struct page), so allow
- * batch->pages to be touched only when there's more than one
- * pfn to check, which guarantees the pfns are from a
- * !VM_PFNMAP vma.
+ * pfn is preset for the first iteration of this inner loop
+ * due to the fact that vaddr_get_pfns() needs to provide the
+ * initial pfn for pfnmaps. Therefore to reduce redundancy,
+ * the next pfn is fetched at the end of the loop.
+ * A PageReserved() page could still qualify as page backed
+ * and rsvd here, and therefore continues to use the batch.
*/
while (true) {
if (pfn != *pfn_base + pinned ||
@@ -676,21 +719,12 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
pfn = page_to_pfn(batch->pages[batch->offset]);
}
-
- if (unlikely(disable_hugepages))
- break;
}
out:
ret = vfio_lock_acct(dma, lock_acct, false);
unpin_out:
- if (batch->size == 1 && !batch->offset) {
- /* May be a VM_PFNMAP pfn, which the batch can't remember. */
- put_pfn(pfn, dma->prot);
- batch->size = 0;
- }
-
if (ret < 0) {
if (pinned && !rsvd) {
for (pfn = *pfn_base ; pinned ; pfn++, pinned--)
@@ -705,7 +739,7 @@ unpin_out:
}
static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova,
- unsigned long pfn, long npage,
+ unsigned long pfn, unsigned long npage,
bool do_accounting)
{
long unlocked = 0, locked = 0;
@@ -728,7 +762,7 @@ static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova,
static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
unsigned long *pfn_base, bool do_accounting)
{
- struct page *pages[1];
+ struct vfio_batch batch;
struct mm_struct *mm;
int ret;
@@ -736,7 +770,9 @@ static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
if (!mmget_not_zero(mm))
return -ENODEV;
- ret = vaddr_get_pfns(mm, vaddr, 1, dma->prot, pfn_base, pages);
+ vfio_batch_init_single(&batch);
+
+ ret = vaddr_get_pfns(mm, vaddr, 1, dma->prot, pfn_base, &batch);
if (ret != 1)
goto out;
@@ -755,6 +791,7 @@ static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
}
out:
+ vfio_batch_fini(&batch);
mmput(mm);
return ret;
}
@@ -1064,8 +1101,7 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
* may require hardware cache flushing, try to find the
* largest contiguous physical memory chunk to unmap.
*/
- for (len = PAGE_SIZE;
- !domain->fgsp && iova + len < end; len += PAGE_SIZE) {
+ for (len = PAGE_SIZE; iova + len < end; len += PAGE_SIZE) {
next = iommu_iova_to_phys(domain->domain, iova + len);
if (next != phys + len)
break;
@@ -1802,49 +1838,6 @@ unwind:
return ret;
}
-/*
- * We change our unmap behavior slightly depending on whether the IOMMU
- * supports fine-grained superpages. IOMMUs like AMD-Vi will use a superpage
- * for practically any contiguous power-of-two mapping we give it. This means
- * we don't need to look for contiguous chunks ourselves to make unmapping
- * more efficient. On IOMMUs with coarse-grained super pages, like Intel VT-d
- * with discrete 2M/1G/512G/1T superpages, identifying contiguous chunks
- * significantly boosts non-hugetlbfs mappings and doesn't seem to hurt when
- * hugetlbfs is in use.
- */
-static void vfio_test_domain_fgsp(struct vfio_domain *domain, struct list_head *regions)
-{
- int ret, order = get_order(PAGE_SIZE * 2);
- struct vfio_iova *region;
- struct page *pages;
- dma_addr_t start;
-
- pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, order);
- if (!pages)
- return;
-
- list_for_each_entry(region, regions, list) {
- start = ALIGN(region->start, PAGE_SIZE * 2);
- if (start >= region->end || (region->end - start < PAGE_SIZE * 2))
- continue;
-
- ret = iommu_map(domain->domain, start, page_to_phys(pages), PAGE_SIZE * 2,
- IOMMU_READ | IOMMU_WRITE | IOMMU_CACHE,
- GFP_KERNEL_ACCOUNT);
- if (!ret) {
- size_t unmapped = iommu_unmap(domain->domain, start, PAGE_SIZE);
-
- if (unmapped == PAGE_SIZE)
- iommu_unmap(domain->domain, start + PAGE_SIZE, PAGE_SIZE);
- else
- domain->fgsp = true;
- }
- break;
- }
-
- __free_pages(pages, order);
-}
-
static struct vfio_iommu_group *find_iommu_group(struct vfio_domain *domain,
struct iommu_group *iommu_group)
{
@@ -2283,8 +2276,6 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
}
}
- vfio_test_domain_fgsp(domain, &iova_copy);
-
/* replay mappings on new domains */
ret = vfio_iommu_replay(iommu, domain);
if (ret)
diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
index a5a62d9d963f..5046cae05222 100644
--- a/drivers/vfio/vfio_main.c
+++ b/drivers/vfio/vfio_main.c
@@ -583,7 +583,8 @@ void vfio_df_close(struct vfio_device_file *df)
lockdep_assert_held(&device->dev_set->lock);
- vfio_assert_device_open(device);
+ if (!vfio_assert_device_open(device))
+ return;
if (device->open_count == 1)
vfio_df_device_last_close(df);
device->open_count--;
@@ -1751,7 +1752,7 @@ static void __exit vfio_cleanup(void)
module_init(vfio_init);
module_exit(vfio_cleanup);
-MODULE_IMPORT_NS(IOMMUFD);
+MODULE_IMPORT_NS("IOMMUFD");
MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);