diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2026-06-17 21:42:17 +0300 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2026-06-17 21:42:17 +0300 |
| commit | 3dc0df03396a3329c644b29b421892a32ecb9387 (patch) | |
| tree | 5d5d60bb9b4e5923d740046b0e2d7f67bbca694d | |
| parent | 407ce27e7417646b5476d71166657ca7eac189ec (diff) | |
| parent | 785562e31dbcd85ca583cf58c446e63aa8a5af08 (diff) | |
| download | linux-3dc0df03396a3329c644b29b421892a32ecb9387.tar.xz | |
Merge tag 'vfio-v7.2-rc1' of https://github.com/awilliam/linux-vfio
Pull VFIO updates from Alex Williamson:
- Fix out-of-tree vfio selftest builds with make O= (Jason Gunthorpe)
- Allow vfio selftests to build when ARCH=x86 is used for 64-bit x86
builds (David Matlack)
- Tighten vfio selftest infrastructure with stricter builds, safer path
handling, sysfs helpers, and reusable device/VF-token setup. Build on
that to add the SR-IOV UAPI selftest across supported IOMMU modes
(Raghavendra Rao Ananta)
- Conclude earlier vfio PCI BAR work already taken as v7.1 fixes by
replacing vfio_pci_core_setup_barmap() and direct barmap[] access
with vfio_pci_core_get_iomap(). Fix resulting sparse warnings (Matt
Evans)
- Simplify hisi_acc vfio-pci variant driver device-info reads by using
the mailbox's new direct command-based read helper (Weili Qian)
- Avoid duplicate reset handling in the Xe vfio-pci variant driver
reset-done path (GuoHan Zhao)
- Resolve a lockdep circular dependency splat by tracking active VFs
with a private sriov_active flag rather than calling pci_num_vf()
under memory_lock (Raghavendra Rao Ananta)
- Add CXL DVSEC-based readiness polling for Blackwell-Next in the
nvgrace-gpu vfio-pci variant driver, including interruptible,
lockless waits to support worst-case, spec-defined timeouts (Ankit
Agrawal)
- Prevent vfio_mig_get_next_state() from spinning forever on a blocked
migration state transition (Junrui Luo)
- Fix a qat vfio variant driver migration resume race by taking the
migration file lock before boundary checks (Giovanni Cabiddu)
- Add explicit dependencies between vfio selftest output object files
and output directories to ensure directories are always created
(David Matlack)
* tag 'vfio-v7.2-rc1' of https://github.com/awilliam/linux-vfio:
vfio: selftests: Ensure libvfio output dirs are always created
vfio/qat: fix f_pos race in qat_vf_resume_write()
vfio: prevent infinite loop in vfio_mig_get_next_state() on blocked arc
vfio/nvgrace-gpu: Add Blackwell-Next GPU readiness check via CXL DVSEC
vfio/pci: Use a private flag to prevent power state change with VFs
vfio/pci: Fix sparse warning in vfio_pci_core_get_iomap()
vfio/xe: avoid duplicate reset in xe_vfio_pci_reset_done
hisi_acc_vfio_pci: simplify the command for reading device information
vfio/pci: Replace vfio_pci_core_setup_barmap() with vfio_pci_core_get_iomap()
vfio: selftests: Add tests to validate SR-IOV UAPI
vfio: selftests: Add helpers to alloc/free vfio_pci_device
vfio: selftests: Add helper to set/override a vf_token
vfio: selftests: Expose more vfio_pci_device functions
vfio: selftests: Extend container/iommufd setup for passing vf_token
vfio: selftests: Introduce a sysfs lib
vfio: selftests: Introduce snprintf_assert()
vfio: selftests: Add -Wall and -Werror to the Makefile
vfio: selftests: Allow builds when ARCH=x86
vfio: selftests: Fix out-of-tree build with make O=
22 files changed, 775 insertions, 158 deletions
diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index bb121f635b9f..86362ec424a5 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -81,13 +81,10 @@ static int qm_get_vft(struct hisi_qm *qm, u32 *base) u32 qp_num; int ret; - ret = hisi_qm_mb(qm, QM_MB_CMD_SQC_VFT_V2, 0, 0, 1); + ret = hisi_qm_mb_read(qm, &sqc_vft, QM_MB_CMD_SQC_VFT_V2, 0); if (ret) return ret; - sqc_vft = readl(qm->io_base + QM_MB_CMD_DATA_ADDR_L) | - ((u64)readl(qm->io_base + QM_MB_CMD_DATA_ADDR_H) << - QM_XQC_ADDR_OFFSET); *base = QM_SQC_VFT_BASE_MASK_V2 & (sqc_vft >> QM_SQC_VFT_BASE_SHIFT_V2); qp_num = (QM_SQC_VFT_NUM_MASK_V2 & (sqc_vft >> QM_SQC_VFT_NUM_SHIFT_V2)) + 1; @@ -95,36 +92,6 @@ static int qm_get_vft(struct hisi_qm *qm, u32 *base) return qp_num; } -static int qm_get_sqc(struct hisi_qm *qm, u64 *addr) -{ - int ret; - - ret = hisi_qm_mb(qm, QM_MB_CMD_SQC_BT, 0, 0, 1); - if (ret) - return ret; - - *addr = readl(qm->io_base + QM_MB_CMD_DATA_ADDR_L) | - ((u64)readl(qm->io_base + QM_MB_CMD_DATA_ADDR_H) << - QM_XQC_ADDR_OFFSET); - - return 0; -} - -static int qm_get_cqc(struct hisi_qm *qm, u64 *addr) -{ - int ret; - - ret = hisi_qm_mb(qm, QM_MB_CMD_CQC_BT, 0, 0, 1); - if (ret) - return ret; - - *addr = readl(qm->io_base + QM_MB_CMD_DATA_ADDR_L) | - ((u64)readl(qm->io_base + QM_MB_CMD_DATA_ADDR_H) << - QM_XQC_ADDR_OFFSET); - - return 0; -} - static void qm_xqc_reg_offsets(struct hisi_qm *qm, u32 *eqc_addr, u32 *aeqc_addr) { @@ -575,13 +542,13 @@ static int vf_qm_read_data(struct hisi_qm *vf_qm, struct acc_vf_data *vf_data) vf_data->aeqe_dma |= vf_data->qm_aeqc_dw[QM_XQC_ADDR_LOW]; /* Through SQC_BT/CQC_BT to get sqc and cqc address */ - ret = qm_get_sqc(vf_qm, &vf_data->sqc_dma); + ret = hisi_qm_mb_read(vf_qm, &vf_data->sqc_dma, QM_MB_CMD_SQC_BT, 0); if (ret) { dev_err(dev, "failed to read SQC addr!\n"); return ret; } - ret = qm_get_cqc(vf_qm, &vf_data->cqc_dma); + ret 
= hisi_qm_mb_read(vf_qm, &vf_data->cqc_dma, QM_MB_CMD_CQC_BT, 0); if (ret) { dev_err(dev, "failed to read CQC addr!\n"); return ret; diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c index fa056b69f899..d07dcacb76bd 100644 --- a/drivers/vfio/pci/nvgrace-gpu/main.c +++ b/drivers/vfio/pci/nvgrace-gpu/main.c @@ -3,10 +3,13 @@ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved */ +#include <linux/bitfield.h> #include <linux/sizes.h> +#include <linux/time64.h> #include <linux/vfio_pci_core.h> #include <linux/delay.h> #include <linux/jiffies.h> +#include <linux/sched.h> #include <linux/pci-p2pdma.h> #include <linux/pm_runtime.h> #include <linux/memory-failure.h> @@ -61,9 +64,12 @@ struct nvgrace_gpu_pci_core_device { struct mem_region resmem; /* Lock to control device memory kernel mapping */ struct mutex remap_lock; + void __iomem *bar0_base; bool has_mig_hw_bug; /* GPU has just been reset */ bool reset_done; + /* CXL Device DVSEC offset; 0 if not present (legacy GB path) */ + int cxl_dvsec; }; static void nvgrace_gpu_init_fake_bar_emu_regs(struct vfio_device *core_vdev) @@ -171,6 +177,7 @@ static int nvgrace_gpu_open_device(struct vfio_device *core_vdev) struct nvgrace_gpu_pci_core_device *nvdev = container_of(core_vdev, struct nvgrace_gpu_pci_core_device, core_device.vdev); + void __iomem *io; int ret; ret = vfio_pci_core_enable(vdev); @@ -184,14 +191,14 @@ static int nvgrace_gpu_open_device(struct vfio_device *core_vdev) /* * GPU readiness is checked by reading the BAR0 registers. - * - * ioremap BAR0 to ensure that the BAR0 mapping is present before - * register reads on first fault before establishing any GPU - * memory mapping. 
+ * The BAR map was just set up by vfio_pci_core_enable(), so + * bail early if that wasn't successful: */ - ret = vfio_pci_core_setup_barmap(vdev, 0); - if (ret) + io = vfio_pci_core_get_iomap(vdev, 0); + if (IS_ERR(io)) { + ret = PTR_ERR(io); goto error_exit; + } if (nvdev->resmem.memlength) { ret = nvgrace_gpu_vfio_pci_register_pfn_range(core_vdev, &nvdev->resmem); @@ -204,6 +211,8 @@ static int nvgrace_gpu_open_device(struct vfio_device *core_vdev) goto register_mem_failed; vfio_pci_core_finish_enable(vdev); + nvdev->bar0_base = io; + return 0; register_mem_failed: @@ -220,6 +229,8 @@ static void nvgrace_gpu_close_device(struct vfio_device *core_vdev) container_of(core_vdev, struct nvgrace_gpu_pci_core_device, core_device.vdev); + nvdev->bar0_base = NULL; + if (nvdev->resmem.memlength) unregister_pfn_address_space(&nvdev->resmem.pfn_address_space); @@ -242,7 +253,7 @@ static void nvgrace_gpu_close_device(struct vfio_device *core_vdev) vfio_pci_core_close_device(core_vdev); } -static int nvgrace_gpu_wait_device_ready(void __iomem *io) +static int nvgrace_gpu_wait_device_ready_legacy(void __iomem *io) { unsigned long timeout = jiffies + msecs_to_jiffies(POLL_TIMEOUT_MS); @@ -250,16 +261,97 @@ static int nvgrace_gpu_wait_device_ready(void __iomem *io) if ((ioread32(io + C2C_LINK_BAR0_OFFSET) == STATUS_READY) && (ioread32(io + HBM_TRAINING_BAR0_OFFSET) == STATUS_READY)) return 0; - msleep(POLL_QUANTUM_MS); + if (schedule_timeout_killable(msecs_to_jiffies(POLL_QUANTUM_MS))) + return -EINTR; } while (!time_after(jiffies, timeout)); return -ETIME; } /* + * Decode the 3-bit Memory_Active_Timeout field from CXL DVSEC Range 1 Low + * (bits 15:13) into milliseconds. Encoding per CXL spec r4.0 sec 8.1.3.8.2: + * 000b = 1s, 001b = 4s, 010b = 16s, 011b = 64s, 100b = 256s, + * 101b-111b = reserved (clamped to 256s). 
+ */ +static inline unsigned long cxl_mem_active_timeout_ms(u8 timeout) +{ + return MSEC_PER_SEC << (2 * min_t(u8, timeout, 4)); +} + +/* + * Check if CXL DVSEC reports memory as valid and active. + */ +static inline bool cxl_dvsec_mem_is_active(u32 status) +{ + return (status & PCI_DVSEC_CXL_MEM_INFO_VALID) && + (status & PCI_DVSEC_CXL_MEM_ACTIVE); +} + +static int nvgrace_gpu_test_device_ready_cxl(struct nvgrace_gpu_pci_core_device *nvdev, + u32 *status) +{ + struct pci_dev *pdev = nvdev->core_device.pdev; + int cxl_dvsec = nvdev->cxl_dvsec; + u32 val; + + pci_read_config_dword(pdev, + cxl_dvsec + PCI_DVSEC_CXL_RANGE_SIZE_LOW(0), + &val); + + if (val == ~0U) + return -ENODEV; + + if (status) + *status = val; + + if (cxl_dvsec_mem_is_active(val)) + return 0; + + return -EAGAIN; +} + +/* + * As per CXL spec r4.0 sec 8.1.3.8.2, MEM_INFO_VALID needs to be set + * within 1s and MEM_ACTIVE within Memory_Active_Timeout (up to ~256s) + * after reset and bootup. + */ +static int nvgrace_gpu_wait_device_ready_cxl(struct nvgrace_gpu_pci_core_device *nvdev) +{ + unsigned long deadline = jiffies + msecs_to_jiffies(POLL_QUANTUM_MS); + bool active_phase = false; + u32 status; + int ret; + + for (;;) { + ret = nvgrace_gpu_test_device_ready_cxl(nvdev, &status); + if (ret != -EAGAIN) + return ret; + + if (!active_phase && (status & PCI_DVSEC_CXL_MEM_INFO_VALID)) { + u8 t = FIELD_GET(PCI_DVSEC_CXL_MEM_ACTIVE_TIMEOUT, status); + + deadline = jiffies + + msecs_to_jiffies(cxl_mem_active_timeout_ms(t)); + active_phase = true; + } + + if (time_after(jiffies, deadline)) + return -ETIME; + + if (schedule_timeout_killable(msecs_to_jiffies(POLL_QUANTUM_MS))) + return -EINTR; + } +} + +/* * If the GPU memory is accessed by the CPU while the GPU is not ready * after reset, it can cause harmless corrected RAS events to be logged. * Make sure the GPU is ready before establishing the mappings. + * + * Since the CXL polling wait could take 256s, it happens outside + * memory_lock. 
Only do quick readiness check under the lock. Legacy + * keeps the in-lock poll. */ static int nvgrace_gpu_check_device_ready(struct nvgrace_gpu_pci_core_device *nvdev) @@ -275,7 +367,10 @@ nvgrace_gpu_check_device_ready(struct nvgrace_gpu_pci_core_device *nvdev) if (!__vfio_pci_memory_enabled(vdev)) return -EIO; - ret = nvgrace_gpu_wait_device_ready(vdev->barmap[0]); + if (nvdev->cxl_dvsec) + ret = nvgrace_gpu_test_device_ready_cxl(nvdev, NULL); + else + ret = nvgrace_gpu_wait_device_ready_legacy(nvdev->bar0_base); if (ret) return ret; @@ -313,9 +408,33 @@ static vm_fault_t nvgrace_gpu_vfio_pci_huge_fault(struct vm_fault *vmf, pfn = PHYS_PFN(memregion->memphys) + addr_to_pgoff(vma, addr); if (is_aligned_for_order(vma, addr, pfn, order)) { + /* + * Exit early under memory_lock to avoid a potentially lengthy + * device readiness wait on a runtime-suspended device. Any + * race after the lock is dropped is benign as the re-check + * inside the scoped guard below catches it. + */ scoped_guard(rwsem_read, &vdev->memory_lock) { - if (vdev->pm_runtime_engaged || - nvgrace_gpu_check_device_ready(nvdev)) + if (vdev->pm_runtime_engaged) + return VM_FAULT_SIGBUS; + } + +retry: + if (nvdev->cxl_dvsec && READ_ONCE(nvdev->reset_done) && + nvgrace_gpu_wait_device_ready_cxl(nvdev)) + return VM_FAULT_SIGBUS; + + scoped_guard(rwsem_read, &vdev->memory_lock) { + int rc; + + if (vdev->pm_runtime_engaged) + return VM_FAULT_SIGBUS; + + /* Re-run the wait if a reset raced us, not SIGBUS. 
*/ + rc = nvgrace_gpu_check_device_ready(nvdev); + if (rc == -EAGAIN) + goto retry; + if (rc) return VM_FAULT_SIGBUS; ret = vfio_pci_vmf_insert_pfn(vdev, vmf, pfn, order); @@ -712,6 +831,12 @@ nvgrace_gpu_read_mem(struct nvgrace_gpu_pci_core_device *nvdev, else mem_count = min(count, memregion->memlength - (size_t)offset); + if (nvdev->cxl_dvsec && READ_ONCE(nvdev->reset_done)) { + ret = nvgrace_gpu_wait_device_ready_cxl(nvdev); + if (ret) + return ret; + } + scoped_guard(rwsem_read, &vdev->memory_lock) { ret = nvgrace_gpu_check_device_ready(nvdev); if (ret) @@ -846,6 +971,12 @@ nvgrace_gpu_write_mem(struct nvgrace_gpu_pci_core_device *nvdev, */ mem_count = min(count, memregion->memlength - (size_t)offset); + if (nvdev->cxl_dvsec && READ_ONCE(nvdev->reset_done)) { + ret = nvgrace_gpu_wait_device_ready_cxl(nvdev); + if (ret) + return ret; + } + scoped_guard(rwsem_read, &vdev->memory_lock) { ret = nvgrace_gpu_check_device_ready(nvdev); if (ret) @@ -1143,14 +1274,24 @@ static bool nvgrace_gpu_has_mig_hw_bug(struct pci_dev *pdev) * is beneficial to make the check to ensure the device is in an * expected state. * - * Ensure that the BAR0 region is enabled before accessing the + * On Blackwell-Next systems, memory readiness is determined via the + * CXL Device DVSEC in PCI config space and does not require BAR0. + * For the legacy path, ensure BAR0 is enabled before accessing the * registers. */ -static int nvgrace_gpu_probe_check_device_ready(struct pci_dev *pdev) +static int nvgrace_gpu_probe_check_device_ready(struct nvgrace_gpu_pci_core_device *nvdev) { + struct pci_dev *pdev = nvdev->core_device.pdev; void __iomem *io; int ret; + /* + * Note that the worst-case wait here is ~256s (vs ~30s on the + * legacy path) and may block device unbind/sysfs for the duration. 
+ */ + if (nvdev->cxl_dvsec) + return nvgrace_gpu_wait_device_ready_cxl(nvdev); + ret = pci_enable_device(pdev); if (ret) return ret; @@ -1165,7 +1306,7 @@ static int nvgrace_gpu_probe_check_device_ready(struct pci_dev *pdev) goto iomap_exit; } - ret = nvgrace_gpu_wait_device_ready(io); + ret = nvgrace_gpu_wait_device_ready_legacy(io); pci_iounmap(pdev, io); iomap_exit: @@ -1183,10 +1324,6 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev, u64 memphys, memlength; int ret; - ret = nvgrace_gpu_probe_check_device_ready(pdev); - if (ret) - return ret; - ret = nvgrace_gpu_fetch_memory_property(pdev, &memphys, &memlength); if (!ret) ops = &nvgrace_gpu_pci_ops; @@ -1196,6 +1333,13 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev, if (IS_ERR(nvdev)) return PTR_ERR(nvdev); + nvdev->cxl_dvsec = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_CXL, + PCI_DVSEC_CXL_DEVICE); + + ret = nvgrace_gpu_probe_check_device_ready(nvdev); + if (ret) + goto out_put_vdev; + dev_set_drvdata(&pdev->dev, &nvdev->core_device); if (ops == &nvgrace_gpu_pci_ops) { diff --git a/drivers/vfio/pci/qat/main.c b/drivers/vfio/pci/qat/main.c index ac9652539d66..60ff907b6a67 100644 --- a/drivers/vfio/pci/qat/main.c +++ b/drivers/vfio/pci/qat/main.c @@ -298,14 +298,18 @@ static ssize_t qat_vf_resume_write(struct file *filp, const char __user *buf, return -ESPIPE; offs = &filp->f_pos; - if (*offs < 0 || - check_add_overflow(len, *offs, &end)) - return -EOVERFLOW; + mutex_lock(&migf->lock); - if (end > mig_dev->state_size) - return -ENOMEM; + if (*offs < 0 || check_add_overflow(len, *offs, &end)) { + done = -EOVERFLOW; + goto out_unlock; + } + + if (end > mig_dev->state_size) { + done = -ENOMEM; + goto out_unlock; + } - mutex_lock(&migf->lock); if (migf->disabled) { done = -ENODEV; goto out_unlock; diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 050e7542952e..a28f1e99362c 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -271,8 
+271,11 @@ int vfio_pci_set_power_state(struct vfio_pci_core_device *vdev, pci_power_t stat int ret; /* Prevent changing power state for PFs with VFs enabled */ - if (pci_num_vf(pdev) && state > PCI_D0) - return -EBUSY; + if (state > PCI_D0) { + lockdep_assert_held_write(&vdev->memory_lock); + if (vdev->sriov_active) + return -EBUSY; + } if (vdev->needs_pm_restore) { if (pdev->current_state < PCI_D3hot && state >= PCI_D3hot) { @@ -1762,7 +1765,7 @@ int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma struct pci_dev *pdev = vdev->pdev; unsigned int index; u64 phys_len, req_len, pgoff, req_start; - int ret; + void __iomem *bar_io; index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT); @@ -1796,12 +1799,11 @@ int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma return -EINVAL; /* - * Even though we don't make use of the barmap for the mmap, - * we need to request the region and the barmap tracks that. + * Ensure the BAR resource region is reserved for use. */ - ret = vfio_pci_core_setup_barmap(vdev, index); - if (ret) - return ret; + bar_io = vfio_pci_core_get_iomap(vdev, index); + if (IS_ERR(bar_io)) + return PTR_ERR(bar_io); vma->vm_private_data = vdev; vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); @@ -2327,8 +2329,9 @@ int vfio_pci_core_sriov_configure(struct vfio_pci_core_device *vdev, down_write(&vdev->memory_lock); vfio_pci_set_power_state(vdev, PCI_D0); - ret = pci_enable_sriov(pdev, nr_virtfn); + vdev->sriov_active = true; up_write(&vdev->memory_lock); + ret = pci_enable_sriov(pdev, nr_virtfn); if (ret) { pm_runtime_put(&pdev->dev); goto out_del; @@ -2342,6 +2345,13 @@ int vfio_pci_core_sriov_configure(struct vfio_pci_core_device *vdev, } out_del: + /* + * Avoid taking the memory_lock intentionally. A race with a power + * state transition would at most result in an -EBUSY, leaving the + * device in PCI_D0. 
+ */ + vdev->sriov_active = false; + mutex_lock(&vfio_pci_sriov_pfs_mutex); list_del_init(&vdev->sriov_pfs_item); out_unlock: diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c index 1a177ce7de54..c16f460c01d6 100644 --- a/drivers/vfio/pci/vfio_pci_dmabuf.c +++ b/drivers/vfio/pci/vfio_pci_dmabuf.c @@ -248,7 +248,7 @@ int vfio_pci_core_feature_dma_buf(struct vfio_pci_core_device *vdev, u32 flags, * else. Check that PCI resources have been claimed for it. */ if (get_dma_buf.region_index >= VFIO_PCI_ROM_REGION_INDEX || - vfio_pci_core_setup_barmap(vdev, get_dma_buf.region_index)) + IS_ERR(vfio_pci_core_get_iomap(vdev, get_dma_buf.region_index))) return -ENODEV; dma_ranges = memdup_array_user(&arg->dma_ranges, get_dma_buf.nr_ranges, diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c index 3bfbb879a005..7f14dd46de17 100644 --- a/drivers/vfio/pci/vfio_pci_rdwr.c +++ b/drivers/vfio/pci/vfio_pci_rdwr.c @@ -198,19 +198,6 @@ ssize_t vfio_pci_core_do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem, } EXPORT_SYMBOL_GPL(vfio_pci_core_do_io_rw); -/* - * The barmap is set up in vfio_pci_core_enable(). Callers use this - * function to check that the BAR resources are requested or that the - * pci_iomap() was done. 
- */ -int vfio_pci_core_setup_barmap(struct vfio_pci_core_device *vdev, int bar) -{ - if (IS_ERR(vdev->barmap[bar])) - return PTR_ERR(vdev->barmap[bar]); - return 0; -} -EXPORT_SYMBOL_GPL(vfio_pci_core_setup_barmap); - ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf, size_t count, loff_t *ppos, bool iswrite) { @@ -262,13 +249,11 @@ ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf, */ max_width = VFIO_PCI_IO_WIDTH_4; } else { - int ret = vfio_pci_core_setup_barmap(vdev, bar); - if (ret) { - done = ret; + io = vfio_pci_core_get_iomap(vdev, bar); + if (IS_ERR(io)) { + done = PTR_ERR(io); goto out; } - - io = vdev->barmap[bar]; } if (bar == vdev->msix_bar) { @@ -423,6 +408,7 @@ int vfio_pci_ioeventfd(struct vfio_pci_core_device *vdev, loff_t offset, loff_t pos = offset & VFIO_PCI_OFFSET_MASK; int ret, bar = VFIO_PCI_OFFSET_TO_INDEX(offset); struct vfio_pci_ioeventfd *ioeventfd; + void __iomem *io; /* Only support ioeventfds into BARs */ if (bar > VFIO_PCI_BAR5_REGION_INDEX) @@ -440,9 +426,9 @@ int vfio_pci_ioeventfd(struct vfio_pci_core_device *vdev, loff_t offset, if (count == 8) return -EINVAL; - ret = vfio_pci_core_setup_barmap(vdev, bar); - if (ret) - return ret; + io = vfio_pci_core_get_iomap(vdev, bar); + if (IS_ERR(io)) + return PTR_ERR(io); mutex_lock(&vdev->ioeventfds_lock); @@ -479,7 +465,7 @@ int vfio_pci_ioeventfd(struct vfio_pci_core_device *vdev, loff_t offset, } ioeventfd->vdev = vdev; - ioeventfd->addr = vdev->barmap[bar] + pos; + ioeventfd->addr = io + pos; ioeventfd->data = data; ioeventfd->pos = pos; ioeventfd->bar = bar; diff --git a/drivers/vfio/pci/virtio/legacy_io.c b/drivers/vfio/pci/virtio/legacy_io.c index f022301e60d6..74ff302edc9f 100644 --- a/drivers/vfio/pci/virtio/legacy_io.c +++ b/drivers/vfio/pci/virtio/legacy_io.c @@ -298,19 +298,18 @@ int virtiovf_pci_ioctl_get_region_info(struct vfio_device *core_vdev, static int virtiovf_set_notify_addr(struct virtiovf_pci_core_device *virtvdev) 
{ struct vfio_pci_core_device *core_device = &virtvdev->core_device; - int ret; + void __iomem *io; /* * Setup the BAR where the 'notify' exists to be used by vfio as well * This will let us mmap it only once and use it when needed. */ - ret = vfio_pci_core_setup_barmap(core_device, - virtvdev->notify_bar); - if (ret) - return ret; + io = vfio_pci_core_get_iomap(core_device, + virtvdev->notify_bar); + if (IS_ERR(io)) + return PTR_ERR(io); - virtvdev->notify_addr = core_device->barmap[virtvdev->notify_bar] + - virtvdev->notify_offset; + virtvdev->notify_addr = io + virtvdev->notify_offset; return 0; } diff --git a/drivers/vfio/pci/xe/main.c b/drivers/vfio/pci/xe/main.c index 4ecadbbfd86e..cbff5af385ef 100644 --- a/drivers/vfio/pci/xe/main.c +++ b/drivers/vfio/pci/xe/main.c @@ -135,8 +135,6 @@ static void xe_vfio_pci_reset_done(struct pci_dev *pdev) } spin_unlock(&xe_vdev->reset_lock); xe_vfio_pci_state_mutex_unlock(xe_vdev); - - xe_vfio_pci_reset(xe_vdev); } static const struct pci_error_handlers xe_vfio_pci_err_handlers = { diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 6222376ab6ab..5e0422014523 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -858,7 +858,8 @@ int vfio_mig_get_next_state(struct vfio_device *device, * logical state, as per the above comment. 
*/ *next_fsm = vfio_from_fsm_table[cur_fsm][new_fsm]; - while ((state_flags_table[*next_fsm] & device->migration_flags) != + while (*next_fsm != VFIO_DEVICE_STATE_ERROR && + (state_flags_table[*next_fsm] & device->migration_flags) != state_flags_table[*next_fsm]) *next_fsm = vfio_from_fsm_table[*next_fsm][new_fsm]; diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index 89165b769e5c..5fc6ce4dd786 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -127,6 +127,7 @@ struct vfio_pci_core_device { bool needs_pm_restore:1; bool pm_intx_masked:1; bool pm_runtime_engaged:1; + bool sriov_active; struct pci_saved_state *pci_saved_state; struct pci_saved_state *pm_save; int ioeventfds_nr; @@ -188,7 +189,6 @@ int vfio_pci_core_match_token_uuid(struct vfio_device *core_vdev, int vfio_pci_core_enable(struct vfio_pci_core_device *vdev); void vfio_pci_core_disable(struct vfio_pci_core_device *vdev); void vfio_pci_core_finish_enable(struct vfio_pci_core_device *vdev); -int vfio_pci_core_setup_barmap(struct vfio_pci_core_device *vdev, int bar); pci_ers_result_t vfio_pci_core_aer_err_detected(struct pci_dev *pdev, pci_channel_state_t state); ssize_t vfio_pci_core_do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem, @@ -234,6 +234,25 @@ static inline bool is_aligned_for_order(struct vm_area_struct *vma, !IS_ALIGNED(pfn, 1 << order))); } +/* + * Returns a BAR's iomap base or an ERR_PTR() if, for example, the + * BAR isn't valid, its resource wasn't acquired, or its iomap + * failed. This shall only be used after vfio_pci_core_enable() + * has set up the BAR maps and before vfio_pci_core_disable() + * tears them down. 
+ */ +static inline void __iomem __must_check * +vfio_pci_core_get_iomap(struct vfio_pci_core_device *vdev, unsigned int bar) +{ + if (WARN_ON_ONCE(bar >= PCI_STD_NUM_BARS)) + return IOMEM_ERR_PTR(-EINVAL); + + if (WARN_ON_ONCE(!vdev->barmap[bar])) + return IOMEM_ERR_PTR(-ENODEV); + + return vdev->barmap[bar]; +} + int vfio_pci_dma_buf_iommufd_map(struct dma_buf_attachment *attachment, struct phys_vec *phys); diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index 14f634ab9350..718fb630f5bb 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -1357,6 +1357,7 @@ #define PCI_DVSEC_CXL_RANGE_SIZE_LOW(i) (0x1C + (i * 0x10)) #define PCI_DVSEC_CXL_MEM_INFO_VALID _BITUL(0) #define PCI_DVSEC_CXL_MEM_ACTIVE _BITUL(1) +#define PCI_DVSEC_CXL_MEM_ACTIVE_TIMEOUT __GENMASK(15, 13) #define PCI_DVSEC_CXL_MEM_SIZE_LOW __GENMASK(31, 28) #define PCI_DVSEC_CXL_RANGE_BASE_HIGH(i) (0x20 + (i * 0x10)) #define PCI_DVSEC_CXL_RANGE_BASE_LOW(i) (0x24 + (i * 0x10)) diff --git a/tools/testing/selftests/vfio/Makefile b/tools/testing/selftests/vfio/Makefile index 0684932d91bf..e6e8cb52ab03 100644 --- a/tools/testing/selftests/vfio/Makefile +++ b/tools/testing/selftests/vfio/Makefile @@ -1,6 +1,6 @@ ARCH ?= $(shell uname -m) -ifeq (,$(filter $(ARCH),aarch64 arm64 x86_64)) +ifeq (,$(filter $(ARCH),aarch64 arm64 x86 x86_64)) # Do nothing on unsupported architectures include ../lib.mk else @@ -12,6 +12,7 @@ TEST_GEN_PROGS += vfio_iommufd_setup_test TEST_GEN_PROGS += vfio_pci_device_test TEST_GEN_PROGS += vfio_pci_device_init_perf_test TEST_GEN_PROGS += vfio_pci_driver_test +TEST_GEN_PROGS += vfio_pci_sriov_uapi_test TEST_FILES += scripts/cleanup.sh TEST_FILES += scripts/lib.sh @@ -23,14 +24,20 @@ include lib/libvfio.mk CFLAGS += -I$(top_srcdir)/tools/include CFLAGS += -MD +CFLAGS += -Wall -Werror CFLAGS += $(EXTRA_CFLAGS) LDFLAGS += -pthread -$(TEST_GEN_PROGS): %: %.o $(LIBVFIO_O) +LDLIBS += -luuid + +$(TEST_GEN_PROGS): $(OUTPUT)/%: $(OUTPUT)/%.o 
$(LIBVFIO_O) $(CC) $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) $< $(LIBVFIO_O) $(LDLIBS) -o $@ TEST_GEN_PROGS_O = $(patsubst %, %.o, $(TEST_GEN_PROGS)) +$(TEST_GEN_PROGS_O): $(OUTPUT)/%.o: %.c + $(CC) $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c $< -o $@ + TEST_DEP_FILES = $(patsubst %.o, %.d, $(TEST_GEN_PROGS_O) $(LIBVFIO_O)) -include $(TEST_DEP_FILES) diff --git a/tools/testing/selftests/vfio/lib/include/libvfio.h b/tools/testing/selftests/vfio/lib/include/libvfio.h index 1b6da54cc2cb..07862b470777 100644 --- a/tools/testing/selftests/vfio/lib/include/libvfio.h +++ b/tools/testing/selftests/vfio/lib/include/libvfio.h @@ -5,6 +5,7 @@ #include <libvfio/assert.h> #include <libvfio/iommu.h> #include <libvfio/iova_allocator.h> +#include <libvfio/sysfs.h> #include <libvfio/vfio_pci_device.h> #include <libvfio/vfio_pci_driver.h> diff --git a/tools/testing/selftests/vfio/lib/include/libvfio/assert.h b/tools/testing/selftests/vfio/lib/include/libvfio/assert.h index f4ebd122d9b6..77b68c7129a6 100644 --- a/tools/testing/selftests/vfio/lib/include/libvfio/assert.h +++ b/tools/testing/selftests/vfio/lib/include/libvfio/assert.h @@ -51,4 +51,9 @@ VFIO_ASSERT_EQ(__ret, 0, "ioctl(%s, %s, %s) returned %d\n", #_fd, #_op, #_arg, __ret); \ } while (0) +#define snprintf_assert(_s, _size, _fmt, ...) 
do { \ + int __ret = snprintf(_s, _size, _fmt, ##__VA_ARGS__); \ + VFIO_ASSERT_LT(__ret, _size); \ +} while (0) + #endif /* SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_ASSERT_H */ diff --git a/tools/testing/selftests/vfio/lib/include/libvfio/sysfs.h b/tools/testing/selftests/vfio/lib/include/libvfio/sysfs.h new file mode 100644 index 000000000000..c9ab1ea8f5a9 --- /dev/null +++ b/tools/testing/selftests/vfio/lib/include/libvfio/sysfs.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_SYSFS_H +#define SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_SYSFS_H + +int sysfs_sriov_totalvfs_get(const char *bdf); +int sysfs_sriov_numvfs_get(const char *bdf); +void sysfs_sriov_numvfs_set(const char *bdf, int numvfs); +char *sysfs_sriov_vf_bdf_get(const char *pf_bdf, int i); +int sysfs_iommu_group_get(const char *bdf); +char *sysfs_driver_get(const char *bdf); + +#endif /* SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_SYSFS_H */ diff --git a/tools/testing/selftests/vfio/lib/include/libvfio/vfio_pci_device.h b/tools/testing/selftests/vfio/lib/include/libvfio/vfio_pci_device.h index 2858885a89bb..3eabead717bb 100644 --- a/tools/testing/selftests/vfio/lib/include/libvfio/vfio_pci_device.h +++ b/tools/testing/selftests/vfio/lib/include/libvfio/vfio_pci_device.h @@ -38,6 +38,8 @@ struct vfio_pci_device { #define dev_info(_dev, _fmt, ...) printf("%s: " _fmt, (_dev)->bdf, ##__VA_ARGS__) #define dev_err(_dev, _fmt, ...) 
fprintf(stderr, "%s: " _fmt, (_dev)->bdf, ##__VA_ARGS__) +struct vfio_pci_device *vfio_pci_device_alloc(const char *bdf, struct iommu *iommu); +void vfio_pci_device_free(struct vfio_pci_device *device); struct vfio_pci_device *vfio_pci_device_init(const char *bdf, struct iommu *iommu); void vfio_pci_device_cleanup(struct vfio_pci_device *device); @@ -122,4 +124,13 @@ static inline bool vfio_pci_device_match(struct vfio_pci_device *device, const char *vfio_pci_get_cdev_path(const char *bdf); +void vfio_pci_group_setup(struct vfio_pci_device *device, const char *bdf); +void __vfio_pci_group_get_device_fd(struct vfio_pci_device *device, + const char *bdf, const char *vf_token); +void vfio_container_set_iommu(struct vfio_pci_device *device); +void vfio_pci_cdev_open(struct vfio_pci_device *device, const char *bdf); +int __vfio_device_bind_iommufd(int device_fd, int iommufd, const char *vf_token); + +void vfio_device_set_vf_token(int fd, const char *vf_token); + #endif /* SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_VFIO_PCI_DEVICE_H */ diff --git a/tools/testing/selftests/vfio/lib/libvfio.mk b/tools/testing/selftests/vfio/lib/libvfio.mk index 9f47bceed16f..2b8d73b7d329 100644 --- a/tools/testing/selftests/vfio/lib/libvfio.mk +++ b/tools/testing/selftests/vfio/lib/libvfio.mk @@ -6,6 +6,7 @@ LIBVFIO_SRCDIR := $(selfdir)/vfio/lib LIBVFIO_C := iommu.c LIBVFIO_C += iova_allocator.c LIBVFIO_C += libvfio.c +LIBVFIO_C += sysfs.c LIBVFIO_C += vfio_pci_device.c LIBVFIO_C += vfio_pci_driver.c @@ -19,11 +20,13 @@ LIBVFIO_OUTPUT := $(OUTPUT)/libvfio LIBVFIO_O := $(patsubst %.c, $(LIBVFIO_OUTPUT)/%.o, $(LIBVFIO_C)) LIBVFIO_O_DIRS := $(shell dirname $(LIBVFIO_O) | uniq) -$(shell mkdir -p $(LIBVFIO_O_DIRS)) + +$(LIBVFIO_O_DIRS): + mkdir -p $@ CFLAGS += -I$(LIBVFIO_SRCDIR)/include -$(LIBVFIO_O): $(LIBVFIO_OUTPUT)/%.o : $(LIBVFIO_SRCDIR)/%.c +$(LIBVFIO_O): $(LIBVFIO_OUTPUT)/%.o : $(LIBVFIO_SRCDIR)/%.c | $(LIBVFIO_O_DIRS) $(CC) $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c $< -o $@ EXTRA_CLEAN += 
$(LIBVFIO_OUTPUT) diff --git a/tools/testing/selftests/vfio/lib/sysfs.c b/tools/testing/selftests/vfio/lib/sysfs.c new file mode 100644 index 000000000000..11415448b2e2 --- /dev/null +++ b/tools/testing/selftests/vfio/lib/sysfs.c @@ -0,0 +1,150 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <fcntl.h> +#include <unistd.h> +#include <stdlib.h> +#include <string.h> +#include <linux/limits.h> + +#include <libvfio.h> + +#define readlink_safe(_path, _buf) ({ \ + int __ret; \ + \ + _Static_assert(!__builtin_types_compatible_p( \ + __typeof__(_buf), char *), \ + "readlink_safe: _buf must be an array, not a pointer"); \ + \ + __ret = readlink(_path, _buf, sizeof(_buf) - 1); \ + if (__ret != -1) \ + _buf[__ret] = 0; \ + __ret; \ +}) + +static void readlink_base(const char *path, const char *data_fmt, void *out_data) +{ + char rl_path[PATH_MAX]; + int ret; + + ret = readlink_safe(path, rl_path); + VFIO_ASSERT_NE(ret, -1); + + ret = sscanf(basename(rl_path), data_fmt, out_data); + VFIO_ASSERT_EQ(ret, 1); +} + +static int sysfs_val_get_int(const char *component, const char *name, + const char *file) +{ + char path[PATH_MAX]; + char buf[32]; + int ret; + int fd; + + snprintf_assert(path, PATH_MAX, "/sys/bus/pci/%s/%s/%s", component, name, file); + fd = open(path, O_RDONLY); + if (fd < 0) + return fd; + + VFIO_ASSERT_GT(read(fd, buf, ARRAY_SIZE(buf)), 0); + VFIO_ASSERT_EQ(close(fd), 0); + + errno = 0; + ret = strtol(buf, NULL, 0); + VFIO_ASSERT_EQ(errno, 0, "sysfs path \"%s\" is not an integer: \"%s\"\n", path, buf); + + return ret; +} + +static void sysfs_val_set(const char *component, const char *name, + const char *file, const char *val) +{ + char path[PATH_MAX]; + int fd; + + snprintf_assert(path, PATH_MAX, "/sys/bus/pci/%s/%s/%s", component, name, file); + VFIO_ASSERT_GT(fd = open(path, O_WRONLY), 0); + + VFIO_ASSERT_EQ(write(fd, val, strlen(val)), strlen(val)); + VFIO_ASSERT_EQ(close(fd), 0); +} + +static int sysfs_device_val_get(const char *bdf, const char *file) 
+{ + return sysfs_val_get_int("devices", bdf, file); +} + +static void sysfs_device_val_set(const char *bdf, const char *file, const char *val) +{ + sysfs_val_set("devices", bdf, file, val); +} + +static void sysfs_device_val_set_int(const char *bdf, const char *file, int val) +{ + char val_str[32]; + + snprintf_assert(val_str, sizeof(val_str), "%d", val); + sysfs_device_val_set(bdf, file, val_str); +} + +int sysfs_sriov_totalvfs_get(const char *bdf) +{ + return sysfs_device_val_get(bdf, "sriov_totalvfs"); +} + +int sysfs_sriov_numvfs_get(const char *bdf) +{ + return sysfs_device_val_get(bdf, "sriov_numvfs"); +} + +void sysfs_sriov_numvfs_set(const char *bdf, int numvfs) +{ + sysfs_device_val_set_int(bdf, "sriov_numvfs", numvfs); +} + +char *sysfs_sriov_vf_bdf_get(const char *pf_bdf, int i) +{ + char path[PATH_MAX]; + char *out_vf_bdf; + + /* Fit "0000:00:00.0" */ + out_vf_bdf = calloc(16, sizeof(char)); + VFIO_ASSERT_NOT_NULL(out_vf_bdf); + + snprintf_assert(path, PATH_MAX, "/sys/bus/pci/devices/%s/virtfn%d", pf_bdf, i); + readlink_base(path, "%s", out_vf_bdf); + + return out_vf_bdf; +} + +int sysfs_iommu_group_get(const char *bdf) +{ + char path[PATH_MAX]; + int group; + + snprintf_assert(path, PATH_MAX, "/sys/bus/pci/devices/%s/iommu_group", bdf); + readlink_base(path, "%d", &group); + + return group; +} + +char *sysfs_driver_get(const char *bdf) +{ + char driver_path[PATH_MAX]; + char path[PATH_MAX]; + char *out_driver; + int ret; + + snprintf_assert(path, PATH_MAX, "/sys/bus/pci/devices/%s/driver", bdf); + ret = readlink_safe(path, driver_path); + if (ret == -1) { + if (errno == ENOENT) + return NULL; + + VFIO_FAIL("Failed to read %s\n", path); + } + + out_driver = strdup(basename(driver_path)); + VFIO_ASSERT_NOT_NULL(out_driver); + + return out_driver; +} diff --git a/tools/testing/selftests/vfio/lib/vfio_pci_device.c b/tools/testing/selftests/vfio/lib/vfio_pci_device.c index fc75e04ef010..94dc5fcecbeb 100644 --- 
a/tools/testing/selftests/vfio/lib/vfio_pci_device.c +++ b/tools/testing/selftests/vfio/lib/vfio_pci_device.c @@ -22,11 +22,11 @@ #include <linux/types.h> #include <linux/vfio.h> +#include <uuid/uuid.h> + #include "kselftest.h" #include <libvfio.h> -#define PCI_SYSFS_PATH "/sys/bus/pci/devices" - static void vfio_pci_irq_set(struct vfio_pci_device *device, u32 index, u32 vector, u32 count, int *fds) { @@ -115,6 +115,40 @@ static void vfio_pci_irq_get(struct vfio_pci_device *device, u32 index, ioctl_assert(device->fd, VFIO_DEVICE_GET_IRQ_INFO, irq_info); } +static int vfio_device_feature_ioctl(int fd, u32 flags, void *data, + size_t data_size) +{ + u8 buffer[sizeof(struct vfio_device_feature) + data_size] = {}; + struct vfio_device_feature *feature = (void *)buffer; + + memcpy(feature->data, data, data_size); + + feature->argsz = sizeof(buffer); + feature->flags = flags; + + return ioctl(fd, VFIO_DEVICE_FEATURE, feature); +} + +static void vfio_device_feature_set(int fd, u16 feature, void *data, size_t data_size) +{ + u32 flags = VFIO_DEVICE_FEATURE_SET | feature; + int ret; + + ret = vfio_device_feature_ioctl(fd, flags, data, data_size); + VFIO_ASSERT_EQ(ret, 0, "Failed to set feature %u\n", feature); +} + +void vfio_device_set_vf_token(int fd, const char *vf_token) +{ + uuid_t token_uuid = {0}; + + VFIO_ASSERT_NOT_NULL(vf_token, "vf_token is NULL"); + VFIO_ASSERT_EQ(uuid_parse(vf_token, token_uuid), 0); + + vfio_device_feature_set(fd, VFIO_DEVICE_FEATURE_PCI_VF_TOKEN, + token_uuid, sizeof(uuid_t)); +} + static void vfio_pci_region_get(struct vfio_pci_device *device, int index, struct vfio_region_info *info) { @@ -204,25 +238,7 @@ void vfio_pci_device_reset(struct vfio_pci_device *device) ioctl_assert(device->fd, VFIO_DEVICE_RESET, NULL); } -static unsigned int vfio_pci_get_group_from_dev(const char *bdf) -{ - char dev_iommu_group_path[PATH_MAX] = {0}; - char sysfs_path[PATH_MAX] = {0}; - unsigned int group; - int ret; - - snprintf(sysfs_path, PATH_MAX, 
"%s/%s/iommu_group", PCI_SYSFS_PATH, bdf); - - ret = readlink(sysfs_path, dev_iommu_group_path, sizeof(dev_iommu_group_path)); - VFIO_ASSERT_NE(ret, -1, "Failed to get the IOMMU group for device: %s\n", bdf); - - ret = sscanf(basename(dev_iommu_group_path), "%u", &group); - VFIO_ASSERT_EQ(ret, 1, "Failed to get the IOMMU group for device: %s\n", bdf); - - return group; -} - -static void vfio_pci_group_setup(struct vfio_pci_device *device, const char *bdf) +void vfio_pci_group_setup(struct vfio_pci_device *device, const char *bdf) { struct vfio_group_status group_status = { .argsz = sizeof(group_status), @@ -230,8 +246,8 @@ static void vfio_pci_group_setup(struct vfio_pci_device *device, const char *bdf char group_path[32]; int group; - group = vfio_pci_get_group_from_dev(bdf); - snprintf(group_path, sizeof(group_path), "/dev/vfio/%d", group); + group = sysfs_iommu_group_get(bdf); + snprintf_assert(group_path, sizeof(group_path), "/dev/vfio/%d", group); device->group_fd = open(group_path, O_RDWR); VFIO_ASSERT_GE(device->group_fd, 0, "open(%s) failed\n", group_path); @@ -242,14 +258,37 @@ static void vfio_pci_group_setup(struct vfio_pci_device *device, const char *bdf ioctl_assert(device->group_fd, VFIO_GROUP_SET_CONTAINER, &device->iommu->container_fd); } -static void vfio_pci_container_setup(struct vfio_pci_device *device, const char *bdf) +void __vfio_pci_group_get_device_fd(struct vfio_pci_device *device, + const char *bdf, const char *vf_token) +{ + char arg[64]; + + /* + * If a vf_token exists, argument to VFIO_GROUP_GET_DEVICE_FD + * will be in the form of the following example: + * "0000:04:10.0 vf_token=bd8d9d2b-5a5f-4f5a-a211-f591514ba1f3" + */ + if (vf_token) + snprintf_assert(arg, ARRAY_SIZE(arg), "%s vf_token=%s", bdf, vf_token); + else + snprintf_assert(arg, ARRAY_SIZE(arg), "%s", bdf); + + device->fd = ioctl(device->group_fd, VFIO_GROUP_GET_DEVICE_FD, arg); +} + +static void vfio_pci_group_get_device_fd(struct vfio_pci_device *device, + const char 
*bdf, const char *vf_token) +{ + __vfio_pci_group_get_device_fd(device, bdf, vf_token); + VFIO_ASSERT_GE(device->fd, 0); +} + +void vfio_container_set_iommu(struct vfio_pci_device *device) { struct iommu *iommu = device->iommu; unsigned long iommu_type = iommu->mode->iommu_type; int ret; - vfio_pci_group_setup(device, bdf); - ret = ioctl(iommu->container_fd, VFIO_CHECK_EXTENSION, iommu_type); VFIO_ASSERT_GT(ret, 0, "VFIO IOMMU type %lu not supported\n", iommu_type); @@ -259,9 +298,14 @@ static void vfio_pci_container_setup(struct vfio_pci_device *device, const char * because the IOMMU type is already set. */ (void)ioctl(iommu->container_fd, VFIO_SET_IOMMU, (void *)iommu_type); +} - device->fd = ioctl(device->group_fd, VFIO_GROUP_GET_DEVICE_FD, bdf); - VFIO_ASSERT_GE(device->fd, 0); +static void vfio_pci_container_setup(struct vfio_pci_device *device, + const char *bdf, const char *vf_token) +{ + vfio_pci_group_setup(device, bdf); + vfio_container_set_iommu(device); + vfio_pci_group_get_device_fd(device, bdf, vf_token); } static void vfio_pci_device_setup(struct vfio_pci_device *device) @@ -302,7 +346,7 @@ const char *vfio_pci_get_cdev_path(const char *bdf) cdev_path = calloc(PATH_MAX, 1); VFIO_ASSERT_NOT_NULL(cdev_path); - snprintf(dir_path, sizeof(dir_path), "/sys/bus/pci/devices/%s/vfio-dev/", bdf); + snprintf_assert(dir_path, sizeof(dir_path), "/sys/bus/pci/devices/%s/vfio-dev/", bdf); dir = opendir(dir_path); VFIO_ASSERT_NOT_NULL(dir, "Failed to open directory %s\n", dir_path); @@ -312,7 +356,7 @@ const char *vfio_pci_get_cdev_path(const char *bdf) if (strncmp("vfio", entry->d_name, 4)) continue; - snprintf(cdev_path, PATH_MAX, "/dev/vfio/devices/%s", entry->d_name); + snprintf_assert(cdev_path, PATH_MAX, "/dev/vfio/devices/%s", entry->d_name); break; } @@ -322,14 +366,32 @@ const char *vfio_pci_get_cdev_path(const char *bdf) return cdev_path; } -static void vfio_device_bind_iommufd(int device_fd, int iommufd) +int __vfio_device_bind_iommufd(int device_fd, int 
iommufd, const char *vf_token) { struct vfio_device_bind_iommufd args = { .argsz = sizeof(args), .iommufd = iommufd, }; + uuid_t token_uuid; + + if (vf_token) { + VFIO_ASSERT_EQ(uuid_parse(vf_token, token_uuid), 0); + args.flags |= VFIO_DEVICE_BIND_FLAG_TOKEN; + args.token_uuid_ptr = (u64)token_uuid; + } + + if (ioctl(device_fd, VFIO_DEVICE_BIND_IOMMUFD, &args)) + return -errno; + + return 0; +} + +static void vfio_device_bind_iommufd(int device_fd, int iommufd, + const char *vf_token) +{ + int ret = __vfio_device_bind_iommufd(device_fd, iommufd, vf_token); - ioctl_assert(device_fd, VFIO_DEVICE_BIND_IOMMUFD, &args); + VFIO_ASSERT_EQ(ret, 0, "Failed VFIO_DEVICE_BIND_IOMMUFD ioctl\n"); } static void vfio_device_attach_iommufd_pt(int device_fd, u32 pt_id) @@ -342,19 +404,24 @@ static void vfio_device_attach_iommufd_pt(int device_fd, u32 pt_id) ioctl_assert(device_fd, VFIO_DEVICE_ATTACH_IOMMUFD_PT, &args); } -static void vfio_pci_iommufd_setup(struct vfio_pci_device *device, const char *bdf) +void vfio_pci_cdev_open(struct vfio_pci_device *device, const char *bdf) { const char *cdev_path = vfio_pci_get_cdev_path(bdf); device->fd = open(cdev_path, O_RDWR); VFIO_ASSERT_GE(device->fd, 0); free((void *)cdev_path); +} - vfio_device_bind_iommufd(device->fd, device->iommu->iommufd); +static void vfio_pci_iommufd_setup(struct vfio_pci_device *device, + const char *bdf, const char *vf_token) +{ + vfio_pci_cdev_open(device, bdf); + vfio_device_bind_iommufd(device->fd, device->iommu->iommufd, vf_token); vfio_device_attach_iommufd_pt(device->fd, device->iommu->ioas_id); } -struct vfio_pci_device *vfio_pci_device_init(const char *bdf, struct iommu *iommu) +struct vfio_pci_device *vfio_pci_device_alloc(const char *bdf, struct iommu *iommu) { struct vfio_pci_device *device; @@ -365,10 +432,24 @@ struct vfio_pci_device *vfio_pci_device_init(const char *bdf, struct iommu *iomm device->iommu = iommu; device->bdf = bdf; + return device; +} + +void vfio_pci_device_free(struct 
vfio_pci_device *device) +{ + free(device); +} + +struct vfio_pci_device *vfio_pci_device_init(const char *bdf, struct iommu *iommu) +{ + struct vfio_pci_device *device; + + device = vfio_pci_device_alloc(bdf, iommu); + if (iommu->mode->container_path) - vfio_pci_container_setup(device, bdf); + vfio_pci_container_setup(device, bdf, NULL); else - vfio_pci_iommufd_setup(device, bdf); + vfio_pci_iommufd_setup(device, bdf, NULL); vfio_pci_device_setup(device); vfio_pci_driver_probe(device); @@ -397,5 +478,5 @@ void vfio_pci_device_cleanup(struct vfio_pci_device *device) if (device->group_fd) VFIO_ASSERT_EQ(close(device->group_fd), 0); - free(device); + vfio_pci_device_free(device); } diff --git a/tools/testing/selftests/vfio/vfio_dma_mapping_test.c b/tools/testing/selftests/vfio/vfio_dma_mapping_test.c index abb170bdcef7..7d0de8c79de1 100644 --- a/tools/testing/selftests/vfio/vfio_dma_mapping_test.c +++ b/tools/testing/selftests/vfio/vfio_dma_mapping_test.c @@ -44,9 +44,9 @@ static int intel_iommu_mapping_get(const char *bdf, u64 iova, FILE *file; char *rest; - snprintf(iommu_mapping_path, sizeof(iommu_mapping_path), - "/sys/kernel/debug/iommu/intel/%s/domain_translation_struct", - bdf); + snprintf_assert(iommu_mapping_path, sizeof(iommu_mapping_path), + "/sys/kernel/debug/iommu/intel/%s/domain_translation_struct", + bdf); printf("Searching for IOVA 0x%lx in %s\n", iova, iommu_mapping_path); diff --git a/tools/testing/selftests/vfio/vfio_pci_device_test.c b/tools/testing/selftests/vfio/vfio_pci_device_test.c index 7c0fe8ce3a61..93c11fd5e081 100644 --- a/tools/testing/selftests/vfio/vfio_pci_device_test.c +++ b/tools/testing/selftests/vfio/vfio_pci_device_test.c @@ -39,16 +39,17 @@ FIXTURE_TEARDOWN(vfio_pci_device_test) iommu_cleanup(self->iommu); } -#define read_pci_id_from_sysfs(_file) ({ \ - char __sysfs_path[PATH_MAX]; \ - char __buf[32]; \ - int __fd; \ - \ - snprintf(__sysfs_path, PATH_MAX, "/sys/bus/pci/devices/%s/%s", device_bdf, _file); \ - ASSERT_GT((__fd = 
open(__sysfs_path, O_RDONLY)), 0); \ - ASSERT_GT(read(__fd, __buf, ARRAY_SIZE(__buf)), 0); \ - ASSERT_EQ(0, close(__fd)); \ - (u16)strtoul(__buf, NULL, 0); \ +#define read_pci_id_from_sysfs(_file) ({ \ + char __sysfs_path[PATH_MAX]; \ + char __buf[32]; \ + int __fd; \ + \ + snprintf_assert(__sysfs_path, PATH_MAX, "/sys/bus/pci/devices/%s/%s", \ + device_bdf, _file); \ + ASSERT_GT((__fd = open(__sysfs_path, O_RDONLY)), 0); \ + ASSERT_GT(read(__fd, __buf, ARRAY_SIZE(__buf)), 0); \ + ASSERT_EQ(0, close(__fd)); \ + (u16)strtoul(__buf, NULL, 0); \ }) TEST_F(vfio_pci_device_test, config_space_read_write) diff --git a/tools/testing/selftests/vfio/vfio_pci_sriov_uapi_test.c b/tools/testing/selftests/vfio/vfio_pci_sriov_uapi_test.c new file mode 100644 index 000000000000..19d657d00b75 --- /dev/null +++ b/tools/testing/selftests/vfio/vfio_pci_sriov_uapi_test.c @@ -0,0 +1,217 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include "lib/include/libvfio/assert.h" +#include <fcntl.h> +#include <unistd.h> +#include <stdlib.h> +#include <sys/ioctl.h> +#include <linux/limits.h> + +#include <libvfio.h> + +#include "../kselftest_harness.h" + +#define UUID_1 "52ac9bff-3a88-4fbd-901a-0d767c3b6c97" +#define UUID_2 "88594674-90a0-47a9-aea8-9d9b352ac08a" + +static const char *pf_bdf; +static char *vf_bdf; + +static pid_t main_pid; + +static int container_setup(struct vfio_pci_device *device, const char *bdf, + const char *vf_token) +{ + vfio_pci_group_setup(device, bdf); + vfio_container_set_iommu(device); + __vfio_pci_group_get_device_fd(device, bdf, vf_token); + + /* The device fd will be -1 in case of mismatched tokens */ + return (device->fd < 0); +} + +static int iommufd_setup(struct vfio_pci_device *device, const char *bdf, + const char *vf_token) +{ + vfio_pci_cdev_open(device, bdf); + return __vfio_device_bind_iommufd(device->fd, + device->iommu->iommufd, vf_token); +} + +static int device_init(const char *bdf, struct iommu *iommu, + const char *vf_token, struct vfio_pci_device 
**out_dev) +{ + struct vfio_pci_device *device = vfio_pci_device_alloc(bdf, iommu); + int ret; + + if (iommu->mode->container_path) + ret = container_setup(device, bdf, vf_token); + else + ret = iommufd_setup(device, bdf, vf_token); + + *out_dev = device; + return ret; +} + +static void device_cleanup(struct vfio_pci_device *device) +{ + if (!device) + return; + + if (device->fd > 0) + VFIO_ASSERT_EQ(close(device->fd), 0); + + if (device->group_fd) + VFIO_ASSERT_EQ(close(device->group_fd), 0); + + vfio_pci_device_free(device); +} + +FIXTURE(vfio_pci_sriov_uapi_test) { + struct vfio_pci_device *pf; + struct vfio_pci_device *vf; + struct iommu *iommu; + char *pf_token; +}; + +FIXTURE_VARIANT(vfio_pci_sriov_uapi_test) { + const char *iommu_mode; + char *vf_token; +}; + +#define FIXTURE_VARIANT_ADD_IOMMU_MODE(_iommu_mode, _name, _vf_token) \ +FIXTURE_VARIANT_ADD(vfio_pci_sriov_uapi_test, _iommu_mode ## _ ## _name) { \ + .iommu_mode = #_iommu_mode, \ + .vf_token = (_vf_token), \ +} + +FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES(same_uuid, UUID_1); +FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES(diff_uuid, UUID_2); +FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES(null_uuid, NULL); + +FIXTURE_SETUP(vfio_pci_sriov_uapi_test) +{ + self->iommu = iommu_init(variant->iommu_mode); + + self->pf_token = UUID_1; + ASSERT_EQ(device_init(pf_bdf, self->iommu, self->pf_token, &self->pf), 0); +} + +FIXTURE_TEARDOWN(vfio_pci_sriov_uapi_test) +{ + device_cleanup(self->vf); + device_cleanup(self->pf); + iommu_cleanup(self->iommu); +} + +/* + * This asserts if the VF device is successfully created if its token matches + * with the token used to create/override the PF or fails during a mismatch. + */ +#define ASSERT_COND_VF_CREATION(_ret) do { \ + if (!variant->vf_token || strcmp(self->pf_token, variant->vf_token)) { \ + ASSERT_NE((_ret), 0); \ + } else { \ + ASSERT_EQ((_ret), 0); \ + } \ +} while (0) + +/* + * Validate if the UAPI handles correctly and incorrectly set token on the VF. 
+ */ +TEST_F(vfio_pci_sriov_uapi_test, init_token_match) +{ + int ret; + + ret = device_init(vf_bdf, self->iommu, variant->vf_token, &self->vf); + ASSERT_COND_VF_CREATION(ret); +} + +/* + * After closing the PF, validate if the VF access still needs the right token. + */ +TEST_F(vfio_pci_sriov_uapi_test, pf_early_close) +{ + int ret; + + device_cleanup(self->pf); + + /* Clean the 'pf' to avoid calling device_cleanup() again. */ + self->pf = NULL; + + ret = device_init(vf_bdf, self->iommu, variant->vf_token, &self->vf); + ASSERT_COND_VF_CREATION(ret); +} + +/* + * After PF device init, override the existing token and validate if the newly + * set token is the one that's active. + */ +TEST_F(vfio_pci_sriov_uapi_test, override_token) +{ + int ret; + + self->pf_token = UUID_2; + vfio_device_set_vf_token(self->pf->fd, self->pf_token); + + ret = device_init(vf_bdf, self->iommu, variant->vf_token, &self->vf); + ASSERT_COND_VF_CREATION(ret); +} + +static void vf_teardown(void) +{ + /* + * The child processes, created by TEST_F()s, inherits this atexit() + * handler. Hence, check and destroy the VF only when the main/parent + * process exits. + */ + if (getpid() != main_pid) + return; + + free(vf_bdf); + sysfs_sriov_numvfs_set(pf_bdf, 0); +} + +static void vf_setup(void) +{ + char *vf_driver; + int nr_vfs; + + nr_vfs = sysfs_sriov_totalvfs_get(pf_bdf); + if (nr_vfs <= 0) + ksft_exit_skip("SR-IOV may not be supported by the PF: %s\n", pf_bdf); + + nr_vfs = sysfs_sriov_numvfs_get(pf_bdf); + if (nr_vfs != 0) + ksft_exit_skip("SR-IOV already configured for the PF: %s\n", pf_bdf); + + /* Create only one VF for testing */ + sysfs_sriov_numvfs_set(pf_bdf, 1); + + /* + * Setup an exit handler to destroy the VF in case of failures + * during further setup at the end of the test run. + */ + main_pid = getpid(); + VFIO_ASSERT_EQ(atexit(vf_teardown), 0); + + vf_bdf = sysfs_sriov_vf_bdf_get(pf_bdf, 0); + + /* + * The VF inherits the driver from the PF. 
+ * Ensure this is 'vfio-pci' before proceeding. + */ + vf_driver = sysfs_driver_get(vf_bdf); + VFIO_ASSERT_NE(vf_driver, NULL); + VFIO_ASSERT_EQ(strcmp(vf_driver, "vfio-pci"), 0); + free(vf_driver); + + printf("Created 1 VF (%s) under the PF: %s\n", vf_bdf, pf_bdf); +} + +int main(int argc, char *argv[]) +{ + pf_bdf = vfio_selftests_get_bdf(&argc, argv); + vf_setup(); + + return test_harness_run(argc, argv); +} |
