From 7141790b5dd47b229e550148789365a3b2192dd7 Mon Sep 17 00:00:00 2001 From: Angus Chen Date: Sat, 7 Jan 2023 11:47:20 +0800 Subject: vfio: platform: No need to check res again In vfio_platform_regions_init(), the while loop that counts the resources already guarantees that res is non-NULL for every index, so there is no need to check res for NULL again. No functional change intended. Signed-off-by: Angus Chen Reviewed-by: Eric Auger Link: https://lore.kernel.org/r/20230107034721.2127-1-angus.chen@jaguarmicro.com Signed-off-by: Alex Williamson --- drivers/vfio/platform/vfio_platform_common.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/platform/vfio_platform_common.c b/drivers/vfio/platform/vfio_platform_common.c index 1a0a238ffa35..a9ad3f4d2613 100644 --- a/drivers/vfio/platform/vfio_platform_common.c +++ b/drivers/vfio/platform/vfio_platform_common.c @@ -150,9 +150,6 @@ static int vfio_platform_regions_init(struct vfio_platform_device *vdev) struct resource *res = vdev->get_resource(vdev, i); - if (!res) - goto err; - vdev->regions[i].addr = res->start; vdev->regions[i].size = resource_size(res); vdev->regions[i].flags = 0; -- cgit v1.2.3 From 8bf8c5ee1f3863d944c1d8c29335f0c790b4f851 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 10 Jan 2023 10:10:07 +0100 Subject: vfio-mdev: turn VFIO_MDEV into a selectable symbol VFIO_MDEV is just a library with helpers for the drivers. Stop making it a user-visible choice and instead have the drivers that use the helpers select it. Signed-off-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Reviewed-by: Tony Krowiak Link: https://lore.kernel.org/r/20230110091009.474427-3-hch@lst.de Signed-off-by: Alex Williamson --- Documentation/s390/vfio-ap.rst | 1 - arch/s390/Kconfig | 6 ++++-- arch/s390/configs/debug_defconfig | 1 - arch/s390/configs/defconfig | 1 - drivers/gpu/drm/i915/Kconfig | 2 +- drivers/vfio/mdev/Kconfig | 8 +------- samples/Kconfig | 6 +++--- 7 files changed, 9 insertions(+), 16 deletions(-) (limited to 'drivers/vfio') diff --git a/Documentation/s390/vfio-ap.rst b/Documentation/s390/vfio-ap.rst index 00f4a04f6d4c..d46e98c7c1ec 100644 --- a/Documentation/s390/vfio-ap.rst +++ b/Documentation/s390/vfio-ap.rst @@ -553,7 +553,6 @@ These are the steps: * ZCRYPT * S390_AP_IOMMU * VFIO - * VFIO_MDEV * KVM If using make menuconfig select the following to build the vfio_ap module:: diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 7fd08755a1f9..85d2fe264d3d 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -713,7 +713,8 @@ config EADM_SCH config VFIO_CCW def_tristate n prompt "Support for VFIO-CCW subchannels" - depends on S390_CCW_IOMMU && VFIO_MDEV + depends on S390_CCW_IOMMU + select VFIO_MDEV help This driver allows usage of I/O subchannels via VFIO-CCW.
@@ -723,7 +724,8 @@ config VFIO_CCW config VFIO_AP def_tristate n prompt "VFIO support for AP devices" - depends on S390_AP_IOMMU && VFIO_MDEV && KVM + depends on S390_AP_IOMMU && KVM + select VFIO_MDEV depends on ZCRYPT help This driver grants access to Adjunct Processor (AP) devices diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index 74b35ec2ad28..3c68fe49042c 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -594,7 +594,6 @@ CONFIG_SYNC_FILE=y CONFIG_VFIO=m CONFIG_VFIO_PCI=m CONFIG_MLX5_VFIO_PCI=m -CONFIG_VFIO_MDEV=m CONFIG_VIRTIO_PCI=m CONFIG_VIRTIO_BALLOON=m CONFIG_VIRTIO_INPUT=y diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index cec71268e3bc..9ab91632f74c 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -583,7 +583,6 @@ CONFIG_SYNC_FILE=y CONFIG_VFIO=m CONFIG_VFIO_PCI=m CONFIG_MLX5_VFIO_PCI=m -CONFIG_VFIO_MDEV=m CONFIG_VIRTIO_PCI=m CONFIG_VIRTIO_BALLOON=m CONFIG_VIRTIO_INPUT=y diff --git a/drivers/gpu/drm/i915/Kconfig b/drivers/gpu/drm/i915/Kconfig index 3efce05d7b57..d06da455253c 100644 --- a/drivers/gpu/drm/i915/Kconfig +++ b/drivers/gpu/drm/i915/Kconfig @@ -116,9 +116,9 @@ config DRM_I915_GVT_KVMGT depends on X86 depends on 64BIT depends on KVM - depends on VFIO_MDEV select DRM_I915_GVT select KVM_EXTERNAL_WRITE_TRACKING + select VFIO_MDEV help Choose this option if you want to enable Intel GVT-g graphics diff --git a/drivers/vfio/mdev/Kconfig b/drivers/vfio/mdev/Kconfig index 646dbed44eb2..e5fb84e07965 100644 --- a/drivers/vfio/mdev/Kconfig +++ b/drivers/vfio/mdev/Kconfig @@ -1,10 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only config VFIO_MDEV - tristate "Mediated device driver framework" - default n - help - Provides a framework to virtualize devices. - See Documentation/driver-api/vfio-mediated-device.rst for more details. - - If you don't know what do here, say N. + tristate diff --git a/samples/Kconfig b/samples/Kconfig index f1b8d4ff1230..56b191d128d8 100644 --- a/samples/Kconfig +++ b/samples/Kconfig @@ -185,14 +185,14 @@ config SAMPLE_UHID config SAMPLE_VFIO_MDEV_MTTY tristate "Build VFIO mtty example mediated device sample code" - depends on VFIO_MDEV + select VFIO_MDEV help Build a virtual tty sample driver for use as a VFIO mediated device config SAMPLE_VFIO_MDEV_MDPY tristate "Build VFIO mdpy example mediated device sample code" - depends on VFIO_MDEV + select VFIO_MDEV help Build a virtual display sample driver for use as a VFIO mediated device. It is a simple framebuffer and supports @@ -209,7 +209,7 @@ config SAMPLE_VFIO_MDEV_MDPY_FB config SAMPLE_VFIO_MDEV_MBOCHS tristate "Build VFIO mdpy example mediated device sample code" - depends on VFIO_MDEV + select VFIO_MDEV select DMA_SHARED_BUFFER help Build a virtual display sample driver for use as a VFIO -- cgit v1.2.3 From c9c4c070e0fe2551f82b20bbf14e4dbde88e573d Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Sun, 8 Jan 2023 17:44:22 +0200 Subject: vfio/mlx5: Fix UBSAN note Prevent calling roundup_pow_of_two() with a value of 0, as doing so triggers the UBSAN warning quoted below. Move this code, together with the few related lines, so that it runs only when it is actually applicable, i.e. when max_recv_wr is non-zero.
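To see why a zero input is dangerous here, consider a minimal userspace sketch (illustrative only, not the driver code): roundup_pow_of_two() is built on the 1UL << fls_long(n - 1) idiom, and an input of 0 wraps n - 1 to ULONG_MAX, producing a shift by the full width of unsigned long, which is undefined behaviour and exactly what the splat below reports.

    /* Hypothetical re-creation of the hazard; mirrors the kernel idiom. */
    #include <stdio.h>

    static unsigned long roundup_pow_of_two_like(unsigned long n)
    {
            unsigned long v = n - 1;   /* n == 0 wraps to ULONG_MAX... */
            int bits = 0;

            while (v) {
                    bits++;
                    v >>= 1;
            }
            return 1UL << bits;        /* ...making this a shift by 64: UB */
    }

    int main(void)
    {
            printf("%lu\n", roundup_pow_of_two_like(8));  /* prints 8 */
            /* roundup_pow_of_two_like(0) would shift by 64 and trip UBSAN */
            return 0;
    }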
UBSAN: shift-out-of-bounds in ./include/linux/log2.h:57:13 shift exponent 64 is too large for 64-bit type 'long unsigned int' CPU: 15 PID: 1639 Comm: live_migration Not tainted 6.1.0-rc4 #1116 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack_lvl+0x45/0x59 ubsan_epilogue+0x5/0x36 __ubsan_handle_shift_out_of_bounds.cold+0x61/0xef ? lock_is_held_type+0x98/0x110 ? rcu_read_lock_sched_held+0x3f/0x70 mlx5vf_create_rc_qp.cold+0xe4/0xf2 [mlx5_vfio_pci] mlx5vf_start_page_tracker+0x769/0xcd0 [mlx5_vfio_pci] vfio_device_fops_unl_ioctl+0x63f/0x700 [vfio] __x64_sys_ioctl+0x433/0x9a0 do_syscall_64+0x3d/0x90 entry_SYSCALL_64_after_hwframe+0x63/0xcd Fixes: 79c3cf279926 ("vfio/mlx5: Init QP based resources for dirty tracking") Signed-off-by: Yishai Hadas Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20230108154427.32609-2-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index 64e68d13cb98..c5dcddbc4126 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -1036,14 +1036,14 @@ mlx5vf_create_rc_qp(struct mlx5_core_dev *mdev, if (!qp) return ERR_PTR(-ENOMEM); - qp->rq.wqe_cnt = roundup_pow_of_two(max_recv_wr); - log_rq_stride = ilog2(MLX5_SEND_WQE_DS); - log_rq_sz = ilog2(qp->rq.wqe_cnt); err = mlx5_db_alloc_node(mdev, &qp->db, mdev->priv.numa_node); if (err) goto err_free; if (max_recv_wr) { + qp->rq.wqe_cnt = roundup_pow_of_two(max_recv_wr); + log_rq_stride = ilog2(MLX5_SEND_WQE_DS); + log_rq_sz = ilog2(qp->rq.wqe_cnt); err = mlx5_frag_buf_alloc_node(mdev, wq_get_byte_sz(log_rq_sz, log_rq_stride), &qp->buf, mdev->priv.numa_node); -- cgit v1.2.3 From 83ff6095ecf8bafd095a921a2267227519f510f8 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Sun, 8 Jan 2023 17:44:23 +0200 Subject: vfio/mlx5: Allow loading of larger images than 512 MB Allow loading images larger than 512 MB by dropping the arbitrary hard-coded limit we have today and moving to the device's maximum load size, which is currently 4 GB. As part of that, move to the GFP_KERNEL_ACCOUNT option when allocating mlx5's persistent data, and rely on the cgroup to provide the memory limit for the given user. The GFP_KERNEL_ACCOUNT option lets the memory allocator know that this is an untrusted allocation triggered from userspace and should be subject to kmem accounting; as such, it is controlled by the cgroup mechanism.
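To make the accounting semantics concrete, here is a minimal sketch (illustrative names, not part of the patch) of the allocation pattern this series converts to: any long-lived buffer whose size is driven by userspace is allocated with GFP_KERNEL_ACCOUNT so the charge lands on the calling task's memory cgroup, and a misbehaving user hits its own cgroup limit instead of exhausting global kernel memory.

    /* Sketch only: a userspace-sized, long-lived allocation that is
     * charged to the calling task's memory cgroup. */
    #include <linux/slab.h>
    #include <linux/mm.h>

    struct user_tracked_buf {
            void *data;
            size_t size;
    };

    static struct user_tracked_buf *user_tracked_alloc(size_t size)
    {
            struct user_tracked_buf *b;

            b = kzalloc(sizeof(*b), GFP_KERNEL_ACCOUNT);
            if (!b)
                    return NULL;

            /* kvzalloc() falls back to vmalloc for large requests and
             * honours the same accounting flag. */
            b->data = kvzalloc(size, GFP_KERNEL_ACCOUNT);
            if (!b->data) {
                    kfree(b);
                    return NULL;
            }
            b->size = size;
            return b;
    }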
Signed-off-by: Yishai Hadas Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20230108154427.32609-3-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 11 ++++++----- drivers/vfio/pci/mlx5/main.c | 19 ++++++++++--------- 2 files changed, 16 insertions(+), 14 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index c5dcddbc4126..0586f09c69af 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -373,7 +373,7 @@ mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf, struct mlx5_vhca_data_buffer *buf; int ret; - buf = kzalloc(sizeof(*buf), GFP_KERNEL); + buf = kzalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT); if (!buf) return ERR_PTR(-ENOMEM); @@ -1032,7 +1032,7 @@ mlx5vf_create_rc_qp(struct mlx5_core_dev *mdev, void *in; int err; - qp = kzalloc(sizeof(*qp), GFP_KERNEL); + qp = kzalloc(sizeof(*qp), GFP_KERNEL_ACCOUNT); if (!qp) return ERR_PTR(-ENOMEM); @@ -1213,12 +1213,13 @@ static int alloc_recv_pages(struct mlx5_vhca_recv_buf *recv_buf, int i; recv_buf->page_list = kvcalloc(npages, sizeof(*recv_buf->page_list), - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (!recv_buf->page_list) return -ENOMEM; for (;;) { - filled = alloc_pages_bulk_array(GFP_KERNEL, npages - done, + filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT, + npages - done, recv_buf->page_list + done); if (!filled) goto err; @@ -1248,7 +1249,7 @@ static int register_dma_recv_pages(struct mlx5_core_dev *mdev, recv_buf->dma_addrs = kvcalloc(recv_buf->npages, sizeof(*recv_buf->dma_addrs), - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (!recv_buf->dma_addrs) return -ENOMEM; diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 9feb89c6d939..7ba127d8889a 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -21,8 +21,8 @@ #include "cmd.h" -/* Arbitrary to prevent userspace from consuming endless memory */ -#define MAX_MIGRATION_SIZE (512*1024*1024) +/* Device specification max LOAD size */ +#define MAX_LOAD_SIZE (BIT_ULL(__mlx5_bit_sz(load_vhca_state_in, size)) - 1) static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev) { @@ -73,12 +73,13 @@ int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf, int ret; to_fill = min_t(unsigned int, npages, PAGE_SIZE / sizeof(*page_list)); - page_list = kvzalloc(to_fill * sizeof(*page_list), GFP_KERNEL); + page_list = kvzalloc(to_fill * sizeof(*page_list), GFP_KERNEL_ACCOUNT); if (!page_list) return -ENOMEM; do { - filled = alloc_pages_bulk_array(GFP_KERNEL, to_fill, page_list); + filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT, to_fill, + page_list); if (!filled) { ret = -ENOMEM; goto err; @@ -87,7 +88,7 @@ int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf, ret = sg_alloc_append_table_from_pages( &buf->table, page_list, filled, 0, filled << PAGE_SHIFT, UINT_MAX, SG_MAX_SINGLE_ALLOC, - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (ret) goto err; @@ -467,7 +468,7 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track) size_t length; int ret; - migf = kzalloc(sizeof(*migf), GFP_KERNEL); + migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT); if (!migf) return ERR_PTR(-ENOMEM); @@ -564,7 +565,7 @@ mlx5vf_resume_read_image_no_header(struct mlx5_vhca_data_buffer *vhca_buf, { int ret; - if (requested_length > MAX_MIGRATION_SIZE) + if (requested_length > MAX_LOAD_SIZE) return -ENOMEM; if (vhca_buf->allocated_length < requested_length) { @@ -648,7 +649,7 @@ 
mlx5vf_resume_read_header(struct mlx5_vf_migration_file *migf, u64 flags; vhca_buf->header_image_size = le64_to_cpup((__le64 *)to_buff); - if (vhca_buf->header_image_size > MAX_MIGRATION_SIZE) { + if (vhca_buf->header_image_size > MAX_LOAD_SIZE) { ret = -ENOMEM; goto end; } @@ -781,7 +782,7 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev) struct mlx5_vhca_data_buffer *buf; int ret; - migf = kzalloc(sizeof(*migf), GFP_KERNEL); + migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT); if (!migf) return ERR_PTR(-ENOMEM); -- cgit v1.2.3 From 0886196ca8810c5b1f5097b71c4bc0df40b10208 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Sun, 8 Jan 2023 17:44:24 +0200 Subject: vfio: Use GFP_KERNEL_ACCOUNT for userspace persistent allocations Use GFP_KERNEL_ACCOUNT for userspace persistent allocations. The GFP_KERNEL_ACCOUNT option lets the memory allocator know that this is an untrusted allocation triggered from userspace and should be subject to kmem accounting; as such, it is controlled by the cgroup mechanism. The relevant allocations were found by, for example, looking at the close_device function and tracing all the kfree() calls back to their allocations. Signed-off-by: Jason Gunthorpe Signed-off-by: Yishai Hadas Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20230108154427.32609-4-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/container.c | 2 +- drivers/vfio/pci/vfio_pci_config.c | 6 +++--- drivers/vfio/pci/vfio_pci_core.c | 7 ++++--- drivers/vfio/pci/vfio_pci_igd.c | 2 +- drivers/vfio/pci/vfio_pci_intrs.c | 10 ++++++---- drivers/vfio/pci/vfio_pci_rdwr.c | 2 +- drivers/vfio/virqfd.c | 2 +- 7 files changed, 17 insertions(+), 14 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/container.c b/drivers/vfio/container.c index b7a9560ab25e..5f398c493a1b 100644 --- a/drivers/vfio/container.c +++ b/drivers/vfio/container.c @@ -367,7 +367,7 @@ static int vfio_fops_open(struct inode *inode, struct file *filep) { struct vfio_container *container; - container = kzalloc(sizeof(*container), GFP_KERNEL); + container = kzalloc(sizeof(*container), GFP_KERNEL_ACCOUNT); if (!container) return -ENOMEM; diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c index 4a350421c5f6..523e0144c86f 100644 --- a/drivers/vfio/pci/vfio_pci_config.c +++ b/drivers/vfio/pci/vfio_pci_config.c @@ -1244,7 +1244,7 @@ static int vfio_msi_cap_len(struct vfio_pci_core_device *vdev, u8 pos) if (vdev->msi_perm) return len; - vdev->msi_perm = kmalloc(sizeof(struct perm_bits), GFP_KERNEL); + vdev->msi_perm = kmalloc(sizeof(struct perm_bits), GFP_KERNEL_ACCOUNT); if (!vdev->msi_perm) return -ENOMEM; @@ -1731,11 +1731,11 @@ int vfio_config_init(struct vfio_pci_core_device *vdev) * no requirements on the length of a capability, so the gap between * capabilities needs byte granularity. */ - map = kmalloc(pdev->cfg_size, GFP_KERNEL); + map = kmalloc(pdev->cfg_size, GFP_KERNEL_ACCOUNT); if (!map) return -ENOMEM; - vconfig = kmalloc(pdev->cfg_size, GFP_KERNEL); + vconfig = kmalloc(pdev->cfg_size, GFP_KERNEL_ACCOUNT); if (!vconfig) { kfree(map); return -ENOMEM; } diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 26a541cc64d1..a6492a25ff6a 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -144,7 +144,8 @@ static void vfio_pci_probe_mmaps(struct vfio_pci_core_device *vdev) * of the exclusive page in case that hot-add * device's bar is assigned into it.
*/ - dummy_res = kzalloc(sizeof(*dummy_res), GFP_KERNEL); + dummy_res = + kzalloc(sizeof(*dummy_res), GFP_KERNEL_ACCOUNT); if (dummy_res == NULL) goto no_mmap; @@ -863,7 +864,7 @@ int vfio_pci_core_register_dev_region(struct vfio_pci_core_device *vdev, region = krealloc(vdev->region, (vdev->num_regions + 1) * sizeof(*region), - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (!region) return -ENOMEM; @@ -1644,7 +1645,7 @@ static int __vfio_pci_add_vma(struct vfio_pci_core_device *vdev, { struct vfio_pci_mmap_vma *mmap_vma; - mmap_vma = kmalloc(sizeof(*mmap_vma), GFP_KERNEL); + mmap_vma = kmalloc(sizeof(*mmap_vma), GFP_KERNEL_ACCOUNT); if (!mmap_vma) return -ENOMEM; diff --git a/drivers/vfio/pci/vfio_pci_igd.c b/drivers/vfio/pci/vfio_pci_igd.c index 5e6ca5926954..dd70e2431bd7 100644 --- a/drivers/vfio/pci/vfio_pci_igd.c +++ b/drivers/vfio/pci/vfio_pci_igd.c @@ -180,7 +180,7 @@ static int vfio_pci_igd_opregion_init(struct vfio_pci_core_device *vdev) if (!addr || !(~addr)) return -ENODEV; - opregionvbt = kzalloc(sizeof(*opregionvbt), GFP_KERNEL); + opregionvbt = kzalloc(sizeof(*opregionvbt), GFP_KERNEL_ACCOUNT); if (!opregionvbt) return -ENOMEM; diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c index 40c3d7cf163f..bffb0741518b 100644 --- a/drivers/vfio/pci/vfio_pci_intrs.c +++ b/drivers/vfio/pci/vfio_pci_intrs.c @@ -177,7 +177,7 @@ static int vfio_intx_enable(struct vfio_pci_core_device *vdev) if (!vdev->pdev->irq) return -ENODEV; - vdev->ctx = kzalloc(sizeof(struct vfio_pci_irq_ctx), GFP_KERNEL); + vdev->ctx = kzalloc(sizeof(struct vfio_pci_irq_ctx), GFP_KERNEL_ACCOUNT); if (!vdev->ctx) return -ENOMEM; @@ -216,7 +216,7 @@ static int vfio_intx_set_signal(struct vfio_pci_core_device *vdev, int fd) if (fd < 0) /* Disable only */ return 0; - vdev->ctx[0].name = kasprintf(GFP_KERNEL, "vfio-intx(%s)", + vdev->ctx[0].name = kasprintf(GFP_KERNEL_ACCOUNT, "vfio-intx(%s)", pci_name(pdev)); if (!vdev->ctx[0].name) return -ENOMEM; @@ -284,7 +284,8 @@ static int vfio_msi_enable(struct vfio_pci_core_device *vdev, int nvec, bool msi if (!is_irq_none(vdev)) return -EINVAL; - vdev->ctx = kcalloc(nvec, sizeof(struct vfio_pci_irq_ctx), GFP_KERNEL); + vdev->ctx = kcalloc(nvec, sizeof(struct vfio_pci_irq_ctx), + GFP_KERNEL_ACCOUNT); if (!vdev->ctx) return -ENOMEM; @@ -343,7 +344,8 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_core_device *vdev, if (fd < 0) return 0; - vdev->ctx[vector].name = kasprintf(GFP_KERNEL, "vfio-msi%s[%d](%s)", + vdev->ctx[vector].name = kasprintf(GFP_KERNEL_ACCOUNT, + "vfio-msi%s[%d](%s)", msix ? 
"x" : "", vector, pci_name(pdev)); if (!vdev->ctx[vector].name) diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c index e352a033b4ae..e27de61ac9fe 100644 --- a/drivers/vfio/pci/vfio_pci_rdwr.c +++ b/drivers/vfio/pci/vfio_pci_rdwr.c @@ -470,7 +470,7 @@ int vfio_pci_ioeventfd(struct vfio_pci_core_device *vdev, loff_t offset, goto out_unlock; } - ioeventfd = kzalloc(sizeof(*ioeventfd), GFP_KERNEL); + ioeventfd = kzalloc(sizeof(*ioeventfd), GFP_KERNEL_ACCOUNT); if (!ioeventfd) { ret = -ENOMEM; goto out_unlock; diff --git a/drivers/vfio/virqfd.c b/drivers/vfio/virqfd.c index 497a17b37865..29c564b7a6e1 100644 --- a/drivers/vfio/virqfd.c +++ b/drivers/vfio/virqfd.c @@ -112,7 +112,7 @@ int vfio_virqfd_enable(void *opaque, int ret = 0; __poll_t events; - virqfd = kzalloc(sizeof(*virqfd), GFP_KERNEL); + virqfd = kzalloc(sizeof(*virqfd), GFP_KERNEL_ACCOUNT); if (!virqfd) return -ENOMEM; -- cgit v1.2.3 From cb8285b89f2cf1db866f5c5aae5ce8db7aef41dd Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Sun, 8 Jan 2023 17:44:25 +0200 Subject: vfio/hisi: Use GFP_KERNEL_ACCOUNT for userspace persistent allocations Use GFP_KERNEL_ACCOUNT for userspace persistent allocations. The GFP_KERNEL_ACCOUNT option lets the memory allocator know that this is untrusted allocation triggered from userspace and should be a subject of kmem accounting, and as such it is controlled by the cgroup mechanism. Signed-off-by: Yishai Hadas Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20230108154427.32609-5-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c index 0bba3b05c6c7..a117eaf21c14 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c @@ -744,7 +744,7 @@ hisi_acc_vf_pci_resume(struct hisi_acc_vf_core_device *hisi_acc_vdev) { struct hisi_acc_vf_migration_file *migf; - migf = kzalloc(sizeof(*migf), GFP_KERNEL); + migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT); if (!migf) return ERR_PTR(-ENOMEM); @@ -863,7 +863,7 @@ hisi_acc_open_saving_migf(struct hisi_acc_vf_core_device *hisi_acc_vdev) struct hisi_acc_vf_migration_file *migf; int ret; - migf = kzalloc(sizeof(*migf), GFP_KERNEL); + migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT); if (!migf) return ERR_PTR(-ENOMEM); -- cgit v1.2.3 From 4a6c971a06ffc1d51bd26c03051d52c2e5977878 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Sun, 8 Jan 2023 17:44:26 +0200 Subject: vfio/fsl-mc: Use GFP_KERNEL_ACCOUNT for userspace persistent allocations Use GFP_KERNEL_ACCOUNT for userspace persistent allocations. The GFP_KERNEL_ACCOUNT option lets the memory allocator know that this is untrusted allocation triggered from userspace and should be a subject of kmem accounting, and as such it is controlled by the cgroup mechanism. 
Signed-off-by: Yishai Hadas Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20230108154427.32609-6-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/fsl-mc/vfio_fsl_mc.c | 2 +- drivers/vfio/fsl-mc/vfio_fsl_mc_intr.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/fsl-mc/vfio_fsl_mc.c b/drivers/vfio/fsl-mc/vfio_fsl_mc.c index defeb8510ace..c89a047a4cd8 100644 --- a/drivers/vfio/fsl-mc/vfio_fsl_mc.c +++ b/drivers/vfio/fsl-mc/vfio_fsl_mc.c @@ -28,7 +28,7 @@ static int vfio_fsl_mc_open_device(struct vfio_device *core_vdev) int i; vdev->regions = kcalloc(count, sizeof(struct vfio_fsl_mc_region), - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (!vdev->regions) return -ENOMEM; diff --git a/drivers/vfio/fsl-mc/vfio_fsl_mc_intr.c b/drivers/vfio/fsl-mc/vfio_fsl_mc_intr.c index 64d01f3fb13d..c51229fccbd6 100644 --- a/drivers/vfio/fsl-mc/vfio_fsl_mc_intr.c +++ b/drivers/vfio/fsl-mc/vfio_fsl_mc_intr.c @@ -29,7 +29,7 @@ static int vfio_fsl_mc_irqs_allocate(struct vfio_fsl_mc_device *vdev) irq_count = mc_dev->obj_desc.irq_count; - mc_irq = kcalloc(irq_count, sizeof(*mc_irq), GFP_KERNEL); + mc_irq = kcalloc(irq_count, sizeof(*mc_irq), GFP_KERNEL_ACCOUNT); if (!mc_irq) return -ENOMEM; @@ -77,7 +77,7 @@ static int vfio_set_trigger(struct vfio_fsl_mc_device *vdev, if (fd < 0) /* Disable only */ return 0; - irq->name = kasprintf(GFP_KERNEL, "vfio-irq[%d](%s)", + irq->name = kasprintf(GFP_KERNEL_ACCOUNT, "vfio-irq[%d](%s)", hwirq, dev_name(&vdev->mc_dev->dev)); if (!irq->name) return -ENOMEM; -- cgit v1.2.3 From 7658aeda334a624634b2ed0538feee374e24e1e3 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Sun, 8 Jan 2023 17:44:27 +0200 Subject: vfio/platform: Use GFP_KERNEL_ACCOUNT for userspace persistent allocations Use GFP_KERNEL_ACCOUNT for userspace persistent allocations. The GFP_KERNEL_ACCOUNT option lets the memory allocator know that this is an untrusted allocation triggered from userspace and should be subject to kmem accounting; as such, it is controlled by the cgroup mechanism.
Signed-off-by: Yishai Hadas Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20230108154427.32609-7-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/platform/vfio_platform_common.c | 2 +- drivers/vfio/platform/vfio_platform_irq.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/platform/vfio_platform_common.c b/drivers/vfio/platform/vfio_platform_common.c index a9ad3f4d2613..9f27e8c30c92 100644 --- a/drivers/vfio/platform/vfio_platform_common.c +++ b/drivers/vfio/platform/vfio_platform_common.c @@ -142,7 +142,7 @@ static int vfio_platform_regions_init(struct vfio_platform_device *vdev) cnt++; vdev->regions = kcalloc(cnt, sizeof(struct vfio_platform_region), - GFP_KERNEL); + GFP_KERNEL_ACCOUNT); if (!vdev->regions) return -ENOMEM; diff --git a/drivers/vfio/platform/vfio_platform_irq.c b/drivers/vfio/platform/vfio_platform_irq.c index c5b09ec0a3c9..665197caed89 100644 --- a/drivers/vfio/platform/vfio_platform_irq.c +++ b/drivers/vfio/platform/vfio_platform_irq.c @@ -186,9 +186,8 @@ static int vfio_set_trigger(struct vfio_platform_device *vdev, int index, if (fd < 0) /* Disable only */ return 0; - - irq->name = kasprintf(GFP_KERNEL, "vfio-irq[%d](%s)", - irq->hwirq, vdev->name); + irq->name = kasprintf(GFP_KERNEL_ACCOUNT, "vfio-irq[%d](%s)", + irq->hwirq, vdev->name); if (!irq->name) return -ENOMEM; @@ -286,7 +285,8 @@ int vfio_platform_irq_init(struct vfio_platform_device *vdev) while (vdev->get_irq(vdev, cnt) >= 0) cnt++; - vdev->irqs = kcalloc(cnt, sizeof(struct vfio_platform_irq), GFP_KERNEL); + vdev->irqs = kcalloc(cnt, sizeof(struct vfio_platform_irq), + GFP_KERNEL_ACCOUNT); if (!vdev->irqs) return -ENOMEM; -- cgit v1.2.3 From 038ef0a4765e78c1f08bace52a3f8238b9517b9c Mon Sep 17 00:00:00 2001 From: Bo Liu Date: Sun, 29 Jan 2023 03:41:17 -0500 Subject: vfio/mdev: Use sysfs_emit() instead of sprintf() Follow the advice of Documentation/filesystems/sysfs.rst: show() should only use sysfs_emit() or sysfs_emit_at() when formatting the value to be returned to user space. Signed-off-by: Bo Liu Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20230129084117.2384-1-liubo03@inspur.com Signed-off-by: Alex Williamson --- drivers/vfio/mdev/mdev_sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/mdev/mdev_sysfs.c b/drivers/vfio/mdev/mdev_sysfs.c index abe3359dd477..e4490639d383 100644 --- a/drivers/vfio/mdev/mdev_sysfs.c +++ b/drivers/vfio/mdev/mdev_sysfs.c @@ -96,7 +96,7 @@ static MDEV_TYPE_ATTR_RO(device_api); static ssize_t name_show(struct mdev_type *mtype, struct mdev_type_attribute *attr, char *buf) { - return sprintf(buf, "%s\n", + return sysfs_emit(buf, "%s\n", mtype->pretty_name ? mtype->pretty_name : mtype->sysfs_name); } -- cgit v1.2.3 From caf094b5a156272708b46f423833392af464b664 Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Tue, 24 Jan 2023 16:49:53 +0200 Subject: vfio/mlx5: Check whether VF is migratable Add a check of whether the VF is migratable, and mark the VFIO device as migration capable only if it is.
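The shape of the change, in a hedged sketch (hypothetical names; the real code is in the diff below): probe the capability once while setting up migration support, and bail out early so the device is exposed as a plain VFIO device when the VF cannot migrate.

    /* Illustrative gating pattern; example_query_migratable() is an
     * assumed helper returning 0 when the capability is present. */
    static void example_set_migratable(struct example_core_device *dev)
    {
            if (example_query_migratable(dev))
                    return;   /* not migratable: no migration ops exposed */

            dev->migration_flags = VFIO_MIGRATION_STOP_COPY;
            /* ...continue with vhca id lookup, migration fops, etc... */
    }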
Signed-off-by: Shay Drory Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20230124144955.139901-2-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 27 +++++++++++++++++++++++++++ drivers/vfio/pci/mlx5/cmd.h | 1 + 2 files changed, 28 insertions(+) (limited to 'drivers/vfio') diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index 0586f09c69af..e956e79626b7 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -7,6 +7,29 @@ enum { CQ_OK = 0, CQ_EMPTY = -1, CQ_POLL_ERR = -2 }; +static int mlx5vf_is_migratable(struct mlx5_core_dev *mdev, u16 func_id) +{ + int query_sz = MLX5_ST_SZ_BYTES(query_hca_cap_out); + void *query_cap = NULL, *cap; + int ret; + + query_cap = kzalloc(query_sz, GFP_KERNEL); + if (!query_cap) + return -ENOMEM; + + ret = mlx5_vport_get_other_func_cap(mdev, func_id, query_cap, + MLX5_CAP_GENERAL_2); + if (ret) + goto out; + + cap = MLX5_ADDR_OF(query_hca_cap_out, query_cap, capability); + if (!MLX5_GET(cmd_hca_cap_2, cap, migratable)) + ret = -EOPNOTSUPP; +out: + kfree(query_cap); + return ret; +} + static int mlx5vf_cmd_get_vhca_id(struct mlx5_core_dev *mdev, u16 function_id, u16 *vhca_id); static void @@ -195,6 +218,10 @@ void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev, if (mvdev->vf_id < 0) goto end; + ret = mlx5vf_is_migratable(mvdev->mdev, mvdev->vf_id + 1); + if (ret) + goto end; + if (mlx5vf_cmd_get_vhca_id(mvdev->mdev, mvdev->vf_id + 1, &mvdev->vhca_id)) goto end; diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index 5483171d57ad..657d94affe2b 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -9,6 +9,7 @@ #include <linux/kernel.h> #include <linux/vfio_pci_core.h> #include <linux/mlx5/driver.h> +#include <linux/mlx5/vport.h> #include <linux/mlx5/cq.h> #include <linux/mlx5/qp.h> -- cgit v1.2.3 From b04e2e86e919633825728007484508dbdc44c7bb Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 24 Jan 2023 16:49:54 +0200 Subject: vfio/mlx5: Improve the source side flow upon pre_copy Improve the source side flow upon pre_copy as follows. - Prepare the stop_copy buffers as part of moving to pre_copy. - Send the target a record that includes the expected stop_copy size, to let it optimize its stop_copy flow as well. To send the target this new record type (i.e. MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE), split the current 64 bits of header flags into 32 flags bits and 32 tag bits; each record carries a tag, plus a flag saying whether it is optional or mandatory. Optional records are ignored by the target. The above reduces the downtime upon stop_copy, as the relevant buffers are prepared ahead of time as part of pre_copy.
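A sketch of how a receiver might validate the record header described above (the struct layout is taken from the patch; the validation helper and the spelled-out constants are illustrative):

    /* Record header layout from the patch, little-endian on the wire;
     * the checking helper itself is an assumed sketch. */
    #include <linux/kernel.h>
    #include <linux/errno.h>

    struct mig_rec_header {
            __le64 record_size;
            __le32 flags;   /* mlx5_vf_migf_header_flags */
            __le32 tag;     /* mlx5_vf_migf_header_tag */
    };

    static int example_check_record(const struct mig_rec_header *h)
    {
            u32 flags = le32_to_cpu(h->flags);
            u32 tag = le32_to_cpu(h->tag);

            switch (tag) {
            case 0:         /* MLX5_MIGF_HEADER_TAG_FW_DATA */
            case 1:         /* MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE */
                    return 0;
            default:
                    /* Unknown optional records are skipped; unknown
                     * mandatory records fail the load. */
                    return (flags & 1 /* ..._FLAGS_TAG_OPTIONAL */) ?
                            0 : -EOPNOTSUPP;
            }
    }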
Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20230124144955.139901-3-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 31 +++++++--- drivers/vfio/pci/mlx5/cmd.h | 21 ++++++- drivers/vfio/pci/mlx5/main.c | 133 +++++++++++++++++++++++++++++++++++-------- 3 files changed, 151 insertions(+), 34 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index e956e79626b7..5161d845c478 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -500,7 +500,7 @@ void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work) } static int add_buf_header(struct mlx5_vhca_data_buffer *header_buf, - size_t image_size) + size_t image_size, bool initial_pre_copy) { struct mlx5_vf_migration_file *migf = header_buf->migf; struct mlx5_vf_migration_header header = {}; @@ -508,7 +508,9 @@ static int add_buf_header(struct mlx5_vhca_data_buffer *header_buf, struct page *page; u8 *to_buff; - header.image_size = cpu_to_le64(image_size); + header.record_size = cpu_to_le64(image_size); + header.flags = cpu_to_le32(MLX5_MIGF_HEADER_FLAGS_TAG_MANDATORY); + header.tag = cpu_to_le32(MLX5_MIGF_HEADER_TAG_FW_DATA); page = mlx5vf_get_migration_page(header_buf, 0); if (!page) return -EINVAL; @@ -516,12 +518,13 @@ static int add_buf_header(struct mlx5_vhca_data_buffer *header_buf, memcpy(to_buff, &header, sizeof(header)); kunmap_local(to_buff); header_buf->length = sizeof(header); - header_buf->header_image_size = image_size; header_buf->start_pos = header_buf->migf->max_pos; migf->max_pos += header_buf->length; spin_lock_irqsave(&migf->list_lock, flags); list_add_tail(&header_buf->buf_elm, &migf->buf_list); spin_unlock_irqrestore(&migf->list_lock, flags); + if (initial_pre_copy) + migf->pre_copy_initial_bytes += sizeof(header); return 0; } @@ -535,11 +538,14 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context) if (!status) { size_t image_size; unsigned long flags; + bool initial_pre_copy = migf->state != MLX5_MIGF_STATE_PRE_COPY && + !async_data->last_chunk; image_size = MLX5_GET(save_vhca_state_out, async_data->out, actual_image_size); if (async_data->header_buf) { - status = add_buf_header(async_data->header_buf, image_size); + status = add_buf_header(async_data->header_buf, image_size, + initial_pre_copy); if (status) goto err; } @@ -549,6 +555,8 @@ static void mlx5vf_save_callback(int status, struct mlx5_async_work *context) spin_lock_irqsave(&migf->list_lock, flags); list_add_tail(&async_data->buf->buf_elm, &migf->buf_list); spin_unlock_irqrestore(&migf->list_lock, flags); + if (initial_pre_copy) + migf->pre_copy_initial_bytes += image_size; migf->state = async_data->last_chunk ? 
MLX5_MIGF_STATE_COMPLETE : MLX5_MIGF_STATE_PRE_COPY; wake_up_interruptible(&migf->poll_wait); @@ -610,11 +618,16 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, } if (MLX5VF_PRE_COPY_SUPP(mvdev)) { - header_buf = mlx5vf_get_data_buffer(migf, - sizeof(struct mlx5_vf_migration_header), DMA_NONE); - if (IS_ERR(header_buf)) { - err = PTR_ERR(header_buf); - goto err_free; + if (async_data->last_chunk && migf->buf_header) { + header_buf = migf->buf_header; + migf->buf_header = NULL; + } else { + header_buf = mlx5vf_get_data_buffer(migf, + sizeof(struct mlx5_vf_migration_header), DMA_NONE); + if (IS_ERR(header_buf)) { + err = PTR_ERR(header_buf); + goto err_free; + } } } diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index 657d94affe2b..8f1bef580028 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -32,10 +32,26 @@ enum mlx5_vf_load_state { MLX5_VF_LOAD_STATE_LOAD_IMAGE, }; +struct mlx5_vf_migration_tag_stop_copy_data { + __le64 stop_copy_size; +}; + +enum mlx5_vf_migf_header_flags { + MLX5_MIGF_HEADER_FLAGS_TAG_MANDATORY = 0, + MLX5_MIGF_HEADER_FLAGS_TAG_OPTIONAL = 1 << 0, +}; + +enum mlx5_vf_migf_header_tag { + MLX5_MIGF_HEADER_TAG_FW_DATA = 0, + MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE = 1 << 0, +}; + struct mlx5_vf_migration_header { - __le64 image_size; + __le64 record_size; /* For future use in case we may need to change the kernel protocol */ - __le64 flags; + __le32 flags; /* Use mlx5_vf_migf_header_flags */ + __le32 tag; /* Use mlx5_vf_migf_header_tag */ + __u8 data[]; /* Its size is given in the record_size */ }; struct mlx5_vhca_data_buffer { @@ -73,6 +89,7 @@ struct mlx5_vf_migration_file { enum mlx5_vf_load_state load_state; u32 pdn; loff_t max_pos; + u64 pre_copy_initial_bytes; struct mlx5_vhca_data_buffer *buf; struct mlx5_vhca_data_buffer *buf_header; spinlock_t list_lock; diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 7ba127d8889a..6856e7b97533 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -304,6 +304,87 @@ static void mlx5vf_mark_err(struct mlx5_vf_migration_file *migf) wake_up_interruptible(&migf->poll_wait); } +static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf) +{ + size_t size = sizeof(struct mlx5_vf_migration_header) + + sizeof(struct mlx5_vf_migration_tag_stop_copy_data); + struct mlx5_vf_migration_tag_stop_copy_data data = {}; + struct mlx5_vhca_data_buffer *header_buf = NULL; + struct mlx5_vf_migration_header header = {}; + unsigned long flags; + struct page *page; + u8 *to_buff; + int ret; + + header_buf = mlx5vf_get_data_buffer(migf, size, DMA_NONE); + if (IS_ERR(header_buf)) + return PTR_ERR(header_buf); + + header.record_size = cpu_to_le64(sizeof(data)); + header.flags = cpu_to_le32(MLX5_MIGF_HEADER_FLAGS_TAG_OPTIONAL); + header.tag = cpu_to_le32(MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE); + page = mlx5vf_get_migration_page(header_buf, 0); + if (!page) { + ret = -EINVAL; + goto err; + } + to_buff = kmap_local_page(page); + memcpy(to_buff, &header, sizeof(header)); + header_buf->length = sizeof(header); + data.stop_copy_size = cpu_to_le64(migf->buf->allocated_length); + memcpy(to_buff + sizeof(header), &data, sizeof(data)); + header_buf->length += sizeof(data); + kunmap_local(to_buff); + header_buf->start_pos = header_buf->migf->max_pos; + migf->max_pos += header_buf->length; + spin_lock_irqsave(&migf->list_lock, flags); + list_add_tail(&header_buf->buf_elm, &migf->buf_list); + 
spin_unlock_irqrestore(&migf->list_lock, flags); + migf->pre_copy_initial_bytes = size; + return 0; +err: + mlx5vf_put_data_buffer(header_buf); + return ret; +} + +static int mlx5vf_prep_stop_copy(struct mlx5_vf_migration_file *migf, + size_t state_size) +{ + struct mlx5_vhca_data_buffer *buf; + size_t inc_state_size; + int ret; + + /* let's be ready for stop_copy size that might grow by 10 percents */ + if (check_add_overflow(state_size, state_size / 10, &inc_state_size)) + inc_state_size = state_size; + + buf = mlx5vf_get_data_buffer(migf, inc_state_size, DMA_FROM_DEVICE); + if (IS_ERR(buf)) + return PTR_ERR(buf); + + migf->buf = buf; + buf = mlx5vf_get_data_buffer(migf, + sizeof(struct mlx5_vf_migration_header), DMA_NONE); + if (IS_ERR(buf)) { + ret = PTR_ERR(buf); + goto err; + } + + migf->buf_header = buf; + ret = mlx5vf_add_stop_copy_header(migf); + if (ret) + goto err_header; + return 0; + +err_header: + mlx5vf_put_data_buffer(migf->buf_header); + migf->buf_header = NULL; +err: + mlx5vf_put_data_buffer(migf->buf); + migf->buf = NULL; + return ret; +} + static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { @@ -314,7 +395,7 @@ static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd, loff_t *pos = &filp->f_pos; unsigned long minsz; size_t inc_length = 0; - bool end_of_data; + bool end_of_data = false; int ret; if (cmd != VFIO_MIG_GET_PRECOPY_INFO) @@ -358,25 +439,19 @@ static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd, goto err_migf_unlock; } - buf = mlx5vf_get_data_buff_from_pos(migf, *pos, &end_of_data); - if (buf) { - if (buf->start_pos == 0) { - info.initial_bytes = buf->header_image_size - *pos; - } else if (buf->start_pos == - sizeof(struct mlx5_vf_migration_header)) { - /* First data buffer following the header */ - info.initial_bytes = buf->start_pos + - buf->length - *pos; - } else { - info.dirty_bytes = buf->start_pos + buf->length - *pos; - } + if (migf->pre_copy_initial_bytes > *pos) { + info.initial_bytes = migf->pre_copy_initial_bytes - *pos; } else { - if (!end_of_data) { - ret = -EINVAL; - goto err_migf_unlock; + buf = mlx5vf_get_data_buff_from_pos(migf, *pos, &end_of_data); + if (buf) { + info.dirty_bytes = buf->start_pos + buf->length - *pos; + } else { + if (!end_of_data) { + ret = -EINVAL; + goto err_migf_unlock; + } + info.dirty_bytes = inc_length; } - - info.dirty_bytes = inc_length; } if (!end_of_data || !inc_length) { @@ -441,10 +516,16 @@ static int mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device *mvdev) if (ret) goto err; - buf = mlx5vf_get_data_buffer(migf, length, DMA_FROM_DEVICE); - if (IS_ERR(buf)) { - ret = PTR_ERR(buf); - goto err; + /* Checking whether we have a matching pre-allocated buffer that can fit */ + if (migf->buf && migf->buf->allocated_length >= length) { + buf = migf->buf; + migf->buf = NULL; + } else { + buf = mlx5vf_get_data_buffer(migf, length, DMA_FROM_DEVICE); + if (IS_ERR(buf)) { + ret = PTR_ERR(buf); + goto err; + } } ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false); @@ -503,6 +584,12 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track) if (ret) goto out_pd; + if (track) { + ret = mlx5vf_prep_stop_copy(migf, length); + if (ret) + goto out_pd; + } + buf = mlx5vf_alloc_data_buffer(migf, length, DMA_FROM_DEVICE); if (IS_ERR(buf)) { ret = PTR_ERR(buf); @@ -516,7 +603,7 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track) out_save: mlx5vf_free_data_buffer(buf); out_pd: - 
mlx5vf_cmd_dealloc_pd(migf); + mlx5fv_cmd_clean_migf_resources(migf); out_free: fput(migf->filp); end: -- cgit v1.2.3 From f4f0c25e5d726e353369258b5ce28a01bd71aa47 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 24 Jan 2023 16:49:55 +0200 Subject: vfio/mlx5: Improve the target side flow to reduce downtime Improve the target side flow to reduce downtime as follows. - Support reading an optional record which includes the expected stop_copy size. - Once the source sends this record, which is expected to arrive as part of the pre_copy flow, prepare data buffers that may be large enough to hold the final stop_copy data. The above reduces the migration downtime, as the resources needed to load the image data are prepared ahead of time as part of pre_copy. Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20230124144955.139901-4-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.h | 6 ++- drivers/vfio/pci/mlx5/main.c | 111 ++++++++++++++++++++++++++++++++++++++----- 2 files changed, 105 insertions(+), 12 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index 8f1bef580028..aec4c69dd6c1 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -27,6 +27,8 @@ enum mlx5_vf_migf_state { enum mlx5_vf_load_state { MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER, MLX5_VF_LOAD_STATE_READ_HEADER, + MLX5_VF_LOAD_STATE_PREP_HEADER_DATA, + MLX5_VF_LOAD_STATE_READ_HEADER_DATA, MLX5_VF_LOAD_STATE_PREP_IMAGE, MLX5_VF_LOAD_STATE_READ_IMAGE, MLX5_VF_LOAD_STATE_LOAD_IMAGE, @@ -59,7 +61,6 @@ struct mlx5_vhca_data_buffer { loff_t start_pos; u64 length; u64 allocated_length; - u64 header_image_size; u32 mkey; enum dma_data_direction dma_dir; u8 dmaed:1; @@ -89,6 +90,9 @@ struct mlx5_vf_migration_file { enum mlx5_vf_load_state load_state; u32 pdn; loff_t max_pos; + u64 record_size; + u32 record_tag; + u64 stop_copy_prep_size; u64 pre_copy_initial_bytes; struct mlx5_vhca_data_buffer *buf; struct mlx5_vhca_data_buffer *buf_header; diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 6856e7b97533..e897537a9e8a 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -703,6 +703,56 @@ mlx5vf_resume_read_image(struct mlx5_vf_migration_file *migf, return 0; } +static int +mlx5vf_resume_read_header_data(struct mlx5_vf_migration_file *migf, + struct mlx5_vhca_data_buffer *vhca_buf, + const char __user **buf, size_t *len, + loff_t *pos, ssize_t *done) +{ + size_t copy_len, to_copy; + size_t required_data; + u8 *to_buff; + int ret; + + required_data = migf->record_size - vhca_buf->length; + to_copy = min_t(size_t, *len, required_data); + copy_len = to_copy; + while (to_copy) { + ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, &to_copy, pos, + done); + if (ret) + return ret; + } + + *len -= copy_len; + if (vhca_buf->length == migf->record_size) { + switch (migf->record_tag) { + case MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE: + { + struct page *page; + + page = mlx5vf_get_migration_page(vhca_buf, 0); + if (!page) + return -EINVAL; + to_buff = kmap_local_page(page); + migf->stop_copy_prep_size = min_t(u64, + le64_to_cpup((__le64 *)to_buff), MAX_LOAD_SIZE); + kunmap_local(to_buff); + break; + } + default: + /* Optional tag */ + break; + } + + migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER; + migf->max_pos += migf->record_size; + vhca_buf->length = 0; + } + + return 0; +} + static int mlx5vf_resume_read_header(struct mlx5_vf_migration_file *migf, struct
mlx5_vhca_data_buffer *vhca_buf, @@ -733,23 +783,38 @@ mlx5vf_resume_read_header(struct mlx5_vf_migration_file *migf, *len -= copy_len; vhca_buf->length += copy_len; if (vhca_buf->length == sizeof(struct mlx5_vf_migration_header)) { - u64 flags; + u64 record_size; + u32 flags; - vhca_buf->header_image_size = le64_to_cpup((__le64 *)to_buff); - if (vhca_buf->header_image_size > MAX_LOAD_SIZE) { + record_size = le64_to_cpup((__le64 *)to_buff); + if (record_size > MAX_LOAD_SIZE) { ret = -ENOMEM; goto end; } - flags = le64_to_cpup((__le64 *)(to_buff + + migf->record_size = record_size; + flags = le32_to_cpup((__le32 *)(to_buff + offsetof(struct mlx5_vf_migration_header, flags))); - if (flags) { - ret = -EOPNOTSUPP; - goto end; + migf->record_tag = le32_to_cpup((__le32 *)(to_buff + + offsetof(struct mlx5_vf_migration_header, tag))); + switch (migf->record_tag) { + case MLX5_MIGF_HEADER_TAG_FW_DATA: + migf->load_state = MLX5_VF_LOAD_STATE_PREP_IMAGE; + break; + case MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE: + migf->load_state = MLX5_VF_LOAD_STATE_PREP_HEADER_DATA; + break; + default: + if (!(flags & MLX5_MIGF_HEADER_FLAGS_TAG_OPTIONAL)) { + ret = -EOPNOTSUPP; + goto end; + } + /* We may read and skip this optional record data */ + migf->load_state = MLX5_VF_LOAD_STATE_PREP_HEADER_DATA; } - migf->load_state = MLX5_VF_LOAD_STATE_PREP_IMAGE; migf->max_pos += vhca_buf->length; + vhca_buf->length = 0; *has_work = true; } end: @@ -793,9 +858,34 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf, if (ret) goto out_unlock; break; + case MLX5_VF_LOAD_STATE_PREP_HEADER_DATA: + if (vhca_buf_header->allocated_length < migf->record_size) { + mlx5vf_free_data_buffer(vhca_buf_header); + + migf->buf_header = mlx5vf_alloc_data_buffer(migf, + migf->record_size, DMA_NONE); + if (IS_ERR(migf->buf_header)) { + ret = PTR_ERR(migf->buf_header); + migf->buf_header = NULL; + goto out_unlock; + } + + vhca_buf_header = migf->buf_header; + } + + vhca_buf_header->start_pos = migf->max_pos; + migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER_DATA; + break; + case MLX5_VF_LOAD_STATE_READ_HEADER_DATA: + ret = mlx5vf_resume_read_header_data(migf, vhca_buf_header, + &buf, &len, pos, &done); + if (ret) + goto out_unlock; + break; case MLX5_VF_LOAD_STATE_PREP_IMAGE: { - u64 size = vhca_buf_header->header_image_size; + u64 size = max(migf->record_size, + migf->stop_copy_prep_size); if (vhca_buf->allocated_length < size) { mlx5vf_free_data_buffer(vhca_buf); @@ -824,7 +914,7 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf, break; case MLX5_VF_LOAD_STATE_READ_IMAGE: ret = mlx5vf_resume_read_image(migf, vhca_buf, - vhca_buf_header->header_image_size, + migf->record_size, &buf, &len, pos, &done, &has_work); if (ret) goto out_unlock; @@ -837,7 +927,6 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf, /* prep header buf for next image */ vhca_buf_header->length = 0; - vhca_buf_header->header_image_size = 0; /* prep data buf for next image */ vhca_buf->length = 0; -- cgit v1.2.3 From 168a9c91fe0a1180959b6394f4566de7080244b6 Mon Sep 17 00:00:00 2001 From: Tomasz Duszynski Date: Tue, 31 Jan 2023 09:33:49 +0100 Subject: vfio: platform: ignore missing reset if disabled at module init If reset requirement was relaxed via module parameter, errors caused by missing reset should not be propagated down to the vfio core. Otherwise initialization will fail. 
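In miniature, the control flow being fixed looks like this (a hedged sketch, not the actual driver code): the error from the reset lookup must only be propagated when the module was loaded with reset_required set.

    /* Illustrative: tolerate a missing reset handler unless required. */
    static int example_init_common(struct example_vdev *vdev)
    {
            int ret = example_get_reset(vdev);   /* assumed lookup helper */

            if (ret && vdev->reset_required) {
                    dev_err(vdev->device,
                            "No reset function found for device %s\n",
                            vdev->name);
                    return ret;
            }

            return 0;   /* missing reset is acceptable when not required */
    }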
Signed-off-by: Tomasz Duszynski Fixes: 5f6c7e0831a1 ("vfio/platform: Use the new device life cycle helpers") Reviewed-by: Kevin Tian Reviewed-by: Eric Auger Link: https://lore.kernel.org/r/20230131083349.2027189-1-tduszynski@marvell.com Signed-off-by: Alex Williamson --- drivers/vfio/platform/vfio_platform_common.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/platform/vfio_platform_common.c b/drivers/vfio/platform/vfio_platform_common.c index 9f27e8c30c92..e53757d1d095 100644 --- a/drivers/vfio/platform/vfio_platform_common.c +++ b/drivers/vfio/platform/vfio_platform_common.c @@ -647,10 +647,13 @@ int vfio_platform_init_common(struct vfio_platform_device *vdev) mutex_init(&vdev->igate); ret = vfio_platform_get_reset(vdev); - if (ret && vdev->reset_required) + if (ret && vdev->reset_required) { dev_err(dev, "No reset function found for device %s\n", vdev->name); - return ret; + return ret; + } + + return 0; } EXPORT_SYMBOL_GPL(vfio_platform_init_common); -- cgit v1.2.3 From ef3a3f6a294ba65fd906a291553935881796f8a5 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Tue, 31 Jan 2023 08:58:03 -0800 Subject: vfio/type1: exclude mdevs from VFIO_UPDATE_VADDR Disable the VFIO_UPDATE_VADDR capability if mediated devices are present. Their kernel threads could be blocked indefinitely by a misbehaving userland while trying to pin/unpin pages while vaddrs are being updated. Do not allow groups to be added to the container while vaddr's are invalid, so we never need to block user threads from pinning, and can delete the vaddr-waiting code in a subsequent patch. Fixes: c3cbab24db38 ("vfio/type1: implement interfaces to update vaddr") Cc: stable@vger.kernel.org Signed-off-by: Steve Sistare Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/1675184289-267876-2-git-send-email-steven.sistare@oracle.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio_iommu_type1.c | 44 +++++++++++++++++++++++++++++++++++++++-- include/uapi/linux/vfio.h | 15 ++++++++------ 2 files changed, 51 insertions(+), 8 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 23c24fe98c00..144f5bb20fb8 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -861,6 +861,12 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data, mutex_lock(&iommu->lock); + if (WARN_ONCE(iommu->vaddr_invalid_count, + "vfio_pin_pages not allowed with VFIO_UPDATE_VADDR\n")) { + ret = -EBUSY; + goto pin_done; + } + /* * Wait for all necessary vaddr's to be valid so they can be used in * the main loop without dropping the lock, to avoid racing vs unmap. @@ -1343,6 +1349,12 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu, mutex_lock(&iommu->lock); + /* Cannot update vaddr if mdev is present. */ + if (invalidate_vaddr && !list_empty(&iommu->emulated_iommu_groups)) { + ret = -EBUSY; + goto unlock; + } + pgshift = __ffs(iommu->pgsize_bitmap); pgsize = (size_t)1 << pgshift; @@ -2185,11 +2197,16 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, struct iommu_domain_geometry *geo; LIST_HEAD(iova_copy); LIST_HEAD(group_resv_regions); - int ret = -EINVAL; + int ret = -EBUSY; mutex_lock(&iommu->lock); + /* Attach could require pinning, so disallow while vaddr is invalid. 
*/ + if (iommu->vaddr_invalid_count) + goto out_unlock; + /* Check for duplicates */ + ret = -EINVAL; if (vfio_iommu_find_iommu_group(iommu, iommu_group)) goto out_unlock; @@ -2660,6 +2677,16 @@ static int vfio_domains_have_enforce_cache_coherency(struct vfio_iommu *iommu) return ret; } +static bool vfio_iommu_has_emulated(struct vfio_iommu *iommu) +{ + bool ret; + + mutex_lock(&iommu->lock); + ret = !list_empty(&iommu->emulated_iommu_groups); + mutex_unlock(&iommu->lock); + return ret; +} + static int vfio_iommu_type1_check_extension(struct vfio_iommu *iommu, unsigned long arg) { @@ -2668,8 +2695,13 @@ static int vfio_iommu_type1_check_extension(struct vfio_iommu *iommu, case VFIO_TYPE1v2_IOMMU: case VFIO_TYPE1_NESTING_IOMMU: case VFIO_UNMAP_ALL: - case VFIO_UPDATE_VADDR: return 1; + case VFIO_UPDATE_VADDR: + /* + * Disable this feature if mdevs are present. They cannot + * safely pin/unpin/rw while vaddrs are being updated. + */ + return iommu && !vfio_iommu_has_emulated(iommu); case VFIO_DMA_CC_IOMMU: if (!iommu) return 0; @@ -3138,6 +3170,13 @@ static int vfio_iommu_type1_dma_rw(void *iommu_data, dma_addr_t user_iova, size_t done; mutex_lock(&iommu->lock); + + if (WARN_ONCE(iommu->vaddr_invalid_count, + "vfio_dma_rw not allowed with VFIO_UPDATE_VADDR\n")) { + ret = -EBUSY; + goto out; + } + while (count > 0) { ret = vfio_iommu_type1_dma_rw_chunk(iommu, user_iova, data, count, write, &done); @@ -3149,6 +3188,7 @@ static int vfio_iommu_type1_dma_rw(void *iommu_data, dma_addr_t user_iova, user_iova += done; } +out: mutex_unlock(&iommu->lock); return ret; } diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 23105eb036fa..0552e8dcf0cb 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -49,7 +49,11 @@ /* Supports VFIO_DMA_UNMAP_FLAG_ALL */ #define VFIO_UNMAP_ALL 9 -/* Supports the vaddr flag for DMA map and unmap */ +/* + * Supports the vaddr flag for DMA map and unmap. Not supported for mediated + * devices, so this capability is subject to change as groups are added or + * removed. + */ #define VFIO_UPDATE_VADDR 10 /* @@ -1343,8 +1347,7 @@ struct vfio_iommu_type1_info_dma_avail { * Map process virtual addresses to IO virtual addresses using the * provided struct vfio_dma_map. Caller sets argsz. READ &/ WRITE required. * - * If flags & VFIO_DMA_MAP_FLAG_VADDR, update the base vaddr for iova, and - * unblock translation of host virtual addresses in the iova range. The vaddr + * If flags & VFIO_DMA_MAP_FLAG_VADDR, update the base vaddr for iova. The vaddr * must have previously been invalidated with VFIO_DMA_UNMAP_FLAG_VADDR. To * maintain memory consistency within the user application, the updated vaddr * must address the same memory object as originally mapped. Failure to do so @@ -1395,9 +1398,9 @@ struct vfio_bitmap { * must be 0. This cannot be combined with the get-dirty-bitmap flag. * * If flags & VFIO_DMA_UNMAP_FLAG_VADDR, do not unmap, but invalidate host - * virtual addresses in the iova range. Tasks that attempt to translate an - * iova's vaddr will block. DMA to already-mapped pages continues. This - * cannot be combined with the get-dirty-bitmap flag. + * virtual addresses in the iova range. DMA to already-mapped pages continues. + * Groups may not be added to the container while any addresses are invalid. + * This cannot be combined with the get-dirty-bitmap flag. 
*/ struct vfio_iommu_type1_dma_unmap { __u32 argsz; -- cgit v1.2.3 From 046eca5018f8a5dd1dc2cedf87fb5843b9ea3026 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Tue, 31 Jan 2023 08:58:04 -0800 Subject: vfio/type1: prevent underflow of locked_vm via exec() When a vfio container is preserved across exec, the task does not change, but it gets a new mm with locked_vm=0, and loses the count from existing dma mappings. If the user later unmaps a dma mapping, locked_vm underflows to a large unsigned value, and a subsequent dma map request fails with ENOMEM in __account_locked_vm. To avoid underflow, grab and save the mm at the time a dma is mapped. Use that mm when adjusting locked_vm, rather than re-acquiring the saved task's mm, which may have changed. If the saved mm is dead, do nothing. locked_vm is incremented for existing mappings in a subsequent patch. Fixes: 73fa0d10d077 ("vfio: Type1 IOMMU implementation") Cc: stable@vger.kernel.org Signed-off-by: Steve Sistare Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/1675184289-267876-3-git-send-email-steven.sistare@oracle.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio_iommu_type1.c | 41 ++++++++++++++--------------------------- 1 file changed, 14 insertions(+), 27 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 144f5bb20fb8..6b757d035457 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -100,6 +100,7 @@ struct vfio_dma { struct task_struct *task; struct rb_root pfn_list; /* Ex-user pinned pfn list */ unsigned long *bitmap; + struct mm_struct *mm; }; struct vfio_batch { @@ -420,8 +421,8 @@ static int vfio_lock_acct(struct vfio_dma *dma, long npage, bool async) if (!npage) return 0; - mm = async ? get_task_mm(dma->task) : dma->task->mm; - if (!mm) + mm = dma->mm; + if (async && !mmget_not_zero(mm)) return -ESRCH; /* process exited */ ret = mmap_write_lock_killable(mm); @@ -794,8 +795,8 @@ static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr, struct mm_struct *mm; int ret; - mm = get_task_mm(dma->task); - if (!mm) + mm = dma->mm; + if (!mmget_not_zero(mm)) return -ENODEV; ret = vaddr_get_pfns(mm, vaddr, 1, dma->prot, pfn_base, pages); @@ -805,7 +806,7 @@ static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr, ret = 0; if (do_accounting && !is_invalid_reserved_pfn(*pfn_base)) { - ret = vfio_lock_acct(dma, 1, true); + ret = vfio_lock_acct(dma, 1, false); if (ret) { put_pfn(*pfn_base, dma->prot); if (ret == -ENOMEM) @@ -1180,6 +1181,7 @@ static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma) vfio_unmap_unpin(iommu, dma, true); vfio_unlink_dma(iommu, dma); put_task_struct(dma->task); + mmdrop(dma->mm); vfio_dma_bitmap_free(dma); if (dma->vaddr_invalid) { iommu->vaddr_invalid_count--; @@ -1664,29 +1666,15 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu, * against the locked memory limit and we need to be able to do both * outside of this call path as pinning can be asynchronous via the * external interfaces for mdev devices. RLIMIT_MEMLOCK requires a - * task_struct and VM locked pages requires an mm_struct, however - * holding an indefinite mm reference is not recommended, therefore we - * only hold a reference to a task. 
We could hold a reference to - * current, however QEMU uses this call path through vCPU threads, - * which can be killed resulting in a NULL mm and failure in the unmap - * path when called via a different thread. Avoid this problem by - * using the group_leader as threads within the same group require - * both CLONE_THREAD and CLONE_VM and will therefore use the same - * mm_struct. - * - * Previously we also used the task for testing CAP_IPC_LOCK at the - * time of pinning and accounting, however has_capability() makes use - * of real_cred, a copy-on-write field, so we can't guarantee that it - * matches group_leader, or in fact that it might not change by the - * time it's evaluated. If a process were to call MAP_DMA with - * CAP_IPC_LOCK but later drop it, it doesn't make sense that they - * possibly see different results for an iommu_mapped vfio_dma vs - * externally mapped. Therefore track CAP_IPC_LOCK in vfio_dma at the - * time of calling MAP_DMA. + * task_struct. Save the group_leader so that all DMA tracking uses + * the same task, to make debugging easier. VM locked pages requires + * an mm_struct, so grab the mm in case the task dies. */ get_task_struct(current->group_leader); dma->task = current->group_leader; dma->lock_cap = capable(CAP_IPC_LOCK); + dma->mm = current->mm; + mmgrab(dma->mm); dma->pfn_list = RB_ROOT; @@ -3122,9 +3110,8 @@ static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu, !(dma->prot & IOMMU_READ)) return -EPERM; - mm = get_task_mm(dma->task); - - if (!mm) + mm = dma->mm; + if (!mmget_not_zero(mm)) return -EPERM; if (kthread) -- cgit v1.2.3 From 18e292705ba21cc9b3227b9ad5b1c28973605ee5 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Tue, 31 Jan 2023 08:58:05 -0800 Subject: vfio/type1: track locked_vm per dma Track locked_vm per dma struct, and create a new subroutine, both for use in a subsequent patch. No functional change. 
Fixes: c3cbab24db38 ("vfio/type1: implement interfaces to update vaddr") Cc: stable@vger.kernel.org Signed-off-by: Steve Sistare Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/1675184289-267876-4-git-send-email-steven.sistare@oracle.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio_iommu_type1.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 6b757d035457..372d91b6a2f6 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -101,6 +101,7 @@ struct vfio_dma { struct rb_root pfn_list; /* Ex-user pinned pfn list */ unsigned long *bitmap; struct mm_struct *mm; + size_t locked_vm; }; struct vfio_batch { @@ -413,6 +414,19 @@ static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn) return ret; } +static int mm_lock_acct(struct task_struct *task, struct mm_struct *mm, + bool lock_cap, long npage) +{ + int ret = mmap_write_lock_killable(mm); + + if (ret) + return ret; + + ret = __account_locked_vm(mm, abs(npage), npage > 0, task, lock_cap); + mmap_write_unlock(mm); + return ret; +} + static int vfio_lock_acct(struct vfio_dma *dma, long npage, bool async) { struct mm_struct *mm; @@ -425,12 +439,9 @@ static int vfio_lock_acct(struct vfio_dma *dma, long npage, bool async) if (async && !mmget_not_zero(mm)) return -ESRCH; /* process exited */ - ret = mmap_write_lock_killable(mm); - if (!ret) { - ret = __account_locked_vm(mm, abs(npage), npage > 0, dma->task, - dma->lock_cap); - mmap_write_unlock(mm); - } + ret = mm_lock_acct(dma->task, mm, dma->lock_cap, npage); + if (!ret) + dma->locked_vm += npage; if (async) mmput(mm); -- cgit v1.2.3 From 90fdd158a695d70403163f9a0e4efc5b20f3fd3e Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Tue, 31 Jan 2023 08:58:06 -0800 Subject: vfio/type1: restore locked_vm When a vfio container is preserved across exec or fork-exec, the new task's mm has a locked_vm count of 0. After a dma vaddr is updated using VFIO_DMA_MAP_FLAG_VADDR, locked_vm remains 0, and the pinned memory does not count against the task's RLIMIT_MEMLOCK. To restore the correct locked_vm count, when VFIO_DMA_MAP_FLAG_VADDR is used and the dma's mm has changed, add the dma's locked_vm count to the new mm->locked_vm, subject to the rlimit, and subtract it from the old mm->locked_vm. 
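For context, a minimal userspace sketch of the flow this patch fixes.
All names (container_fd, iova, len, new_vaddr) are placeholders, and it
assumes the mapping was created with read/write access and that its
vaddr was invalidated with VFIO_DMA_UNMAP_FLAG_VADDR before the exec:

	#include <stdint.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <linux/vfio.h>

	/* Re-register the new process's vaddr for an existing iova
	 * range after the container fd was preserved across exec(). */
	static int update_vaddr(int container_fd, uint64_t iova,
				uint64_t len, void *new_vaddr)
	{
		struct vfio_iommu_type1_dma_map map = {
			.argsz = sizeof(map),
			.flags = VFIO_DMA_MAP_FLAG_VADDR |
				 VFIO_DMA_MAP_FLAG_READ |
				 VFIO_DMA_MAP_FLAG_WRITE,
			.vaddr = (uintptr_t)new_vaddr,
			.iova  = iova,
			.size  = len,
		};

		/* With this patch, the dma's locked_vm charge moves
		 * from the old mm to the caller's mm, subject to
		 * RLIMIT_MEMLOCK, so ENOMEM means over the limit. */
		if (ioctl(container_fd, VFIO_IOMMU_MAP_DMA, &map)) {
			perror("VFIO_IOMMU_MAP_DMA");
			return -1;
		}
		return 0;
	}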
Fixes: c3cbab24db38 ("vfio/type1: implement interfaces to update vaddr") Cc: stable@vger.kernel.org Signed-off-by: Steve Sistare Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/1675184289-267876-5-git-send-email-steven.sistare@oracle.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio_iommu_type1.c | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) (limited to 'drivers/vfio') diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 372d91b6a2f6..b571cab5c9c7 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -1591,6 +1591,38 @@ static bool vfio_iommu_iova_dma_valid(struct vfio_iommu *iommu, return list_empty(iova); } +static int vfio_change_dma_owner(struct vfio_dma *dma) +{ + struct task_struct *task = current->group_leader; + struct mm_struct *mm = current->mm; + long npage = dma->locked_vm; + bool lock_cap; + int ret; + + if (mm == dma->mm) + return 0; + + lock_cap = capable(CAP_IPC_LOCK); + ret = mm_lock_acct(task, mm, lock_cap, npage); + if (ret) + return ret; + + if (mmget_not_zero(dma->mm)) { + mm_lock_acct(dma->task, dma->mm, dma->lock_cap, -npage); + mmput(dma->mm); + } + + if (dma->task != task) { + put_task_struct(dma->task); + dma->task = get_task_struct(task); + } + mmdrop(dma->mm); + dma->mm = mm; + mmgrab(dma->mm); + dma->lock_cap = lock_cap; + return 0; +} + static int vfio_dma_do_map(struct vfio_iommu *iommu, struct vfio_iommu_type1_dma_map *map) { @@ -1640,6 +1672,9 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu, dma->size != size) { ret = -EINVAL; } else { + ret = vfio_change_dma_owner(dma); + if (ret) + goto out_unlock; dma->vaddr = vaddr; dma->vaddr_invalid = false; iommu->vaddr_invalid_count--; -- cgit v1.2.3 From da4f1c2e1c9cb2832b5a4de3472cd1cad63781b2 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Tue, 31 Jan 2023 08:58:07 -0800 Subject: vfio/type1: revert "block on invalid vaddr" Revert this dead code: commit 898b9eaeb3fe ("vfio/type1: block on invalid vaddr") Signed-off-by: Steve Sistare Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/1675184289-267876-6-git-send-email-steven.sistare@oracle.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio_iommu_type1.c | 94 +++-------------------------------------- 1 file changed, 5 insertions(+), 89 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index b571cab5c9c7..fb29106d09c7 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -72,7 +72,6 @@ struct vfio_iommu { unsigned int vaddr_invalid_count; uint64_t pgsize_bitmap; uint64_t num_non_pinned_groups; - wait_queue_head_t vaddr_wait; bool v2; bool nesting; bool dirty_page_tracking; @@ -154,8 +153,6 @@ struct vfio_regions { #define DIRTY_BITMAP_PAGES_MAX ((u64)INT_MAX) #define DIRTY_BITMAP_SIZE_MAX DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX) -#define WAITED 1 - static int put_pfn(unsigned long pfn, int prot); static struct vfio_iommu_group* @@ -607,61 +604,6 @@ done: return ret; } -static int vfio_wait(struct vfio_iommu *iommu) -{ - DEFINE_WAIT(wait); - - prepare_to_wait(&iommu->vaddr_wait, &wait, TASK_KILLABLE); - mutex_unlock(&iommu->lock); - schedule(); - mutex_lock(&iommu->lock); - finish_wait(&iommu->vaddr_wait, &wait); - if (kthread_should_stop() || !iommu->container_open || - fatal_signal_pending(current)) { - return -EFAULT; - } - return WAITED; -} - -/* - * Find dma struct and wait for 
its vaddr to be valid. iommu lock is dropped - * if the task waits, but is re-locked on return. Return result in *dma_p. - * Return 0 on success with no waiting, WAITED on success if waited, and -errno - * on error. - */ -static int vfio_find_dma_valid(struct vfio_iommu *iommu, dma_addr_t start, - size_t size, struct vfio_dma **dma_p) -{ - int ret = 0; - - do { - *dma_p = vfio_find_dma(iommu, start, size); - if (!*dma_p) - return -EINVAL; - else if (!(*dma_p)->vaddr_invalid) - return ret; - else - ret = vfio_wait(iommu); - } while (ret == WAITED); - - return ret; -} - -/* - * Wait for all vaddr in the dma_list to become valid. iommu lock is dropped - * if the task waits, but is re-locked on return. Return 0 on success with no - * waiting, WAITED on success if waited, and -errno on error. - */ -static int vfio_wait_all_valid(struct vfio_iommu *iommu) -{ - int ret = 0; - - while (iommu->vaddr_invalid_count && ret >= 0) - ret = vfio_wait(iommu); - - return ret; -} - /* * Attempt to pin pages. We really don't want to track all the pfns and * the iommu can only map chunks of consecutive pfns anyway, so get the @@ -862,7 +804,6 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data, unsigned long remote_vaddr; struct vfio_dma *dma; bool do_accounting; - dma_addr_t iova; if (!iommu || !pages) return -EINVAL; @@ -879,22 +820,6 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data, goto pin_done; } - /* - * Wait for all necessary vaddr's to be valid so they can be used in - * the main loop without dropping the lock, to avoid racing vs unmap. - */ -again: - if (iommu->vaddr_invalid_count) { - for (i = 0; i < npage; i++) { - iova = user_iova + PAGE_SIZE * i; - ret = vfio_find_dma_valid(iommu, iova, PAGE_SIZE, &dma); - if (ret < 0) - goto pin_done; - if (ret == WAITED) - goto again; - } - } - /* Fail if no dma_umap notifier is registered */ if (list_empty(&iommu->device_list)) { ret = -EINVAL; @@ -910,6 +835,7 @@ again: for (i = 0; i < npage; i++) { unsigned long phys_pfn; + dma_addr_t iova; struct vfio_pfn *vpfn; iova = user_iova + PAGE_SIZE * i; @@ -1194,10 +1120,8 @@ static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma) put_task_struct(dma->task); mmdrop(dma->mm); vfio_dma_bitmap_free(dma); - if (dma->vaddr_invalid) { + if (dma->vaddr_invalid) iommu->vaddr_invalid_count--; - wake_up_all(&iommu->vaddr_wait); - } kfree(dma); iommu->dma_avail++; } @@ -1678,7 +1602,6 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu, dma->vaddr = vaddr; dma->vaddr_invalid = false; iommu->vaddr_invalid_count--; - wake_up_all(&iommu->vaddr_wait); } goto out_unlock; } else if (dma) { @@ -1753,10 +1676,6 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu, unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; int ret; - ret = vfio_wait_all_valid(iommu); - if (ret < 0) - return ret; - /* Arbitrarily pick the first domain in the list for lookups */ if (!list_empty(&iommu->domain_list)) d = list_first_entry(&iommu->domain_list, @@ -2647,7 +2566,6 @@ static void *vfio_iommu_type1_open(unsigned long arg) mutex_init(&iommu->lock); mutex_init(&iommu->device_list_lock); INIT_LIST_HEAD(&iommu->device_list); - init_waitqueue_head(&iommu->vaddr_wait); iommu->pgsize_bitmap = PAGE_MASK; INIT_LIST_HEAD(&iommu->emulated_iommu_groups); @@ -3144,13 +3062,12 @@ static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu, struct vfio_dma *dma; bool kthread = current->mm == NULL; size_t offset; - int ret; *copied = 0; - ret = vfio_find_dma_valid(iommu, user_iova, 1, &dma); - if (ret < 0) - 
return ret; + dma = vfio_find_dma(iommu, user_iova, 1); + if (!dma) + return -EINVAL; if ((write && !(dma->prot & IOMMU_WRITE)) || !(dma->prot & IOMMU_READ)) @@ -3259,7 +3176,6 @@ static void vfio_iommu_type1_notify(void *iommu_data, mutex_lock(&iommu->lock); iommu->container_open = false; mutex_unlock(&iommu->lock); - wake_up_all(&iommu->vaddr_wait); } static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = { -- cgit v1.2.3 From a5ac1f816563503eb25739ba8cc3d74548169f09 Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Tue, 31 Jan 2023 08:58:08 -0800 Subject: vfio/type1: revert "implement notify callback" This is dead code. Revert it. commit 487ace134053 ("vfio/type1: implement notify callback") Signed-off-by: Steve Sistare Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/1675184289-267876-7-git-send-email-steven.sistare@oracle.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio_iommu_type1.c | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index fb29106d09c7..29616f20c778 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -75,7 +75,6 @@ struct vfio_iommu { bool v2; bool nesting; bool dirty_page_tracking; - bool container_open; struct list_head emulated_iommu_groups; }; @@ -2562,7 +2561,6 @@ static void *vfio_iommu_type1_open(unsigned long arg) INIT_LIST_HEAD(&iommu->iova_list); iommu->dma_list = RB_ROOT; iommu->dma_avail = dma_entry_limit; - iommu->container_open = true; mutex_init(&iommu->lock); mutex_init(&iommu->device_list_lock); INIT_LIST_HEAD(&iommu->device_list); @@ -3166,18 +3164,6 @@ vfio_iommu_type1_group_iommu_domain(void *iommu_data, return domain; } -static void vfio_iommu_type1_notify(void *iommu_data, - enum vfio_iommu_notify_type event) -{ - struct vfio_iommu *iommu = iommu_data; - - if (event != VFIO_IOMMU_CONTAINER_CLOSE) - return; - mutex_lock(&iommu->lock); - iommu->container_open = false; - mutex_unlock(&iommu->lock); -} - static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = { .name = "vfio-iommu-type1", .owner = THIS_MODULE, @@ -3192,7 +3178,6 @@ static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = { .unregister_device = vfio_iommu_type1_unregister_device, .dma_rw = vfio_iommu_type1_dma_rw, .group_iommu_domain = vfio_iommu_type1_group_iommu_domain, - .notify = vfio_iommu_type1_notify, }; static int __init vfio_iommu_type1_init(void) -- cgit v1.2.3 From e592296cd6e15ddeebe4c8411365c550da65c8bf Mon Sep 17 00:00:00 2001 From: Steve Sistare Date: Tue, 31 Jan 2023 08:58:09 -0800 Subject: vfio: revert "iommu driver notify callback" Revert this dead code: commit ec5e32940cc9 ("vfio: iommu driver notify callback") Signed-off-by: Steve Sistare Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/1675184289-267876-8-git-send-email-steven.sistare@oracle.com Signed-off-by: Alex Williamson --- drivers/vfio/container.c | 5 ----- drivers/vfio/vfio.h | 7 ------- 2 files changed, 12 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/container.c b/drivers/vfio/container.c index 5f398c493a1b..95fd51817f2b 100644 --- a/drivers/vfio/container.c +++ b/drivers/vfio/container.c @@ -383,11 +383,6 @@ static int vfio_fops_open(struct inode *inode, struct file *filep) static int vfio_fops_release(struct inode *inode, struct file *filep) { struct vfio_container *container = filep->private_data; - struct 
vfio_iommu_driver *driver = container->iommu_driver; - - if (driver && driver->ops->notify) - driver->ops->notify(container->iommu_data, - VFIO_IOMMU_CONTAINER_CLOSE); filep->private_data = NULL; diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h index f8219a438bfb..d5fa896b5a85 100644 --- a/drivers/vfio/vfio.h +++ b/drivers/vfio/vfio.h @@ -89,11 +89,6 @@ int __init vfio_group_init(void); void vfio_group_cleanup(void); #if IS_ENABLED(CONFIG_VFIO_CONTAINER) -/* events for the backend driver notify callback */ -enum vfio_iommu_notify_type { - VFIO_IOMMU_CONTAINER_CLOSE = 0, -}; - /** * struct vfio_iommu_driver_ops - VFIO IOMMU driver callbacks */ @@ -124,8 +119,6 @@ struct vfio_iommu_driver_ops { void *data, size_t count, bool write); struct iommu_domain *(*group_iommu_domain)(void *iommu_data, struct iommu_group *group); - void (*notify)(void *iommu_data, - enum vfio_iommu_notify_type event); }; struct vfio_iommu_driver { -- cgit v1.2.3 From 2b48f52f2bff8e8926165983f3a3d7b89b33de08 Mon Sep 17 00:00:00 2001 From: Matthew Rosato Date: Fri, 3 Feb 2023 16:50:26 -0500 Subject: vfio: fix deadlock between group lock and kvm lock After 51cdc8bc120e, we have another deadlock scenario between the kvm->lock and the vfio group_lock with two different codepaths acquiring the locks in different order. Specifically in vfio_open_device, vfio holds the vfio group_lock when issuing device->ops->open_device but some drivers (like vfio-ap) need to acquire kvm->lock during their open_device routine; Meanwhile, kvm_vfio_release will acquire the kvm->lock first before calling vfio_file_set_kvm which will acquire the vfio group_lock. To resolve this, let's remove the need for the vfio group_lock from the kvm_vfio_release codepath. This is done by introducing a new spinlock to protect modifications to the vfio group kvm pointer, and acquiring a kvm ref from within vfio while holding this spinlock, with the reference held until the last close for the device in question. Fixes: 51cdc8bc120e ("kvm/vfio: Fix potential deadlock on vfio group_lock") Reported-by: Anthony Krowiak Suggested-by: Jason Gunthorpe Signed-off-by: Matthew Rosato Tested-by: Tony Krowiak Reviewed-by: Kevin Tian Reviewed-by: Yi Liu Link: https://lore.kernel.org/r/20230203215027.151988-2-mjrosato@linux.ibm.com Signed-off-by: Alex Williamson --- drivers/vfio/group.c | 44 +++++++++++++++++++++++++++------ drivers/vfio/vfio.h | 15 ++++++++++++ drivers/vfio/vfio_main.c | 63 ++++++++++++++++++++++++++++++++++++++++++------ include/linux/vfio.h | 2 +- 4 files changed, 109 insertions(+), 15 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/group.c b/drivers/vfio/group.c index bb24b2f0271e..98621ac082f0 100644 --- a/drivers/vfio/group.c +++ b/drivers/vfio/group.c @@ -154,6 +154,18 @@ out_unlock: return ret; } +static void vfio_device_group_get_kvm_safe(struct vfio_device *device) +{ + spin_lock(&device->group->kvm_ref_lock); + if (!device->group->kvm) + goto unlock; + + _vfio_device_get_kvm_safe(device, device->group->kvm); + +unlock: + spin_unlock(&device->group->kvm_ref_lock); +} + static int vfio_device_group_open(struct vfio_device *device) { int ret; @@ -164,13 +176,23 @@ static int vfio_device_group_open(struct vfio_device *device) goto out_unlock; } + mutex_lock(&device->dev_set->lock); + /* - * Here we pass the KVM pointer with the group under the lock. If the - * device driver will use it, it must obtain a reference and release it - * during close_device. 
+ * Before the first device open, get the KVM pointer currently + * associated with the group (if there is one) and obtain a reference + * now that will be held until the open_count reaches 0 again. Save + * the pointer in the device for use by drivers. */ - ret = vfio_device_open(device, device->group->iommufd, - device->group->kvm); + if (device->open_count == 0) + vfio_device_group_get_kvm_safe(device); + + ret = vfio_device_open(device, device->group->iommufd, device->kvm); + + if (device->open_count == 0) + vfio_device_put_kvm(device); + + mutex_unlock(&device->dev_set->lock); out_unlock: mutex_unlock(&device->group->group_lock); @@ -180,7 +202,14 @@ out_unlock: void vfio_device_group_close(struct vfio_device *device) { mutex_lock(&device->group->group_lock); + mutex_lock(&device->dev_set->lock); + vfio_device_close(device, device->group->iommufd); + + if (device->open_count == 0) + vfio_device_put_kvm(device); + + mutex_unlock(&device->dev_set->lock); mutex_unlock(&device->group->group_lock); } @@ -450,6 +479,7 @@ static struct vfio_group *vfio_group_alloc(struct iommu_group *iommu_group, refcount_set(&group->drivers, 1); mutex_init(&group->group_lock); + spin_lock_init(&group->kvm_ref_lock); INIT_LIST_HEAD(&group->device_list); mutex_init(&group->device_lock); group->iommu_group = iommu_group; @@ -803,9 +833,9 @@ void vfio_file_set_kvm(struct file *file, struct kvm *kvm) if (!vfio_file_is_group(file)) return; - mutex_lock(&group->group_lock); + spin_lock(&group->kvm_ref_lock); group->kvm = kvm; - mutex_unlock(&group->group_lock); + spin_unlock(&group->kvm_ref_lock); } EXPORT_SYMBOL_GPL(vfio_file_set_kvm); diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h index d5fa896b5a85..8ec3a5db23f5 100644 --- a/drivers/vfio/vfio.h +++ b/drivers/vfio/vfio.h @@ -74,6 +74,7 @@ struct vfio_group { struct file *opened_file; struct blocking_notifier_head notifier; struct iommufd_ctx *iommufd; + spinlock_t kvm_ref_lock; }; int vfio_device_set_group(struct vfio_device *device, @@ -244,4 +245,18 @@ extern bool vfio_noiommu __read_mostly; enum { vfio_noiommu = false }; #endif +#ifdef CONFIG_HAVE_KVM +void _vfio_device_get_kvm_safe(struct vfio_device *device, struct kvm *kvm); +void vfio_device_put_kvm(struct vfio_device *device); +#else +static inline void _vfio_device_get_kvm_safe(struct vfio_device *device, + struct kvm *kvm) +{ +} + +static inline void vfio_device_put_kvm(struct vfio_device *device) +{ +} +#endif + #endif diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 5177bb061b17..28c47cd6a6b5 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -16,6 +16,9 @@ #include #include #include +#ifdef CONFIG_HAVE_KVM +#include +#endif #include #include #include @@ -338,6 +341,55 @@ void vfio_unregister_group_dev(struct vfio_device *device) } EXPORT_SYMBOL_GPL(vfio_unregister_group_dev); +#ifdef CONFIG_HAVE_KVM +void _vfio_device_get_kvm_safe(struct vfio_device *device, struct kvm *kvm) +{ + void (*pfn)(struct kvm *kvm); + bool (*fn)(struct kvm *kvm); + bool ret; + + lockdep_assert_held(&device->dev_set->lock); + + pfn = symbol_get(kvm_put_kvm); + if (WARN_ON(!pfn)) + return; + + fn = symbol_get(kvm_get_kvm_safe); + if (WARN_ON(!fn)) { + symbol_put(kvm_put_kvm); + return; + } + + ret = fn(kvm); + symbol_put(kvm_get_kvm_safe); + if (!ret) { + symbol_put(kvm_put_kvm); + return; + } + + device->put_kvm = pfn; + device->kvm = kvm; +} + +void vfio_device_put_kvm(struct vfio_device *device) +{ + lockdep_assert_held(&device->dev_set->lock); + + if (!device->kvm) + 
return; + + if (WARN_ON(!device->put_kvm)) + goto clear; + + device->put_kvm(device->kvm); + device->put_kvm = NULL; + symbol_put(kvm_put_kvm); + +clear: + device->kvm = NULL; +} +#endif + /* true if the vfio_device has open_device() called but not close_device() */ static bool vfio_assert_device_open(struct vfio_device *device) { @@ -361,7 +413,6 @@ static int vfio_device_first_open(struct vfio_device *device, if (ret) goto err_module_put; - device->kvm = kvm; if (device->ops->open_device) { ret = device->ops->open_device(device); if (ret) @@ -370,7 +421,6 @@ static int vfio_device_first_open(struct vfio_device *device, return 0; err_unuse_iommu: - device->kvm = NULL; if (iommufd) vfio_iommufd_unbind(device); else @@ -387,7 +437,6 @@ static void vfio_device_last_close(struct vfio_device *device, if (device->ops->close_device) device->ops->close_device(device); - device->kvm = NULL; if (iommufd) vfio_iommufd_unbind(device); else @@ -400,14 +449,14 @@ int vfio_device_open(struct vfio_device *device, { int ret = 0; - mutex_lock(&device->dev_set->lock); + lockdep_assert_held(&device->dev_set->lock); + device->open_count++; if (device->open_count == 1) { ret = vfio_device_first_open(device, iommufd, kvm); if (ret) device->open_count--; } - mutex_unlock(&device->dev_set->lock); return ret; } @@ -415,12 +464,12 @@ int vfio_device_open(struct vfio_device *device, void vfio_device_close(struct vfio_device *device, struct iommufd_ctx *iommufd) { - mutex_lock(&device->dev_set->lock); + lockdep_assert_held(&device->dev_set->lock); + vfio_assert_device_open(device); if (device->open_count == 1) vfio_device_last_close(device, iommufd); device->open_count--; - mutex_unlock(&device->dev_set->lock); } /* diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 35be78e9ae57..87ff862ff555 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -46,7 +46,6 @@ struct vfio_device { struct vfio_device_set *dev_set; struct list_head dev_set_list; unsigned int migration_flags; - /* Driver must reference the kvm during open_device or never touch it */ struct kvm *kvm; /* Members below here are private, not for driver use */ @@ -58,6 +57,7 @@ struct vfio_device { struct list_head group_next; struct list_head iommu_entry; struct iommufd_access *iommufd_access; + void (*put_kvm)(struct kvm *kvm); #if IS_ENABLED(CONFIG_IOMMUFD) struct iommufd_device *iommufd_device; struct iommufd_ctx *iommufd_ictx; -- cgit v1.2.3 From b0d2d5697e4c12589ef536b2a824f6549fdb2c01 Mon Sep 17 00:00:00 2001 From: Matthew Rosato Date: Fri, 3 Feb 2023 16:50:27 -0500 Subject: vfio: no need to pass kvm pointer during device open Nothing uses this value during vfio_device_open anymore so it's safe to remove it. 
Signed-off-by: Matthew Rosato Tested-by: Tony Krowiak Reviewed-by: Kevin Tian Reviewed-by: Yi Liu Link: https://lore.kernel.org/r/20230203215027.151988-3-mjrosato@linux.ibm.com Signed-off-by: Alex Williamson --- drivers/vfio/group.c | 2 +- drivers/vfio/vfio.h | 3 +-- drivers/vfio/vfio_main.c | 7 +++---- 3 files changed, 5 insertions(+), 7 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/group.c b/drivers/vfio/group.c index 98621ac082f0..0e9036e2b9c4 100644 --- a/drivers/vfio/group.c +++ b/drivers/vfio/group.c @@ -187,7 +187,7 @@ static int vfio_device_group_open(struct vfio_device *device) if (device->open_count == 0) vfio_device_group_get_kvm_safe(device); - ret = vfio_device_open(device, device->group->iommufd, device->kvm); + ret = vfio_device_open(device, device->group->iommufd); if (device->open_count == 0) vfio_device_put_kvm(device); diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h index 8ec3a5db23f5..e9721d8424bc 100644 --- a/drivers/vfio/vfio.h +++ b/drivers/vfio/vfio.h @@ -18,8 +18,7 @@ struct vfio_container; void vfio_device_put_registration(struct vfio_device *device); bool vfio_device_try_get_registration(struct vfio_device *device); -int vfio_device_open(struct vfio_device *device, - struct iommufd_ctx *iommufd, struct kvm *kvm); +int vfio_device_open(struct vfio_device *device, struct iommufd_ctx *iommufd); void vfio_device_close(struct vfio_device *device, struct iommufd_ctx *iommufd); diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 28c47cd6a6b5..3a597e799918 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -397,7 +397,7 @@ static bool vfio_assert_device_open(struct vfio_device *device) } static int vfio_device_first_open(struct vfio_device *device, - struct iommufd_ctx *iommufd, struct kvm *kvm) + struct iommufd_ctx *iommufd) { int ret; @@ -444,8 +444,7 @@ static void vfio_device_last_close(struct vfio_device *device, module_put(device->dev->driver->owner); } -int vfio_device_open(struct vfio_device *device, - struct iommufd_ctx *iommufd, struct kvm *kvm) +int vfio_device_open(struct vfio_device *device, struct iommufd_ctx *iommufd) { int ret = 0; @@ -453,7 +452,7 @@ int vfio_device_open(struct vfio_device *device, device->open_count++; if (device->open_count == 1) { - ret = vfio_device_first_open(device, iommufd, kvm); + ret = vfio_device_first_open(device, iommufd); if (ret) device->open_count--; } -- cgit v1.2.3 From ce06a7000f0e9ad1ea5f55129ed964a7888d6e1c Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Wed, 8 Feb 2023 17:22:34 +0200 Subject: vfio/mlx5: Fix range size calculation upon tracker creation Fix range size calculation to include the last byte of each range. In addition, log round up the length of the total ranges to be stricter. 
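A worked example of both corrections, with values invented for this
note: an inclusive range covering bytes 0x0 through 0xfff has
node->last - node->start == 0xfff, one byte short of its true 0x1000
length, and for a total length of 0x1800 a plain ilog2() returns 12
where the tracker really needs 2^13 of address space. A standalone
sketch (uses the GCC/Clang __builtin_clzll intrinsic):

	#include <stdint.h>
	#include <stdio.h>

	static uint32_t ilog2_u64(uint64_t v)	/* floor(log2(v)), v > 0 */
	{
		return 63 - __builtin_clzll(v);
	}

	static uint64_t roundup_pow2_u64(uint64_t v)	/* v > 1 */
	{
		return 1ULL << (ilog2_u64(v - 1) + 1);
	}

	int main(void)
	{
		uint64_t start = 0x0, last = 0xfff; /* inclusive bounds */
		uint64_t total = 0x1800;	/* not a power of two */

		/* old: 0xfff (one byte short); fixed: 0x1000 */
		printf("range length: old=%#jx fixed=%#jx\n",
		       (uintmax_t)(last - start),
		       (uintmax_t)(last - start + 1));

		/* old: 12 (too small); fixed: 13 */
		printf("log_addr_space_size: old=%u fixed=%u\n",
		       ilog2_u64(total),
		       ilog2_u64(roundup_pow2_u64(total)));
		return 0;
	}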
Fixes: c1d050b0d169 ("vfio/mlx5: Create and destroy page tracker object") Signed-off-by: Yishai Hadas Link: https://lore.kernel.org/r/20230208152234.32370-1-yishaih@nvidia.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/mlx5/cmd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index 5161d845c478..deed156e6165 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -830,7 +830,7 @@ static int mlx5vf_create_tracker(struct mlx5_core_dev *mdev, node = interval_tree_iter_first(ranges, 0, ULONG_MAX); for (i = 0; i < num_ranges; i++) { void *addr_range_i_base = range_list_ptr + record_size * i; - unsigned long length = node->last - node->start; + unsigned long length = node->last - node->start + 1; MLX5_SET64(page_track_range, addr_range_i_base, start_address, node->start); @@ -840,7 +840,7 @@ static int mlx5vf_create_tracker(struct mlx5_core_dev *mdev, } WARN_ON(node); - log_addr_space_size = ilog2(total_ranges_len); + log_addr_space_size = ilog2(roundup_pow_of_two(total_ranges_len)); if (log_addr_space_size < (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_min_addr_space)) || log_addr_space_size > -- cgit v1.2.3 From d649c34cb916b015fdcb487e51409fcc5caeca8d Mon Sep 17 00:00:00 2001 From: Yan Zhao Date: Wed, 22 Feb 2023 15:49:38 +0800 Subject: vfio: Fix NULL pointer dereference caused by uninitialized group->iommufd group->iommufd is not initialized for the iommufd_ctx_put() [20018.331541] BUG: kernel NULL pointer dereference, address: 0000000000000000 [20018.377508] RIP: 0010:iommufd_ctx_put+0x5/0x10 [iommufd] ... [20018.476483] Call Trace: [20018.479214] [20018.481555] vfio_group_fops_unl_ioctl+0x506/0x690 [vfio] [20018.487586] __x64_sys_ioctl+0x6a/0xb0 [20018.491773] ? trace_hardirqs_on+0xc5/0xe0 [20018.496347] do_syscall_64+0x67/0x90 [20018.500340] entry_SYSCALL_64_after_hwframe+0x4b/0xb5 Fixes: 9eefba8002c2 ("vfio: Move vfio group specific code into group.c") Cc: stable@vger.kernel.org Signed-off-by: Yan Zhao Reviewed-by: Jason Gunthorpe Reviewed-by: Yi Liu Link: https://lore.kernel.org/r/20230222074938.13681-1-yan.y.zhao@intel.com Signed-off-by: Alex Williamson --- drivers/vfio/group.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/group.c b/drivers/vfio/group.c index 0e9036e2b9c4..160deff6649b 100644 --- a/drivers/vfio/group.c +++ b/drivers/vfio/group.c @@ -137,7 +137,7 @@ static int vfio_group_ioctl_set_container(struct vfio_group *group, ret = iommufd_vfio_compat_ioas_id(iommufd, &ioas_id); if (ret) { - iommufd_ctx_put(group->iommufd); + iommufd_ctx_put(iommufd); goto out_unlock; } -- cgit v1.2.3
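The last fix above is an instance of a general error-path rule; a
sketch with invented names (not the vfio code itself):

	/*
	 * Acquire-then-publish: on failure, release through the local
	 * handle just acquired; the struct field is only assigned once
	 * every intermediate step has succeeded, so it may still hold
	 * garbage (or NULL) on the error path.
	 */
	ctx = acquire_ctx();
	ret = setup_compat(ctx, &id);
	if (ret) {
		put_ctx(ctx);		/* not group->ctx */
		goto out_unlock;
	}
	group->ctx = ctx;		/* publish only on success */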