diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2021-09-12 00:48:42 +0300 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2021-09-12 00:48:42 +0300 |
commit | 78e709522d2c012cb0daad2e668506637bffb7c2 (patch) | |
tree | 899d892238891f4f2ca1dee3657cb69694c0ca34 /drivers/vdpa | |
parent | b79bd0d5102b4a3ea908018fda6b84a4c8fd6235 (diff) | |
parent | 7bc7f61897b66bef78bb5952e3d1e9f3aaf9ccca (diff) | |
download | linux-78e709522d2c012cb0daad2e668506637bffb7c2.tar.xz |
Merge tag 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost
Pull virtio updates from Michael Tsirkin:
- vduse driver ("vDPA Device in Userspace") supporting emulated virtio
block devices
- virtio-vsock support for end of record with SEQPACKET
- vdpa: mac and mq support for ifcvf and mlx5
- vdpa: management netlink for ifcvf
- virtio-i2c, gpio dt bindings
- misc fixes and cleanups
* tag 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost: (39 commits)
Documentation: Add documentation for VDUSE
vduse: Introduce VDUSE - vDPA Device in Userspace
vduse: Implement an MMU-based software IOTLB
vdpa: Support transferring virtual addressing during DMA mapping
vdpa: factor out vhost_vdpa_pa_map() and vhost_vdpa_pa_unmap()
vdpa: Add an opaque pointer for vdpa_config_ops.dma_map()
vhost-iotlb: Add an opaque pointer for vhost IOTLB
vhost-vdpa: Handle the failure of vdpa_reset()
vdpa: Add reset callback in vdpa_config_ops
vdpa: Fix some coding style issues
file: Export receive_fd() to modules
eventfd: Export eventfd_wake_count to modules
iova: Export alloc_iova_fast() and free_iova_fast()
virtio-blk: remove unneeded "likely" statements
virtio-balloon: Use virtio_find_vqs() helper
vdpa: Make use of PFN_PHYS/PFN_UP/PFN_DOWN helper macro
vsock_test: update message bounds test for MSG_EOR
af_vsock: rename variables in receive loop
virtio/vsock: support MSG_EOR bit processing
vhost/vsock: support MSG_EOR bit processing
...
Diffstat (limited to 'drivers/vdpa')
-rw-r--r-- | drivers/vdpa/Kconfig | 11 | ||||
-rw-r--r-- | drivers/vdpa/Makefile | 1 | ||||
-rw-r--r-- | drivers/vdpa/ifcvf/ifcvf_base.c | 8 | ||||
-rw-r--r-- | drivers/vdpa/ifcvf/ifcvf_base.h | 25 | ||||
-rw-r--r-- | drivers/vdpa/ifcvf/ifcvf_main.c | 249 | ||||
-rw-r--r-- | drivers/vdpa/mlx5/core/mlx5_vdpa.h | 26 | ||||
-rw-r--r-- | drivers/vdpa/mlx5/core/mr.c | 81 | ||||
-rw-r--r-- | drivers/vdpa/mlx5/core/resources.c | 35 | ||||
-rw-r--r-- | drivers/vdpa/mlx5/net/mlx5_vnet.c | 555 | ||||
-rw-r--r-- | drivers/vdpa/vdpa.c | 9 | ||||
-rw-r--r-- | drivers/vdpa/vdpa_sim/vdpa_sim.c | 29 | ||||
-rw-r--r-- | drivers/vdpa/vdpa_user/Makefile | 5 | ||||
-rw-r--r-- | drivers/vdpa/vdpa_user/iova_domain.c | 545 | ||||
-rw-r--r-- | drivers/vdpa/vdpa_user/iova_domain.h | 73 | ||||
-rw-r--r-- | drivers/vdpa/vdpa_user/vduse_dev.c | 1641 | ||||
-rw-r--r-- | drivers/vdpa/virtio_pci/vp_vdpa.c | 17 |
16 files changed, 3107 insertions, 203 deletions
diff --git a/drivers/vdpa/Kconfig b/drivers/vdpa/Kconfig index a503c1b2bfd9..3d91982d8371 100644 --- a/drivers/vdpa/Kconfig +++ b/drivers/vdpa/Kconfig @@ -33,6 +33,16 @@ config VDPA_SIM_BLOCK vDPA block device simulator which terminates IO request in a memory buffer. +config VDPA_USER + tristate "VDUSE (vDPA Device in Userspace) support" + depends on EVENTFD && MMU && HAS_DMA + select DMA_OPS + select VHOST_IOTLB + select IOMMU_IOVA + help + With VDUSE it is possible to emulate a vDPA Device + in a userspace program. + config IFCVF tristate "Intel IFC VF vDPA driver" depends on PCI_MSI @@ -53,6 +63,7 @@ config MLX5_VDPA config MLX5_VDPA_NET tristate "vDPA driver for ConnectX devices" select MLX5_VDPA + select VHOST_RING depends on MLX5_CORE help VDPA network driver for ConnectX6 and newer. Provides offloading diff --git a/drivers/vdpa/Makefile b/drivers/vdpa/Makefile index 67fe7f3d6943..f02ebed33f19 100644 --- a/drivers/vdpa/Makefile +++ b/drivers/vdpa/Makefile @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_VDPA) += vdpa.o obj-$(CONFIG_VDPA_SIM) += vdpa_sim/ +obj-$(CONFIG_VDPA_USER) += vdpa_user/ obj-$(CONFIG_IFCVF) += ifcvf/ obj-$(CONFIG_MLX5_VDPA) += mlx5/ obj-$(CONFIG_VP_VDPA) += virtio_pci/ diff --git a/drivers/vdpa/ifcvf/ifcvf_base.c b/drivers/vdpa/ifcvf/ifcvf_base.c index 6e197fe0fcf9..2808f1ba9f7b 100644 --- a/drivers/vdpa/ifcvf/ifcvf_base.c +++ b/drivers/vdpa/ifcvf/ifcvf_base.c @@ -158,7 +158,9 @@ next: return -EIO; } - for (i = 0; i < IFCVF_MAX_QUEUE_PAIRS * 2; i++) { + hw->nr_vring = ifc_ioread16(&hw->common_cfg->num_queues); + + for (i = 0; i < hw->nr_vring; i++) { ifc_iowrite16(i, &hw->common_cfg->queue_select); notify_off = ifc_ioread16(&hw->common_cfg->queue_notify_off); hw->vring[i].notify_addr = hw->notify_base + @@ -304,7 +306,7 @@ u16 ifcvf_get_vq_state(struct ifcvf_hw *hw, u16 qid) u32 q_pair_id; ifcvf_lm = (struct ifcvf_lm_cfg __iomem *)hw->lm_cfg; - q_pair_id = qid / (IFCVF_MAX_QUEUE_PAIRS * 2); + q_pair_id = qid / 
hw->nr_vring; avail_idx_addr = &ifcvf_lm->vring_lm_cfg[q_pair_id].idx_addr[qid % 2]; last_avail_idx = ifc_ioread16(avail_idx_addr); @@ -318,7 +320,7 @@ int ifcvf_set_vq_state(struct ifcvf_hw *hw, u16 qid, u16 num) u32 q_pair_id; ifcvf_lm = (struct ifcvf_lm_cfg __iomem *)hw->lm_cfg; - q_pair_id = qid / (IFCVF_MAX_QUEUE_PAIRS * 2); + q_pair_id = qid / hw->nr_vring; avail_idx_addr = &ifcvf_lm->vring_lm_cfg[q_pair_id].idx_addr[qid % 2]; hw->vring[qid].last_avail_idx = num; ifc_iowrite16(num, avail_idx_addr); diff --git a/drivers/vdpa/ifcvf/ifcvf_base.h b/drivers/vdpa/ifcvf/ifcvf_base.h index 2996db0da490..09918af3ecf8 100644 --- a/drivers/vdpa/ifcvf/ifcvf_base.h +++ b/drivers/vdpa/ifcvf/ifcvf_base.h @@ -22,17 +22,8 @@ #define N3000_DEVICE_ID 0x1041 #define N3000_SUBSYS_DEVICE_ID 0x001A -#define IFCVF_NET_SUPPORTED_FEATURES \ - ((1ULL << VIRTIO_NET_F_MAC) | \ - (1ULL << VIRTIO_F_ANY_LAYOUT) | \ - (1ULL << VIRTIO_F_VERSION_1) | \ - (1ULL << VIRTIO_NET_F_STATUS) | \ - (1ULL << VIRTIO_F_ORDER_PLATFORM) | \ - (1ULL << VIRTIO_F_ACCESS_PLATFORM) | \ - (1ULL << VIRTIO_NET_F_MRG_RXBUF)) - -/* Only one queue pair for now. */ -#define IFCVF_MAX_QUEUE_PAIRS 1 +/* Max 8 data queue pairs(16 queues) and one control vq for now. 
*/ +#define IFCVF_MAX_QUEUES 17 #define IFCVF_QUEUE_ALIGNMENT PAGE_SIZE #define IFCVF_QUEUE_MAX 32768 @@ -51,8 +42,6 @@ #define ifcvf_private_to_vf(adapter) \ (&((struct ifcvf_adapter *)adapter)->vf) -#define IFCVF_MAX_INTR (IFCVF_MAX_QUEUE_PAIRS * 2 + 1) - struct vring_info { u64 desc; u64 avail; @@ -83,7 +72,7 @@ struct ifcvf_hw { u32 dev_type; struct virtio_pci_common_cfg __iomem *common_cfg; void __iomem *net_cfg; - struct vring_info vring[IFCVF_MAX_QUEUE_PAIRS * 2]; + struct vring_info vring[IFCVF_MAX_QUEUES]; void __iomem * const *base; char config_msix_name[256]; struct vdpa_callback config_cb; @@ -103,7 +92,13 @@ struct ifcvf_vring_lm_cfg { struct ifcvf_lm_cfg { u8 reserved[IFCVF_LM_RING_STATE_OFFSET]; - struct ifcvf_vring_lm_cfg vring_lm_cfg[IFCVF_MAX_QUEUE_PAIRS]; + struct ifcvf_vring_lm_cfg vring_lm_cfg[IFCVF_MAX_QUEUES]; +}; + +struct ifcvf_vdpa_mgmt_dev { + struct vdpa_mgmt_dev mdev; + struct ifcvf_adapter *adapter; + struct pci_dev *pdev; }; int ifcvf_init_hw(struct ifcvf_hw *hw, struct pci_dev *dev); diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c b/drivers/vdpa/ifcvf/ifcvf_main.c index 351c6cfb24c3..dcd648e1f7e7 100644 --- a/drivers/vdpa/ifcvf/ifcvf_main.c +++ b/drivers/vdpa/ifcvf/ifcvf_main.c @@ -63,9 +63,13 @@ static int ifcvf_request_irq(struct ifcvf_adapter *adapter) struct pci_dev *pdev = adapter->pdev; struct ifcvf_hw *vf = &adapter->vf; int vector, i, ret, irq; + u16 max_intr; - ret = pci_alloc_irq_vectors(pdev, IFCVF_MAX_INTR, - IFCVF_MAX_INTR, PCI_IRQ_MSIX); + /* all queues and config interrupt */ + max_intr = vf->nr_vring + 1; + + ret = pci_alloc_irq_vectors(pdev, max_intr, + max_intr, PCI_IRQ_MSIX); if (ret < 0) { IFCVF_ERR(pdev, "Failed to alloc IRQ vectors\n"); return ret; @@ -83,7 +87,7 @@ static int ifcvf_request_irq(struct ifcvf_adapter *adapter) return ret; } - for (i = 0; i < IFCVF_MAX_QUEUE_PAIRS * 2; i++) { + for (i = 0; i < vf->nr_vring; i++) { snprintf(vf->vring[i].msix_name, 256, "ifcvf[%s]-%d\n", pci_name(pdev), i); vector = i + 
IFCVF_MSI_QUEUE_OFF; @@ -112,7 +116,6 @@ static int ifcvf_start_datapath(void *private) u8 status; int ret; - vf->nr_vring = IFCVF_MAX_QUEUE_PAIRS * 2; ret = ifcvf_start_hw(vf); if (ret < 0) { status = ifcvf_get_status(vf); @@ -128,7 +131,7 @@ static int ifcvf_stop_datapath(void *private) struct ifcvf_hw *vf = ifcvf_private_to_vf(private); int i; - for (i = 0; i < IFCVF_MAX_QUEUE_PAIRS * 2; i++) + for (i = 0; i < vf->nr_vring; i++) vf->vring[i].cb.callback = NULL; ifcvf_stop_hw(vf); @@ -141,7 +144,7 @@ static void ifcvf_reset_vring(struct ifcvf_adapter *adapter) struct ifcvf_hw *vf = ifcvf_private_to_vf(adapter); int i; - for (i = 0; i < IFCVF_MAX_QUEUE_PAIRS * 2; i++) { + for (i = 0; i < vf->nr_vring; i++) { vf->vring[i].last_avail_idx = 0; vf->vring[i].desc = 0; vf->vring[i].avail = 0; @@ -171,17 +174,12 @@ static u64 ifcvf_vdpa_get_features(struct vdpa_device *vdpa_dev) struct ifcvf_adapter *adapter = vdpa_to_adapter(vdpa_dev); struct ifcvf_hw *vf = vdpa_to_vf(vdpa_dev); struct pci_dev *pdev = adapter->pdev; - + u32 type = vf->dev_type; u64 features; - switch (vf->dev_type) { - case VIRTIO_ID_NET: - features = ifcvf_get_features(vf) & IFCVF_NET_SUPPORTED_FEATURES; - break; - case VIRTIO_ID_BLOCK: + if (type == VIRTIO_ID_NET || type == VIRTIO_ID_BLOCK) features = ifcvf_get_features(vf); - break; - default: + else { features = 0; IFCVF_ERR(pdev, "VIRTIO ID %u not supported\n", vf->dev_type); } @@ -218,23 +216,12 @@ static void ifcvf_vdpa_set_status(struct vdpa_device *vdpa_dev, u8 status) int ret; vf = vdpa_to_vf(vdpa_dev); - adapter = dev_get_drvdata(vdpa_dev->dev.parent); + adapter = vdpa_to_adapter(vdpa_dev); status_old = ifcvf_get_status(vf); if (status_old == status) return; - if ((status_old & VIRTIO_CONFIG_S_DRIVER_OK) && - !(status & VIRTIO_CONFIG_S_DRIVER_OK)) { - ifcvf_stop_datapath(adapter); - ifcvf_free_irq(adapter, IFCVF_MAX_QUEUE_PAIRS * 2); - } - - if (status == 0) { - ifcvf_reset_vring(adapter); - return; - } - if ((status & 
VIRTIO_CONFIG_S_DRIVER_OK) && !(status_old & VIRTIO_CONFIG_S_DRIVER_OK)) { ret = ifcvf_request_irq(adapter); @@ -254,6 +241,29 @@ static void ifcvf_vdpa_set_status(struct vdpa_device *vdpa_dev, u8 status) ifcvf_set_status(vf, status); } +static int ifcvf_vdpa_reset(struct vdpa_device *vdpa_dev) +{ + struct ifcvf_adapter *adapter; + struct ifcvf_hw *vf; + u8 status_old; + + vf = vdpa_to_vf(vdpa_dev); + adapter = vdpa_to_adapter(vdpa_dev); + status_old = ifcvf_get_status(vf); + + if (status_old == 0) + return 0; + + if (status_old & VIRTIO_CONFIG_S_DRIVER_OK) { + ifcvf_stop_datapath(adapter); + ifcvf_free_irq(adapter, vf->nr_vring); + } + + ifcvf_reset_vring(adapter); + + return 0; +} + static u16 ifcvf_vdpa_get_vq_num_max(struct vdpa_device *vdpa_dev) { return IFCVF_QUEUE_MAX; @@ -437,6 +447,7 @@ static const struct vdpa_config_ops ifc_vdpa_ops = { .set_features = ifcvf_vdpa_set_features, .get_status = ifcvf_vdpa_get_status, .set_status = ifcvf_vdpa_set_status, + .reset = ifcvf_vdpa_reset, .get_vq_num_max = ifcvf_vdpa_get_vq_num_max, .get_vq_state = ifcvf_vdpa_get_vq_state, .set_vq_state = ifcvf_vdpa_set_vq_state, @@ -458,63 +469,63 @@ static const struct vdpa_config_ops ifc_vdpa_ops = { .get_vq_notification = ifcvf_get_vq_notification, }; -static int ifcvf_probe(struct pci_dev *pdev, const struct pci_device_id *id) +static struct virtio_device_id id_table_net[] = { + {VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID}, + {0}, +}; + +static struct virtio_device_id id_table_blk[] = { + {VIRTIO_ID_BLOCK, VIRTIO_DEV_ANY_ID}, + {0}, +}; + +static u32 get_dev_type(struct pci_dev *pdev) { - struct device *dev = &pdev->dev; - struct ifcvf_adapter *adapter; - struct ifcvf_hw *vf; - int ret, i; + u32 dev_type; - ret = pcim_enable_device(pdev); - if (ret) { - IFCVF_ERR(pdev, "Failed to enable device\n"); - return ret; - } + /* This drirver drives both modern virtio devices and transitional + * devices in modern mode. 
+ * vDPA requires feature bit VIRTIO_F_ACCESS_PLATFORM, + * so legacy devices and transitional devices in legacy + * mode will not work for vDPA, this driver will not + * drive devices with legacy interface. + */ - ret = pcim_iomap_regions(pdev, BIT(0) | BIT(2) | BIT(4), - IFCVF_DRIVER_NAME); - if (ret) { - IFCVF_ERR(pdev, "Failed to request MMIO region\n"); - return ret; - } + if (pdev->device < 0x1040) + dev_type = pdev->subsystem_device; + else + dev_type = pdev->device - 0x1040; - ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64)); - if (ret) { - IFCVF_ERR(pdev, "No usable DMA configuration\n"); - return ret; - } + return dev_type; +} - ret = devm_add_action_or_reset(dev, ifcvf_free_irq_vectors, pdev); - if (ret) { - IFCVF_ERR(pdev, - "Failed for adding devres for freeing irq vectors\n"); - return ret; - } +static int ifcvf_vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name) +{ + struct ifcvf_vdpa_mgmt_dev *ifcvf_mgmt_dev; + struct ifcvf_adapter *adapter; + struct pci_dev *pdev; + struct ifcvf_hw *vf; + struct device *dev; + int ret, i; + ifcvf_mgmt_dev = container_of(mdev, struct ifcvf_vdpa_mgmt_dev, mdev); + if (ifcvf_mgmt_dev->adapter) + return -EOPNOTSUPP; + + pdev = ifcvf_mgmt_dev->pdev; + dev = &pdev->dev; adapter = vdpa_alloc_device(struct ifcvf_adapter, vdpa, - dev, &ifc_vdpa_ops, NULL); + dev, &ifc_vdpa_ops, name, false); if (IS_ERR(adapter)) { IFCVF_ERR(pdev, "Failed to allocate vDPA structure"); return PTR_ERR(adapter); } - pci_set_master(pdev); - pci_set_drvdata(pdev, adapter); + ifcvf_mgmt_dev->adapter = adapter; + pci_set_drvdata(pdev, ifcvf_mgmt_dev); vf = &adapter->vf; - - /* This drirver drives both modern virtio devices and transitional - * devices in modern mode. - * vDPA requires feature bit VIRTIO_F_ACCESS_PLATFORM, - * so legacy devices and transitional devices in legacy - * mode will not work for vDPA, this driver will not - * drive devices with legacy interface. 
- */ - if (pdev->device < 0x1040) - vf->dev_type = pdev->subsystem_device; - else - vf->dev_type = pdev->device - 0x1040; - + vf->dev_type = get_dev_type(pdev); vf->base = pcim_iomap_table(pdev); adapter->pdev = pdev; @@ -526,14 +537,15 @@ static int ifcvf_probe(struct pci_dev *pdev, const struct pci_device_id *id) goto err; } - for (i = 0; i < IFCVF_MAX_QUEUE_PAIRS * 2; i++) + for (i = 0; i < vf->nr_vring; i++) vf->vring[i].irq = -EINVAL; vf->hw_features = ifcvf_get_hw_features(vf); - ret = vdpa_register_device(&adapter->vdpa, IFCVF_MAX_QUEUE_PAIRS * 2); + adapter->vdpa.mdev = &ifcvf_mgmt_dev->mdev; + ret = _vdpa_register_device(&adapter->vdpa, vf->nr_vring); if (ret) { - IFCVF_ERR(pdev, "Failed to register ifcvf to vdpa bus"); + IFCVF_ERR(pdev, "Failed to register to vDPA bus"); goto err; } @@ -544,11 +556,100 @@ err: return ret; } +static void ifcvf_vdpa_dev_del(struct vdpa_mgmt_dev *mdev, struct vdpa_device *dev) +{ + struct ifcvf_vdpa_mgmt_dev *ifcvf_mgmt_dev; + + ifcvf_mgmt_dev = container_of(mdev, struct ifcvf_vdpa_mgmt_dev, mdev); + _vdpa_unregister_device(dev); + ifcvf_mgmt_dev->adapter = NULL; +} + +static const struct vdpa_mgmtdev_ops ifcvf_vdpa_mgmt_dev_ops = { + .dev_add = ifcvf_vdpa_dev_add, + .dev_del = ifcvf_vdpa_dev_del +}; + +static int ifcvf_probe(struct pci_dev *pdev, const struct pci_device_id *id) +{ + struct ifcvf_vdpa_mgmt_dev *ifcvf_mgmt_dev; + struct device *dev = &pdev->dev; + u32 dev_type; + int ret; + + ifcvf_mgmt_dev = kzalloc(sizeof(struct ifcvf_vdpa_mgmt_dev), GFP_KERNEL); + if (!ifcvf_mgmt_dev) { + IFCVF_ERR(pdev, "Failed to alloc memory for the vDPA management device\n"); + return -ENOMEM; + } + + dev_type = get_dev_type(pdev); + switch (dev_type) { + case VIRTIO_ID_NET: + ifcvf_mgmt_dev->mdev.id_table = id_table_net; + break; + case VIRTIO_ID_BLOCK: + ifcvf_mgmt_dev->mdev.id_table = id_table_blk; + break; + default: + IFCVF_ERR(pdev, "VIRTIO ID %u not supported\n", dev_type); + ret = -EOPNOTSUPP; + goto err; + } + + 
ifcvf_mgmt_dev->mdev.ops = &ifcvf_vdpa_mgmt_dev_ops; + ifcvf_mgmt_dev->mdev.device = dev; + ifcvf_mgmt_dev->pdev = pdev; + + ret = pcim_enable_device(pdev); + if (ret) { + IFCVF_ERR(pdev, "Failed to enable device\n"); + goto err; + } + + ret = pcim_iomap_regions(pdev, BIT(0) | BIT(2) | BIT(4), + IFCVF_DRIVER_NAME); + if (ret) { + IFCVF_ERR(pdev, "Failed to request MMIO region\n"); + goto err; + } + + ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64)); + if (ret) { + IFCVF_ERR(pdev, "No usable DMA configuration\n"); + goto err; + } + + ret = devm_add_action_or_reset(dev, ifcvf_free_irq_vectors, pdev); + if (ret) { + IFCVF_ERR(pdev, + "Failed for adding devres for freeing irq vectors\n"); + goto err; + } + + pci_set_master(pdev); + + ret = vdpa_mgmtdev_register(&ifcvf_mgmt_dev->mdev); + if (ret) { + IFCVF_ERR(pdev, + "Failed to initialize the management interfaces\n"); + goto err; + } + + return 0; + +err: + kfree(ifcvf_mgmt_dev); + return ret; +} + static void ifcvf_remove(struct pci_dev *pdev) { - struct ifcvf_adapter *adapter = pci_get_drvdata(pdev); + struct ifcvf_vdpa_mgmt_dev *ifcvf_mgmt_dev; - vdpa_unregister_device(&adapter->vdpa); + ifcvf_mgmt_dev = pci_get_drvdata(pdev); + vdpa_mgmtdev_unregister(&ifcvf_mgmt_dev->mdev); + kfree(ifcvf_mgmt_dev); } static struct pci_device_id ifcvf_pci_ids[] = { diff --git a/drivers/vdpa/mlx5/core/mlx5_vdpa.h b/drivers/vdpa/mlx5/core/mlx5_vdpa.h index 0002b2136b48..01a848adf590 100644 --- a/drivers/vdpa/mlx5/core/mlx5_vdpa.h +++ b/drivers/vdpa/mlx5/core/mlx5_vdpa.h @@ -5,7 +5,7 @@ #define __MLX5_VDPA_H__ #include <linux/etherdevice.h> -#include <linux/if_vlan.h> +#include <linux/vringh.h> #include <linux/vdpa.h> #include <linux/mlx5/driver.h> @@ -48,6 +48,26 @@ struct mlx5_vdpa_resources { bool valid; }; +struct mlx5_control_vq { + struct vhost_iotlb *iotlb; + /* spinlock to synchronize iommu table */ + spinlock_t iommu_lock; + struct vringh vring; + bool ready; + u64 desc_addr; + u64 device_addr; + u64 driver_addr; + 
struct vdpa_callback event_cb; + struct vringh_kiov riov; + struct vringh_kiov wiov; + unsigned short head; +}; + +struct mlx5_ctrl_wq_ent { + struct work_struct work; + struct mlx5_vdpa_dev *mvdev; +}; + struct mlx5_vdpa_dev { struct vdpa_device vdev; struct mlx5_core_dev *mdev; @@ -57,9 +77,12 @@ struct mlx5_vdpa_dev { u64 actual_features; u8 status; u32 max_vqs; + u16 max_idx; u32 generation; struct mlx5_vdpa_mr mr; + struct mlx5_control_vq cvq; + struct workqueue_struct *wq; }; int mlx5_vdpa_alloc_pd(struct mlx5_vdpa_dev *dev, u32 *pdn, u16 uid); @@ -68,6 +91,7 @@ int mlx5_vdpa_get_null_mkey(struct mlx5_vdpa_dev *dev, u32 *null_mkey); int mlx5_vdpa_create_tis(struct mlx5_vdpa_dev *mvdev, void *in, u32 *tisn); void mlx5_vdpa_destroy_tis(struct mlx5_vdpa_dev *mvdev, u32 tisn); int mlx5_vdpa_create_rqt(struct mlx5_vdpa_dev *mvdev, void *in, int inlen, u32 *rqtn); +int mlx5_vdpa_modify_rqt(struct mlx5_vdpa_dev *mvdev, void *in, int inlen, u32 rqtn); void mlx5_vdpa_destroy_rqt(struct mlx5_vdpa_dev *mvdev, u32 rqtn); int mlx5_vdpa_create_tir(struct mlx5_vdpa_dev *mvdev, void *in, u32 *tirn); void mlx5_vdpa_destroy_tir(struct mlx5_vdpa_dev *mvdev, u32 tirn); diff --git a/drivers/vdpa/mlx5/core/mr.c b/drivers/vdpa/mlx5/core/mr.c index e59135fa867e..ff010c6d0cd3 100644 --- a/drivers/vdpa/mlx5/core/mr.c +++ b/drivers/vdpa/mlx5/core/mr.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* Copyright (c) 2020 Mellanox Technologies Ltd. 
*/ +#include <linux/vhost_types.h> #include <linux/vdpa.h> #include <linux/gcd.h> #include <linux/string.h> @@ -451,33 +452,30 @@ static void destroy_dma_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_mr *mr) mlx5_vdpa_destroy_mkey(mvdev, &mr->mkey); } -static int _mlx5_vdpa_create_mr(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb) +static int dup_iotlb(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *src) { - struct mlx5_vdpa_mr *mr = &mvdev->mr; + struct vhost_iotlb_map *map; + u64 start = 0, last = ULLONG_MAX; int err; - if (mr->initialized) - return 0; - - if (iotlb) - err = create_user_mr(mvdev, iotlb); - else - err = create_dma_mr(mvdev, mr); - - if (!err) - mr->initialized = true; + if (!src) { + err = vhost_iotlb_add_range(mvdev->cvq.iotlb, start, last, start, VHOST_ACCESS_RW); + return err; + } - return err; + for (map = vhost_iotlb_itree_first(src, start, last); map; + map = vhost_iotlb_itree_next(map, start, last)) { + err = vhost_iotlb_add_range(mvdev->cvq.iotlb, map->start, map->last, + map->addr, map->perm); + if (err) + return err; + } + return 0; } -int mlx5_vdpa_create_mr(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb) +static void prune_iotlb(struct mlx5_vdpa_dev *mvdev) { - int err; - - mutex_lock(&mvdev->mr.mkey_mtx); - err = _mlx5_vdpa_create_mr(mvdev, iotlb); - mutex_unlock(&mvdev->mr.mkey_mtx); - return err; + vhost_iotlb_del_range(mvdev->cvq.iotlb, 0, ULLONG_MAX); } static void destroy_user_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_mr *mr) @@ -501,6 +499,7 @@ void mlx5_vdpa_destroy_mr(struct mlx5_vdpa_dev *mvdev) if (!mr->initialized) goto out; + prune_iotlb(mvdev); if (mr->user_mr) destroy_user_mr(mvdev, mr); else @@ -512,6 +511,48 @@ out: mutex_unlock(&mr->mkey_mtx); } +static int _mlx5_vdpa_create_mr(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb) +{ + struct mlx5_vdpa_mr *mr = &mvdev->mr; + int err; + + if (mr->initialized) + return 0; + + if (iotlb) + err = create_user_mr(mvdev, iotlb); + else + err = 
create_dma_mr(mvdev, mr); + + if (err) + return err; + + err = dup_iotlb(mvdev, iotlb); + if (err) + goto out_err; + + mr->initialized = true; + return 0; + +out_err: + if (iotlb) + destroy_user_mr(mvdev, mr); + else + destroy_dma_mr(mvdev, mr); + + return err; +} + +int mlx5_vdpa_create_mr(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb) +{ + int err; + + mutex_lock(&mvdev->mr.mkey_mtx); + err = _mlx5_vdpa_create_mr(mvdev, iotlb); + mutex_unlock(&mvdev->mr.mkey_mtx); + return err; +} + int mlx5_vdpa_handle_set_map(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb, bool *change_map) { diff --git a/drivers/vdpa/mlx5/core/resources.c b/drivers/vdpa/mlx5/core/resources.c index d4606213f88a..15e266d0e27a 100644 --- a/drivers/vdpa/mlx5/core/resources.c +++ b/drivers/vdpa/mlx5/core/resources.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* Copyright (c) 2020 Mellanox Technologies Ltd. */ +#include <linux/iova.h> #include <linux/mlx5/driver.h> #include "mlx5_vdpa.h" @@ -128,6 +129,16 @@ int mlx5_vdpa_create_rqt(struct mlx5_vdpa_dev *mvdev, void *in, int inlen, u32 * return err; } +int mlx5_vdpa_modify_rqt(struct mlx5_vdpa_dev *mvdev, void *in, int inlen, u32 rqtn) +{ + u32 out[MLX5_ST_SZ_DW(create_rqt_out)] = {}; + + MLX5_SET(modify_rqt_in, in, uid, mvdev->res.uid); + MLX5_SET(modify_rqt_in, in, rqtn, rqtn); + MLX5_SET(modify_rqt_in, in, opcode, MLX5_CMD_OP_MODIFY_RQT); + return mlx5_cmd_exec(mvdev->mdev, in, inlen, out, sizeof(out)); +} + void mlx5_vdpa_destroy_rqt(struct mlx5_vdpa_dev *mvdev, u32 rqtn) { u32 in[MLX5_ST_SZ_DW(destroy_rqt_in)] = {}; @@ -221,6 +232,22 @@ int mlx5_vdpa_destroy_mkey(struct mlx5_vdpa_dev *mvdev, struct mlx5_core_mkey *m return mlx5_cmd_exec_in(mvdev->mdev, destroy_mkey, in); } +static int init_ctrl_vq(struct mlx5_vdpa_dev *mvdev) +{ + mvdev->cvq.iotlb = vhost_iotlb_alloc(0, 0); + if (!mvdev->cvq.iotlb) + return -ENOMEM; + + vringh_set_iotlb(&mvdev->cvq.vring, mvdev->cvq.iotlb, &mvdev->cvq.iommu_lock); + + 
return 0; +} + +static void cleanup_ctrl_vq(struct mlx5_vdpa_dev *mvdev) +{ + vhost_iotlb_free(mvdev->cvq.iotlb); +} + int mlx5_vdpa_alloc_resources(struct mlx5_vdpa_dev *mvdev) { u64 offset = MLX5_CAP64_DEV_VDPA_EMULATION(mvdev->mdev, doorbell_bar_offset); @@ -260,10 +287,17 @@ int mlx5_vdpa_alloc_resources(struct mlx5_vdpa_dev *mvdev) err = -ENOMEM; goto err_key; } + + err = init_ctrl_vq(mvdev); + if (err) + goto err_ctrl; + res->valid = true; return 0; +err_ctrl: + iounmap(res->kick_addr); err_key: dealloc_pd(mvdev, res->pdn, res->uid); err_pd: @@ -282,6 +316,7 @@ void mlx5_vdpa_free_resources(struct mlx5_vdpa_dev *mvdev) if (!res->valid) return; + cleanup_ctrl_vq(mvdev); iounmap(res->kick_addr); res->kick_addr = NULL; dealloc_pd(mvdev, res->pdn, res->uid); diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 5906cada2293..294ba05e6fc9 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -45,6 +45,8 @@ MODULE_LICENSE("Dual BSD/GPL"); (VIRTIO_CONFIG_S_ACKNOWLEDGE | VIRTIO_CONFIG_S_DRIVER | VIRTIO_CONFIG_S_DRIVER_OK | \ VIRTIO_CONFIG_S_FEATURES_OK | VIRTIO_CONFIG_S_NEEDS_RESET | VIRTIO_CONFIG_S_FAILED) +#define MLX5_FEATURE(_mvdev, _feature) (!!((_mvdev)->actual_features & BIT_ULL(_feature))) + struct mlx5_vdpa_net_resources { u32 tisn; u32 tdn; @@ -90,7 +92,6 @@ struct mlx5_vq_restore_info { u16 avail_index; u16 used_index; bool ready; - struct vdpa_callback cb; bool restore; }; @@ -100,7 +101,6 @@ struct mlx5_vdpa_virtqueue { u64 device_addr; u64 driver_addr; u32 num_ent; - struct vdpa_callback event_cb; /* Resources for implementing the notification channel from the device * to the driver. 
fwqp is the firmware end of an RC connection; the @@ -135,11 +135,20 @@ struct mlx5_vdpa_virtqueue { */ #define MLX5_MAX_SUPPORTED_VQS 16 +static bool is_index_valid(struct mlx5_vdpa_dev *mvdev, u16 idx) +{ + if (unlikely(idx > mvdev->max_idx)) + return false; + + return true; +} + struct mlx5_vdpa_net { struct mlx5_vdpa_dev mvdev; struct mlx5_vdpa_net_resources res; struct virtio_net_config config; struct mlx5_vdpa_virtqueue vqs[MLX5_MAX_SUPPORTED_VQS]; + struct vdpa_callback event_cbs[MLX5_MAX_SUPPORTED_VQS + 1]; /* Serialize vq resources creation and destruction. This is required * since memory map might change and we need to destroy and create @@ -151,15 +160,18 @@ struct mlx5_vdpa_net { struct mlx5_flow_handle *rx_rule; bool setup; u16 mtu; + u32 cur_num_vqs; }; static void free_resources(struct mlx5_vdpa_net *ndev); static void init_mvqs(struct mlx5_vdpa_net *ndev); -static int setup_driver(struct mlx5_vdpa_net *ndev); +static int setup_driver(struct mlx5_vdpa_dev *mvdev); static void teardown_driver(struct mlx5_vdpa_net *ndev); static bool mlx5_vdpa_debug; +#define MLX5_CVQ_MAX_ENT 16 + #define MLX5_LOG_VIO_FLAG(_feature) \ do { \ if (features & BIT_ULL(_feature)) \ @@ -172,11 +184,41 @@ static bool mlx5_vdpa_debug; mlx5_vdpa_info(mvdev, "%s\n", #_status); \ } while (0) +/* TODO: cross-endian support */ +static inline bool mlx5_vdpa_is_little_endian(struct mlx5_vdpa_dev *mvdev) +{ + return virtio_legacy_is_little_endian() || + (mvdev->actual_features & BIT_ULL(VIRTIO_F_VERSION_1)); +} + +static u16 mlx5vdpa16_to_cpu(struct mlx5_vdpa_dev *mvdev, __virtio16 val) +{ + return __virtio16_to_cpu(mlx5_vdpa_is_little_endian(mvdev), val); +} + +static __virtio16 cpu_to_mlx5vdpa16(struct mlx5_vdpa_dev *mvdev, u16 val) +{ + return __cpu_to_virtio16(mlx5_vdpa_is_little_endian(mvdev), val); +} + static inline u32 mlx5_vdpa_max_qps(int max_vqs) { return max_vqs / 2; } +static u16 ctrl_vq_idx(struct mlx5_vdpa_dev *mvdev) +{ + if (!(mvdev->actual_features & 
BIT_ULL(VIRTIO_NET_F_MQ))) + return 2; + + return 2 * mlx5_vdpa_max_qps(mvdev->max_vqs); +} + +static bool is_ctrl_vq_idx(struct mlx5_vdpa_dev *mvdev, u16 idx) +{ + return idx == ctrl_vq_idx(mvdev); +} + static void print_status(struct mlx5_vdpa_dev *mvdev, u8 status, bool set) { if (status & ~VALID_STATUS_MASK) @@ -481,6 +523,10 @@ static int mlx5_vdpa_poll_one(struct mlx5_vdpa_cq *vcq) static void mlx5_vdpa_handle_completions(struct mlx5_vdpa_virtqueue *mvq, int num) { + struct mlx5_vdpa_net *ndev = mvq->ndev; + struct vdpa_callback *event_cb; + + event_cb = &ndev->event_cbs[mvq->index]; mlx5_cq_set_ci(&mvq->cq.mcq); /* make sure CQ cosumer update is visible to the hardware before updating @@ -488,8 +534,8 @@ static void mlx5_vdpa_handle_completions(struct mlx5_vdpa_virtqueue *mvq, int nu */ dma_wmb(); rx_post(&mvq->vqqp, num); - if (mvq->event_cb.callback) - mvq->event_cb.callback(mvq->event_cb.private); + if (event_cb->callback) + event_cb->callback(event_cb->private); } static void mlx5_vdpa_cq_comp(struct mlx5_core_cq *mcq, struct mlx5_eqe *eqe) @@ -1100,10 +1146,8 @@ static int setup_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq) if (!mvq->num_ent) return 0; - if (mvq->initialized) { - mlx5_vdpa_warn(&ndev->mvdev, "attempt re init\n"); - return -EINVAL; - } + if (mvq->initialized) + return 0; err = cq_create(ndev, idx, mvq->num_ent); if (err) @@ -1190,19 +1234,20 @@ static void teardown_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue * static int create_rqt(struct mlx5_vdpa_net *ndev) { - int log_max_rqt; __be32 *list; + int max_rqt; void *rqtc; int inlen; void *in; int i, j; int err; - log_max_rqt = min_t(int, 1, MLX5_CAP_GEN(ndev->mvdev.mdev, log_max_rqt_size)); - if (log_max_rqt < 1) + max_rqt = min_t(int, MLX5_MAX_SUPPORTED_VQS / 2, + 1 << MLX5_CAP_GEN(ndev->mvdev.mdev, log_max_rqt_size)); + if (max_rqt < 1) return -EOPNOTSUPP; - inlen = MLX5_ST_SZ_BYTES(create_rqt_in) + (1 << log_max_rqt) * MLX5_ST_SZ_BYTES(rq_num); + inlen = 
MLX5_ST_SZ_BYTES(create_rqt_in) + max_rqt * MLX5_ST_SZ_BYTES(rq_num); in = kzalloc(inlen, GFP_KERNEL); if (!in) return -ENOMEM; @@ -1211,10 +1256,9 @@ static int create_rqt(struct mlx5_vdpa_net *ndev) rqtc = MLX5_ADDR_OF(create_rqt_in, in, rqt_context); MLX5_SET(rqtc, rqtc, list_q_type, MLX5_RQTC_LIST_Q_TYPE_VIRTIO_NET_Q); - MLX5_SET(rqtc, rqtc, rqt_max_size, 1 << log_max_rqt); - MLX5_SET(rqtc, rqtc, rqt_actual_size, 1); + MLX5_SET(rqtc, rqtc, rqt_max_size, max_rqt); list = MLX5_ADDR_OF(rqtc, rqtc, rq_num[0]); - for (i = 0, j = 0; j < ndev->mvdev.max_vqs; j++) { + for (i = 0, j = 0; j < max_rqt; j++) { if (!ndev->vqs[j].initialized) continue; @@ -1223,6 +1267,7 @@ static int create_rqt(struct mlx5_vdpa_net *ndev) i++; } } + MLX5_SET(rqtc, rqtc, rqt_actual_size, i); err = mlx5_vdpa_create_rqt(&ndev->mvdev, in, inlen, &ndev->res.rqtn); kfree(in); @@ -1232,6 +1277,52 @@ static int create_rqt(struct mlx5_vdpa_net *ndev) return 0; } +#define MLX5_MODIFY_RQT_NUM_RQS ((u64)1) + +static int modify_rqt(struct mlx5_vdpa_net *ndev, int num) +{ + __be32 *list; + int max_rqt; + void *rqtc; + int inlen; + void *in; + int i, j; + int err; + + max_rqt = min_t(int, ndev->cur_num_vqs / 2, + 1 << MLX5_CAP_GEN(ndev->mvdev.mdev, log_max_rqt_size)); + if (max_rqt < 1) + return -EOPNOTSUPP; + + inlen = MLX5_ST_SZ_BYTES(modify_rqt_in) + max_rqt * MLX5_ST_SZ_BYTES(rq_num); + in = kzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; + + MLX5_SET(modify_rqt_in, in, uid, ndev->mvdev.res.uid); + MLX5_SET64(modify_rqt_in, in, bitmask, MLX5_MODIFY_RQT_NUM_RQS); + rqtc = MLX5_ADDR_OF(modify_rqt_in, in, ctx); + MLX5_SET(rqtc, rqtc, list_q_type, MLX5_RQTC_LIST_Q_TYPE_VIRTIO_NET_Q); + + list = MLX5_ADDR_OF(rqtc, rqtc, rq_num[0]); + for (i = 0, j = 0; j < num; j++) { + if (!ndev->vqs[j].initialized) + continue; + + if (!vq_is_tx(ndev->vqs[j].index)) { + list[i] = cpu_to_be32(ndev->vqs[j].virtq_id); + i++; + } + } + MLX5_SET(rqtc, rqtc, rqt_actual_size, i); + err = 
mlx5_vdpa_modify_rqt(&ndev->mvdev, in, inlen, ndev->res.rqtn); + kfree(in); + if (err) + return err; + + return 0; +} + static void destroy_rqt(struct mlx5_vdpa_net *ndev) { mlx5_vdpa_destroy_rqt(&ndev->mvdev, ndev->res.rqtn); @@ -1345,12 +1436,206 @@ static void remove_fwd_to_tir(struct mlx5_vdpa_net *ndev) ndev->rx_rule = NULL; } +static virtio_net_ctrl_ack handle_ctrl_mac(struct mlx5_vdpa_dev *mvdev, u8 cmd) +{ + struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); + struct mlx5_control_vq *cvq = &mvdev->cvq; + virtio_net_ctrl_ack status = VIRTIO_NET_ERR; + struct mlx5_core_dev *pfmdev; + size_t read; + u8 mac[ETH_ALEN]; + + pfmdev = pci_get_drvdata(pci_physfn(mvdev->mdev->pdev)); + switch (cmd) { + case VIRTIO_NET_CTRL_MAC_ADDR_SET: + read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, (void *)mac, ETH_ALEN); + if (read != ETH_ALEN) + break; + + if (!memcmp(ndev->config.mac, mac, 6)) { + status = VIRTIO_NET_OK; + break; + } + + if (!is_zero_ether_addr(ndev->config.mac)) { + if (mlx5_mpfs_del_mac(pfmdev, ndev->config.mac)) { + mlx5_vdpa_warn(mvdev, "failed to delete old MAC %pM from MPFS table\n", + ndev->config.mac); + break; + } + } + + if (mlx5_mpfs_add_mac(pfmdev, mac)) { + mlx5_vdpa_warn(mvdev, "failed to insert new MAC %pM into MPFS table\n", + mac); + break; + } + + memcpy(ndev->config.mac, mac, ETH_ALEN); + status = VIRTIO_NET_OK; + break; + + default: + break; + } + + return status; +} + +static int change_num_qps(struct mlx5_vdpa_dev *mvdev, int newqps) +{ + struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); + int cur_qps = ndev->cur_num_vqs / 2; + int err; + int i; + + if (cur_qps > newqps) { + err = modify_rqt(ndev, 2 * newqps); + if (err) + return err; + + for (i = ndev->cur_num_vqs - 1; i >= 2 * newqps; i--) + teardown_vq(ndev, &ndev->vqs[i]); + + ndev->cur_num_vqs = 2 * newqps; + } else { + ndev->cur_num_vqs = 2 * newqps; + for (i = cur_qps * 2; i < 2 * newqps; i++) { + err = setup_vq(ndev, &ndev->vqs[i]); + if (err) + goto clean_added; + 
} + err = modify_rqt(ndev, 2 * newqps); + if (err) + goto clean_added; + } + return 0; + +clean_added: + for (--i; i >= cur_qps; --i) + teardown_vq(ndev, &ndev->vqs[i]); + + return err; +} + +static virtio_net_ctrl_ack handle_ctrl_mq(struct mlx5_vdpa_dev *mvdev, u8 cmd) +{ + struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); + virtio_net_ctrl_ack status = VIRTIO_NET_ERR; + struct mlx5_control_vq *cvq = &mvdev->cvq; + struct virtio_net_ctrl_mq mq; + size_t read; + u16 newqps; + + switch (cmd) { + case VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET: + read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, (void *)&mq, sizeof(mq)); + if (read != sizeof(mq)) + break; + + newqps = mlx5vdpa16_to_cpu(mvdev, mq.virtqueue_pairs); + if (ndev->cur_num_vqs == 2 * newqps) { + status = VIRTIO_NET_OK; + break; + } + + if (newqps & (newqps - 1)) + break; + + if (!change_num_qps(mvdev, newqps)) + status = VIRTIO_NET_OK; + + break; + default: + break; + } + + return status; +} + +static void mlx5_cvq_kick_handler(struct work_struct *work) +{ + virtio_net_ctrl_ack status = VIRTIO_NET_ERR; + struct virtio_net_ctrl_hdr ctrl; + struct mlx5_ctrl_wq_ent *wqent; + struct mlx5_vdpa_dev *mvdev; + struct mlx5_control_vq *cvq; + struct mlx5_vdpa_net *ndev; + size_t read, write; + int err; + + wqent = container_of(work, struct mlx5_ctrl_wq_ent, work); + mvdev = wqent->mvdev; + ndev = to_mlx5_vdpa_ndev(mvdev); + cvq = &mvdev->cvq; + if (!(ndev->mvdev.actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ))) + goto out; + + if (!cvq->ready) + goto out; + + while (true) { + err = vringh_getdesc_iotlb(&cvq->vring, &cvq->riov, &cvq->wiov, &cvq->head, + GFP_ATOMIC); + if (err <= 0) + break; + + read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, &ctrl, sizeof(ctrl)); + if (read != sizeof(ctrl)) + break; + + switch (ctrl.class) { + case VIRTIO_NET_CTRL_MAC: + status = handle_ctrl_mac(mvdev, ctrl.cmd); + break; + case VIRTIO_NET_CTRL_MQ: + status = handle_ctrl_mq(mvdev, ctrl.cmd); + break; + + default: + break; + } + + 
/* Make sure data is written before advancing index */ + smp_wmb(); + + write = vringh_iov_push_iotlb(&cvq->vring, &cvq->wiov, &status, sizeof(status)); + vringh_complete_iotlb(&cvq->vring, cvq->head, write); + vringh_kiov_cleanup(&cvq->riov); + vringh_kiov_cleanup(&cvq->wiov); + + if (vringh_need_notify_iotlb(&cvq->vring)) + vringh_notify(&cvq->vring); + } +out: + kfree(wqent); +} + static void mlx5_vdpa_kick_vq(struct vdpa_device *vdev, u16 idx) { struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); - struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx]; + struct mlx5_vdpa_virtqueue *mvq; + struct mlx5_ctrl_wq_ent *wqent; + + if (!is_index_valid(mvdev, idx)) + return; + + if (unlikely(is_ctrl_vq_idx(mvdev, idx))) { + if (!mvdev->cvq.ready) + return; + + wqent = kzalloc(sizeof(*wqent), GFP_ATOMIC); + if (!wqent) + return; + wqent->mvdev = mvdev; + INIT_WORK(&wqent->work, mlx5_cvq_kick_handler); + queue_work(mvdev->wq, &wqent->work); + return; + } + + mvq = &ndev->vqs[idx]; if (unlikely(!mvq->ready)) return; @@ -1362,8 +1647,19 @@ static int mlx5_vdpa_set_vq_address(struct vdpa_device *vdev, u16 idx, u64 desc_ { struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); - struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx]; + struct mlx5_vdpa_virtqueue *mvq; + + if (!is_index_valid(mvdev, idx)) + return -EINVAL; + if (is_ctrl_vq_idx(mvdev, idx)) { + mvdev->cvq.desc_addr = desc_area; + mvdev->cvq.device_addr = device_area; + mvdev->cvq.driver_addr = driver_area; + return 0; + } + + mvq = &ndev->vqs[idx]; mvq->desc_addr = desc_area; mvq->device_addr = device_area; mvq->driver_addr = driver_area; @@ -1376,6 +1672,9 @@ static void mlx5_vdpa_set_vq_num(struct vdpa_device *vdev, u16 idx, u32 num) struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); struct mlx5_vdpa_virtqueue *mvq; + if (!is_index_valid(mvdev, idx) || is_ctrl_vq_idx(mvdev, idx)) + return; + mvq = &ndev->vqs[idx]; 
mvq->num_ent = num; } @@ -1384,17 +1683,46 @@ static void mlx5_vdpa_set_vq_cb(struct vdpa_device *vdev, u16 idx, struct vdpa_c { struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); - struct mlx5_vdpa_virtqueue *vq = &ndev->vqs[idx]; - vq->event_cb = *cb; + ndev->event_cbs[idx] = *cb; +} + +static void mlx5_cvq_notify(struct vringh *vring) +{ + struct mlx5_control_vq *cvq = container_of(vring, struct mlx5_control_vq, vring); + + if (!cvq->event_cb.callback) + return; + + cvq->event_cb.callback(cvq->event_cb.private); +} + +static void set_cvq_ready(struct mlx5_vdpa_dev *mvdev, bool ready) +{ + struct mlx5_control_vq *cvq = &mvdev->cvq; + + cvq->ready = ready; + if (!ready) + return; + + cvq->vring.notify = mlx5_cvq_notify; } static void mlx5_vdpa_set_vq_ready(struct vdpa_device *vdev, u16 idx, bool ready) { struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); - struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx]; + struct mlx5_vdpa_virtqueue *mvq; + + if (!is_index_valid(mvdev, idx)) + return; + + if (is_ctrl_vq_idx(mvdev, idx)) { + set_cvq_ready(mvdev, ready); + return; + } + mvq = &ndev->vqs[idx]; if (!ready) suspend_vq(ndev, mvq); @@ -1405,9 +1733,14 @@ static bool mlx5_vdpa_get_vq_ready(struct vdpa_device *vdev, u16 idx) { struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); - struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx]; - return mvq->ready; + if (!is_index_valid(mvdev, idx)) + return false; + + if (is_ctrl_vq_idx(mvdev, idx)) + return mvdev->cvq.ready; + + return ndev->vqs[idx].ready; } static int mlx5_vdpa_set_vq_state(struct vdpa_device *vdev, u16 idx, @@ -1415,8 +1748,17 @@ static int mlx5_vdpa_set_vq_state(struct vdpa_device *vdev, u16 idx, { struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); - struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx]; + struct 
mlx5_vdpa_virtqueue *mvq; + if (!is_index_valid(mvdev, idx)) + return -EINVAL; + + if (is_ctrl_vq_idx(mvdev, idx)) { + mvdev->cvq.vring.last_avail_idx = state->split.avail_index; + return 0; + } + + mvq = &ndev->vqs[idx]; if (mvq->fw_state == MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY) { mlx5_vdpa_warn(mvdev, "can't modify available index\n"); return -EINVAL; @@ -1431,10 +1773,19 @@ static int mlx5_vdpa_get_vq_state(struct vdpa_device *vdev, u16 idx, struct vdpa { struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); - struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx]; + struct mlx5_vdpa_virtqueue *mvq; struct mlx5_virtq_attr attr; int err; + if (!is_index_valid(mvdev, idx)) + return -EINVAL; + + if (is_ctrl_vq_idx(mvdev, idx)) { + state->split.avail_index = mvdev->cvq.vring.last_avail_idx; + return 0; + } + + mvq = &ndev->vqs[idx]; /* If the virtq object was destroyed, use the value saved at * the last minute of suspend_vq. This caters for userspace * that cares about emulating the index after vq is stopped. 
@@ -1491,10 +1842,14 @@ static u64 mlx5_vdpa_get_features(struct vdpa_device *vdev) u16 dev_features; dev_features = MLX5_CAP_DEV_VDPA_EMULATION(mvdev->mdev, device_features_bits_mask); - ndev->mvdev.mlx_features = mlx_to_vritio_features(dev_features); + ndev->mvdev.mlx_features |= mlx_to_vritio_features(dev_features); if (MLX5_CAP_DEV_VDPA_EMULATION(mvdev->mdev, virtio_version_1_0)) ndev->mvdev.mlx_features |= BIT_ULL(VIRTIO_F_VERSION_1); ndev->mvdev.mlx_features |= BIT_ULL(VIRTIO_F_ACCESS_PLATFORM); + ndev->mvdev.mlx_features |= BIT_ULL(VIRTIO_NET_F_CTRL_VQ); + ndev->mvdev.mlx_features |= BIT_ULL(VIRTIO_NET_F_CTRL_MAC_ADDR); + ndev->mvdev.mlx_features |= BIT_ULL(VIRTIO_NET_F_MQ); + print_features(mvdev, ndev->mvdev.mlx_features, false); return ndev->mvdev.mlx_features; } @@ -1507,17 +1862,29 @@ static int verify_min_features(struct mlx5_vdpa_dev *mvdev, u64 features) return 0; } -static int setup_virtqueues(struct mlx5_vdpa_net *ndev) +static int setup_virtqueues(struct mlx5_vdpa_dev *mvdev) { + struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); + struct mlx5_control_vq *cvq = &mvdev->cvq; int err; int i; - for (i = 0; i < 2 * mlx5_vdpa_max_qps(ndev->mvdev.max_vqs); i++) { + for (i = 0; i < 2 * mlx5_vdpa_max_qps(mvdev->max_vqs); i++) { err = setup_vq(ndev, &ndev->vqs[i]); if (err) goto err_vq; } + if (mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ)) { + err = vringh_init_iotlb(&cvq->vring, mvdev->actual_features, + MLX5_CVQ_MAX_ENT, false, + (struct vring_desc *)(uintptr_t)cvq->desc_addr, + (struct vring_avail *)(uintptr_t)cvq->driver_addr, + (struct vring_used *)(uintptr_t)cvq->device_addr); + if (err) + goto err_vq; + } + return 0; err_vq: @@ -1541,16 +1908,22 @@ static void teardown_virtqueues(struct mlx5_vdpa_net *ndev) } } -/* TODO: cross-endian support */ -static inline bool mlx5_vdpa_is_little_endian(struct mlx5_vdpa_dev *mvdev) -{ - return virtio_legacy_is_little_endian() || - (mvdev->actual_features & BIT_ULL(VIRTIO_F_VERSION_1)); -} - -static 
__virtio16 cpu_to_mlx5vdpa16(struct mlx5_vdpa_dev *mvdev, u16 val) +static void update_cvq_info(struct mlx5_vdpa_dev *mvdev) { - return __cpu_to_virtio16(mlx5_vdpa_is_little_endian(mvdev), val); + if (MLX5_FEATURE(mvdev, VIRTIO_NET_F_CTRL_VQ)) { + if (MLX5_FEATURE(mvdev, VIRTIO_NET_F_MQ)) { + /* MQ supported. CVQ index is right above the last data virtqueue's */ + mvdev->max_idx = mvdev->max_vqs; + } else { + /* Only CVQ supportted. data virtqueues occupy indices 0 and 1. + * CVQ gets index 2 + */ + mvdev->max_idx = 2; + } + } else { + /* Two data virtqueues only: one for rx and one for tx */ + mvdev->max_idx = 1; + } } static int mlx5_vdpa_set_features(struct vdpa_device *vdev, u64 features) @@ -1568,6 +1941,7 @@ static int mlx5_vdpa_set_features(struct vdpa_device *vdev, u64 features) ndev->mvdev.actual_features = features & ndev->mvdev.mlx_features; ndev->config.mtu = cpu_to_mlx5vdpa16(mvdev, ndev->mtu); ndev->config.status |= cpu_to_mlx5vdpa16(mvdev, VIRTIO_NET_S_LINK_UP); + update_cvq_info(mvdev); return err; } @@ -1605,15 +1979,14 @@ static u8 mlx5_vdpa_get_status(struct vdpa_device *vdev) static int save_channel_info(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq) { struct mlx5_vq_restore_info *ri = &mvq->ri; - struct mlx5_virtq_attr attr; + struct mlx5_virtq_attr attr = {}; int err; - if (!mvq->initialized) - return 0; - - err = query_virtqueue(ndev, mvq, &attr); - if (err) - return err; + if (mvq->initialized) { + err = query_virtqueue(ndev, mvq, &attr); + if (err) + return err; + } ri->avail_index = attr.available_index; ri->used_index = attr.used_index; @@ -1622,7 +1995,6 @@ static int save_channel_info(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqu ri->desc_addr = mvq->desc_addr; ri->device_addr = mvq->device_addr; ri->driver_addr = mvq->driver_addr; - ri->cb = mvq->event_cb; ri->restore = true; return 0; } @@ -1667,12 +2039,12 @@ static void restore_channels_info(struct mlx5_vdpa_net *ndev) mvq->desc_addr = ri->desc_addr; 
mvq->device_addr = ri->device_addr; mvq->driver_addr = ri->driver_addr; - mvq->event_cb = ri->cb; } } -static int mlx5_vdpa_change_map(struct mlx5_vdpa_net *ndev, struct vhost_iotlb *iotlb) +static int mlx5_vdpa_change_map(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb) { + struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); int err; suspend_vqs(ndev); @@ -1681,58 +2053,59 @@ static int mlx5_vdpa_change_map(struct mlx5_vdpa_net *ndev, struct vhost_iotlb * goto err_mr; teardown_driver(ndev); - mlx5_vdpa_destroy_mr(&ndev->mvdev); - err = mlx5_vdpa_create_mr(&ndev->mvdev, iotlb); + mlx5_vdpa_destroy_mr(mvdev); + err = mlx5_vdpa_create_mr(mvdev, iotlb); if (err) goto err_mr; - if (!(ndev->mvdev.status & VIRTIO_CONFIG_S_DRIVER_OK)) + if (!(mvdev->status & VIRTIO_CONFIG_S_DRIVER_OK)) return 0; restore_channels_info(ndev); - err = setup_driver(ndev); + err = setup_driver(mvdev); if (err) goto err_setup; return 0; err_setup: - mlx5_vdpa_destroy_mr(&ndev->mvdev); + mlx5_vdpa_destroy_mr(mvdev); err_mr: return err; } -static int setup_driver(struct mlx5_vdpa_net *ndev) +static int setup_driver(struct mlx5_vdpa_dev *mvdev) { + struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); int err; mutex_lock(&ndev->reslock); if (ndev->setup) { - mlx5_vdpa_warn(&ndev->mvdev, "setup driver called for already setup driver\n"); + mlx5_vdpa_warn(mvdev, "setup driver called for already setup driver\n"); err = 0; goto out; } - err = setup_virtqueues(ndev); + err = setup_virtqueues(mvdev); if (err) { - mlx5_vdpa_warn(&ndev->mvdev, "setup_virtqueues\n"); + mlx5_vdpa_warn(mvdev, "setup_virtqueues\n"); goto out; } err = create_rqt(ndev); if (err) { - mlx5_vdpa_warn(&ndev->mvdev, "create_rqt\n"); + mlx5_vdpa_warn(mvdev, "create_rqt\n"); goto err_rqt; } err = create_tir(ndev); if (err) { - mlx5_vdpa_warn(&ndev->mvdev, "create_tir\n"); + mlx5_vdpa_warn(mvdev, "create_tir\n"); goto err_tir; } err = add_fwd_to_tir(ndev); if (err) { - mlx5_vdpa_warn(&ndev->mvdev, "add_fwd_to_tir\n"); + 
mlx5_vdpa_warn(mvdev, "add_fwd_to_tir\n"); goto err_fwd; } ndev->setup = true; @@ -1781,24 +2154,10 @@ static void mlx5_vdpa_set_status(struct vdpa_device *vdev, u8 status) int err; print_status(mvdev, status, true); - if (!status) { - mlx5_vdpa_info(mvdev, "performing device reset\n"); - teardown_driver(ndev); - clear_vqs_ready(ndev); - mlx5_vdpa_destroy_mr(&ndev->mvdev); - ndev->mvdev.status = 0; - ndev->mvdev.mlx_features = 0; - ++mvdev->generation; - if (MLX5_CAP_GEN(mvdev->mdev, umem_uid_0)) { - if (mlx5_vdpa_create_mr(mvdev, NULL)) - mlx5_vdpa_warn(mvdev, "create MR failed\n"); - } - return; - } if ((status ^ ndev->mvdev.status) & VIRTIO_CONFIG_S_DRIVER_OK) { if (status & VIRTIO_CONFIG_S_DRIVER_OK) { - err = setup_driver(ndev); + err = setup_driver(mvdev); if (err) { mlx5_vdpa_warn(mvdev, "failed to setup driver\n"); goto err_setup; @@ -1817,6 +2176,29 @@ err_setup: ndev->mvdev.status |= VIRTIO_CONFIG_S_FAILED; } +static int mlx5_vdpa_reset(struct vdpa_device *vdev) +{ + struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); + struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); + + print_status(mvdev, 0, true); + mlx5_vdpa_info(mvdev, "performing device reset\n"); + teardown_driver(ndev); + clear_vqs_ready(ndev); + mlx5_vdpa_destroy_mr(&ndev->mvdev); + ndev->mvdev.status = 0; + ndev->mvdev.mlx_features = 0; + memset(ndev->event_cbs, 0, sizeof(ndev->event_cbs)); + ndev->mvdev.actual_features = 0; + ++mvdev->generation; + if (MLX5_CAP_GEN(mvdev->mdev, umem_uid_0)) { + if (mlx5_vdpa_create_mr(mvdev, NULL)) + mlx5_vdpa_warn(mvdev, "create MR failed\n"); + } + + return 0; +} + static size_t mlx5_vdpa_get_config_size(struct vdpa_device *vdev) { return sizeof(struct virtio_net_config); @@ -1848,7 +2230,6 @@ static u32 mlx5_vdpa_get_generation(struct vdpa_device *vdev) static int mlx5_vdpa_set_map(struct vdpa_device *vdev, struct vhost_iotlb *iotlb) { struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); - struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); bool change_map; int 
err; @@ -1859,7 +2240,7 @@ static int mlx5_vdpa_set_map(struct vdpa_device *vdev, struct vhost_iotlb *iotlb } if (change_map) - return mlx5_vdpa_change_map(ndev, iotlb); + return mlx5_vdpa_change_map(mvdev, iotlb); return 0; } @@ -1889,6 +2270,9 @@ static struct vdpa_notification_area mlx5_get_vq_notification(struct vdpa_device struct mlx5_vdpa_net *ndev; phys_addr_t addr; + if (!is_index_valid(mvdev, idx) || is_ctrl_vq_idx(mvdev, idx)) + return ret; + /* If SF BAR size is smaller than PAGE_SIZE, do not use direct * notification to avoid the risk of mapping pages that contain BAR of more * than one SF @@ -1928,6 +2312,7 @@ static const struct vdpa_config_ops mlx5_vdpa_ops = { .get_vendor_id = mlx5_vdpa_get_vendor_id, .get_status = mlx5_vdpa_get_status, .set_status = mlx5_vdpa_set_status, + .reset = mlx5_vdpa_reset, .get_config_size = mlx5_vdpa_get_config_size, .get_config = mlx5_vdpa_get_config, .set_config = mlx5_vdpa_set_config, @@ -2040,7 +2425,7 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name) max_vqs = min_t(u32, max_vqs, MLX5_MAX_SUPPORTED_VQS); ndev = vdpa_alloc_device(struct mlx5_vdpa_net, mvdev.vdev, mdev->device, &mlx5_vdpa_ops, - name); + name, false); if (IS_ERR(ndev)) return PTR_ERR(ndev); @@ -2063,8 +2448,11 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name) err = mlx5_mpfs_add_mac(pfmdev, config->mac); if (err) goto err_mtu; + + ndev->mvdev.mlx_features |= BIT_ULL(VIRTIO_NET_F_MAC); } + config->max_virtqueue_pairs = cpu_to_mlx5vdpa16(mvdev, mlx5_vdpa_max_qps(max_vqs)); mvdev->vdev.dma_dev = &mdev->pdev->dev; err = mlx5_vdpa_alloc_resources(&ndev->mvdev); if (err) @@ -2080,8 +2468,15 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name) if (err) goto err_mr; + mvdev->wq = create_singlethread_workqueue("mlx5_vdpa_ctrl_wq"); + if (!mvdev->wq) { + err = -ENOMEM; + goto err_res2; + } + + ndev->cur_num_vqs = 2 * mlx5_vdpa_max_qps(max_vqs); mvdev->vdev.mdev = 
&mgtdev->mgtdev; - err = _vdpa_register_device(&mvdev->vdev, 2 * mlx5_vdpa_max_qps(max_vqs)); + err = _vdpa_register_device(&mvdev->vdev, ndev->cur_num_vqs + 1); if (err) goto err_reg; @@ -2089,6 +2484,8 @@ static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name) return 0; err_reg: + destroy_workqueue(mvdev->wq); +err_res2: free_resources(ndev); err_mr: mlx5_vdpa_destroy_mr(mvdev); @@ -2106,7 +2503,9 @@ err_mtu: static void mlx5_vdpa_dev_del(struct vdpa_mgmt_dev *v_mdev, struct vdpa_device *dev) { struct mlx5_vdpa_mgmtdev *mgtdev = container_of(v_mdev, struct mlx5_vdpa_mgmtdev, mgtdev); + struct mlx5_vdpa_dev *mvdev = to_mvdev(dev); + destroy_workqueue(mvdev->wq); _vdpa_unregister_device(dev); mgtdev->ndev = NULL; } diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c index 3fc4525fc05c..1dc121a07a93 100644 --- a/drivers/vdpa/vdpa.c +++ b/drivers/vdpa/vdpa.c @@ -69,6 +69,7 @@ static void vdpa_release_dev(struct device *d) * @config: the bus operations that is supported by this device * @size: size of the parent structure that contains private data * @name: name of the vdpa device; optional. + * @use_va: indicate whether virtual address must be used by this device * * Driver should use vdpa_alloc_device() wrapper macro instead of * using this directly. 
@@ -78,7 +79,8 @@ static void vdpa_release_dev(struct device *d) */ struct vdpa_device *__vdpa_alloc_device(struct device *parent, const struct vdpa_config_ops *config, - size_t size, const char *name) + size_t size, const char *name, + bool use_va) { struct vdpa_device *vdev; int err = -EINVAL; @@ -89,6 +91,10 @@ struct vdpa_device *__vdpa_alloc_device(struct device *parent, if (!!config->dma_map != !!config->dma_unmap) goto err; + /* It should only work for the device that use on-chip IOMMU */ + if (use_va && !(config->dma_map || config->set_map)) + goto err; + err = -ENOMEM; vdev = kzalloc(size, GFP_KERNEL); if (!vdev) @@ -104,6 +110,7 @@ struct vdpa_device *__vdpa_alloc_device(struct device *parent, vdev->index = err; vdev->config = config; vdev->features_valid = false; + vdev->use_va = use_va; if (name) err = dev_set_name(&vdev->dev, "%s", name); diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index c621cf7feec0..5f484fff8dbe 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -92,7 +92,7 @@ static void vdpasim_vq_reset(struct vdpasim *vdpasim, vq->vring.notify = NULL; } -static void vdpasim_reset(struct vdpasim *vdpasim) +static void vdpasim_do_reset(struct vdpasim *vdpasim) { int i; @@ -137,7 +137,8 @@ static dma_addr_t vdpasim_map_range(struct vdpasim *vdpasim, phys_addr_t paddr, int ret; /* We set the limit_pfn to the maximum (ULONG_MAX - 1) */ - iova = alloc_iova(&vdpasim->iova, size, ULONG_MAX - 1, true); + iova = alloc_iova(&vdpasim->iova, size >> iova_shift(&vdpasim->iova), + ULONG_MAX - 1, true); if (!iova) return DMA_MAPPING_ERROR; @@ -250,7 +251,7 @@ struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *dev_attr) ops = &vdpasim_config_ops; vdpasim = vdpa_alloc_device(struct vdpasim, vdpa, NULL, ops, - dev_attr->name); + dev_attr->name, false); if (IS_ERR(vdpasim)) { ret = PTR_ERR(vdpasim); goto err_alloc; @@ -459,11 +460,21 @@ static void vdpasim_set_status(struct vdpa_device 
*vdpa, u8 status) spin_lock(&vdpasim->lock); vdpasim->status = status; - if (status == 0) - vdpasim_reset(vdpasim); spin_unlock(&vdpasim->lock); } +static int vdpasim_reset(struct vdpa_device *vdpa) +{ + struct vdpasim *vdpasim = vdpa_to_sim(vdpa); + + spin_lock(&vdpasim->lock); + vdpasim->status = 0; + vdpasim_do_reset(vdpasim); + spin_unlock(&vdpasim->lock); + + return 0; +} + static size_t vdpasim_get_config_size(struct vdpa_device *vdpa) { struct vdpasim *vdpasim = vdpa_to_sim(vdpa); @@ -544,14 +555,14 @@ err: } static int vdpasim_dma_map(struct vdpa_device *vdpa, u64 iova, u64 size, - u64 pa, u32 perm) + u64 pa, u32 perm, void *opaque) { struct vdpasim *vdpasim = vdpa_to_sim(vdpa); int ret; spin_lock(&vdpasim->iommu_lock); - ret = vhost_iotlb_add_range(vdpasim->iommu, iova, iova + size - 1, pa, - perm); + ret = vhost_iotlb_add_range_ctx(vdpasim->iommu, iova, iova + size - 1, + pa, perm, opaque); spin_unlock(&vdpasim->iommu_lock); return ret; @@ -607,6 +618,7 @@ static const struct vdpa_config_ops vdpasim_config_ops = { .get_vendor_id = vdpasim_get_vendor_id, .get_status = vdpasim_get_status, .set_status = vdpasim_set_status, + .reset = vdpasim_reset, .get_config_size = vdpasim_get_config_size, .get_config = vdpasim_get_config, .set_config = vdpasim_set_config, @@ -635,6 +647,7 @@ static const struct vdpa_config_ops vdpasim_batch_config_ops = { .get_vendor_id = vdpasim_get_vendor_id, .get_status = vdpasim_get_status, .set_status = vdpasim_set_status, + .reset = vdpasim_reset, .get_config_size = vdpasim_get_config_size, .get_config = vdpasim_get_config, .set_config = vdpasim_set_config, diff --git a/drivers/vdpa/vdpa_user/Makefile b/drivers/vdpa/vdpa_user/Makefile new file mode 100644 index 000000000000..260e0b26af99 --- /dev/null +++ b/drivers/vdpa/vdpa_user/Makefile @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0 + +vduse-y := vduse_dev.o iova_domain.o + +obj-$(CONFIG_VDPA_USER) += vduse.o diff --git a/drivers/vdpa/vdpa_user/iova_domain.c 
b/drivers/vdpa/vdpa_user/iova_domain.c new file mode 100644 index 000000000000..1daae2608860 --- /dev/null +++ b/drivers/vdpa/vdpa_user/iova_domain.c @@ -0,0 +1,545 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * MMU-based software IOTLB. + * + * Copyright (C) 2020-2021 Bytedance Inc. and/or its affiliates. All rights reserved. + * + * Author: Xie Yongji <xieyongji@bytedance.com> + * + */ + +#include <linux/slab.h> +#include <linux/file.h> +#include <linux/anon_inodes.h> +#include <linux/highmem.h> +#include <linux/vmalloc.h> +#include <linux/vdpa.h> + +#include "iova_domain.h" + +static int vduse_iotlb_add_range(struct vduse_iova_domain *domain, + u64 start, u64 last, + u64 addr, unsigned int perm, + struct file *file, u64 offset) +{ + struct vdpa_map_file *map_file; + int ret; + + map_file = kmalloc(sizeof(*map_file), GFP_ATOMIC); + if (!map_file) + return -ENOMEM; + + map_file->file = get_file(file); + map_file->offset = offset; + + ret = vhost_iotlb_add_range_ctx(domain->iotlb, start, last, + addr, perm, map_file); + if (ret) { + fput(map_file->file); + kfree(map_file); + return ret; + } + return 0; +} + +static void vduse_iotlb_del_range(struct vduse_iova_domain *domain, + u64 start, u64 last) +{ + struct vdpa_map_file *map_file; + struct vhost_iotlb_map *map; + + while ((map = vhost_iotlb_itree_first(domain->iotlb, start, last))) { + map_file = (struct vdpa_map_file *)map->opaque; + fput(map_file->file); + kfree(map_file); + vhost_iotlb_map_free(domain->iotlb, map); + } +} + +int vduse_domain_set_map(struct vduse_iova_domain *domain, + struct vhost_iotlb *iotlb) +{ + struct vdpa_map_file *map_file; + struct vhost_iotlb_map *map; + u64 start = 0ULL, last = ULLONG_MAX; + int ret; + + spin_lock(&domain->iotlb_lock); + vduse_iotlb_del_range(domain, start, last); + + for (map = vhost_iotlb_itree_first(iotlb, start, last); map; + map = vhost_iotlb_itree_next(map, start, last)) { + map_file = (struct vdpa_map_file *)map->opaque; + ret = 
vduse_iotlb_add_range(domain, map->start, map->last, + map->addr, map->perm, + map_file->file, + map_file->offset); + if (ret) + goto err; + } + spin_unlock(&domain->iotlb_lock); + + return 0; +err: + vduse_iotlb_del_range(domain, start, last); + spin_unlock(&domain->iotlb_lock); + return ret; +} + +void vduse_domain_clear_map(struct vduse_iova_domain *domain, + struct vhost_iotlb *iotlb) +{ + struct vhost_iotlb_map *map; + u64 start = 0ULL, last = ULLONG_MAX; + + spin_lock(&domain->iotlb_lock); + for (map = vhost_iotlb_itree_first(iotlb, start, last); map; + map = vhost_iotlb_itree_next(map, start, last)) { + vduse_iotlb_del_range(domain, map->start, map->last); + } + spin_unlock(&domain->iotlb_lock); +} + +static int vduse_domain_map_bounce_page(struct vduse_iova_domain *domain, + u64 iova, u64 size, u64 paddr) +{ + struct vduse_bounce_map *map; + u64 last = iova + size - 1; + + while (iova <= last) { + map = &domain->bounce_maps[iova >> PAGE_SHIFT]; + if (!map->bounce_page) { + map->bounce_page = alloc_page(GFP_ATOMIC); + if (!map->bounce_page) + return -ENOMEM; + } + map->orig_phys = paddr; + paddr += PAGE_SIZE; + iova += PAGE_SIZE; + } + return 0; +} + +static void vduse_domain_unmap_bounce_page(struct vduse_iova_domain *domain, + u64 iova, u64 size) +{ + struct vduse_bounce_map *map; + u64 last = iova + size - 1; + + while (iova <= last) { + map = &domain->bounce_maps[iova >> PAGE_SHIFT]; + map->orig_phys = INVALID_PHYS_ADDR; + iova += PAGE_SIZE; + } +} + +static void do_bounce(phys_addr_t orig, void *addr, size_t size, + enum dma_data_direction dir) +{ + unsigned long pfn = PFN_DOWN(orig); + unsigned int offset = offset_in_page(orig); + char *buffer; + unsigned int sz = 0; + + while (size) { + sz = min_t(size_t, PAGE_SIZE - offset, size); + + buffer = kmap_atomic(pfn_to_page(pfn)); + if (dir == DMA_TO_DEVICE) + memcpy(addr, buffer + offset, sz); + else + memcpy(buffer + offset, addr, sz); + kunmap_atomic(buffer); + + size -= sz; + pfn++; + addr += sz; + 
offset = 0; + } +} + +static void vduse_domain_bounce(struct vduse_iova_domain *domain, + dma_addr_t iova, size_t size, + enum dma_data_direction dir) +{ + struct vduse_bounce_map *map; + unsigned int offset; + void *addr; + size_t sz; + + if (iova >= domain->bounce_size) + return; + + while (size) { + map = &domain->bounce_maps[iova >> PAGE_SHIFT]; + offset = offset_in_page(iova); + sz = min_t(size_t, PAGE_SIZE - offset, size); + + if (WARN_ON(!map->bounce_page || + map->orig_phys == INVALID_PHYS_ADDR)) + return; + + addr = page_address(map->bounce_page) + offset; + do_bounce(map->orig_phys + offset, addr, sz, dir); + size -= sz; + iova += sz; + } +} + +static struct page * +vduse_domain_get_coherent_page(struct vduse_iova_domain *domain, u64 iova) +{ + u64 start = iova & PAGE_MASK; + u64 last = start + PAGE_SIZE - 1; + struct vhost_iotlb_map *map; + struct page *page = NULL; + + spin_lock(&domain->iotlb_lock); + map = vhost_iotlb_itree_first(domain->iotlb, start, last); + if (!map) + goto out; + + page = pfn_to_page((map->addr + iova - map->start) >> PAGE_SHIFT); + get_page(page); +out: + spin_unlock(&domain->iotlb_lock); + + return page; +} + +static struct page * +vduse_domain_get_bounce_page(struct vduse_iova_domain *domain, u64 iova) +{ + struct vduse_bounce_map *map; + struct page *page = NULL; + + spin_lock(&domain->iotlb_lock); + map = &domain->bounce_maps[iova >> PAGE_SHIFT]; + if (!map->bounce_page) + goto out; + + page = map->bounce_page; + get_page(page); +out: + spin_unlock(&domain->iotlb_lock); + + return page; +} + +static void +vduse_domain_free_bounce_pages(struct vduse_iova_domain *domain) +{ + struct vduse_bounce_map *map; + unsigned long pfn, bounce_pfns; + + bounce_pfns = domain->bounce_size >> PAGE_SHIFT; + + for (pfn = 0; pfn < bounce_pfns; pfn++) { + map = &domain->bounce_maps[pfn]; + if (WARN_ON(map->orig_phys != INVALID_PHYS_ADDR)) + continue; + + if (!map->bounce_page) + continue; + + __free_page(map->bounce_page); + map->bounce_page = 
NULL; + } +} + +void vduse_domain_reset_bounce_map(struct vduse_iova_domain *domain) +{ + if (!domain->bounce_map) + return; + + spin_lock(&domain->iotlb_lock); + if (!domain->bounce_map) + goto unlock; + + vduse_iotlb_del_range(domain, 0, domain->bounce_size - 1); + domain->bounce_map = 0; +unlock: + spin_unlock(&domain->iotlb_lock); +} + +static int vduse_domain_init_bounce_map(struct vduse_iova_domain *domain) +{ + int ret = 0; + + if (domain->bounce_map) + return 0; + + spin_lock(&domain->iotlb_lock); + if (domain->bounce_map) + goto unlock; + + ret = vduse_iotlb_add_range(domain, 0, domain->bounce_size - 1, + 0, VHOST_MAP_RW, domain->file, 0); + if (ret) + goto unlock; + + domain->bounce_map = 1; +unlock: + spin_unlock(&domain->iotlb_lock); + return ret; +} + +static dma_addr_t +vduse_domain_alloc_iova(struct iova_domain *iovad, + unsigned long size, unsigned long limit) +{ + unsigned long shift = iova_shift(iovad); + unsigned long iova_len = iova_align(iovad, size) >> shift; + unsigned long iova_pfn; + + /* + * Freeing non-power-of-two-sized allocations back into the IOVA caches + * will come back to bite us badly, so we have to waste a bit of space + * rounding up anything cacheable to make sure that can't happen. The + * order of the unadjusted size will still match upon freeing. 
+ */ + if (iova_len < (1 << (IOVA_RANGE_CACHE_MAX_SIZE - 1))) + iova_len = roundup_pow_of_two(iova_len); + iova_pfn = alloc_iova_fast(iovad, iova_len, limit >> shift, true); + + return iova_pfn << shift; +} + +static void vduse_domain_free_iova(struct iova_domain *iovad, + dma_addr_t iova, size_t size) +{ + unsigned long shift = iova_shift(iovad); + unsigned long iova_len = iova_align(iovad, size) >> shift; + + free_iova_fast(iovad, iova >> shift, iova_len); +} + +dma_addr_t vduse_domain_map_page(struct vduse_iova_domain *domain, + struct page *page, unsigned long offset, + size_t size, enum dma_data_direction dir, + unsigned long attrs) +{ + struct iova_domain *iovad = &domain->stream_iovad; + unsigned long limit = domain->bounce_size - 1; + phys_addr_t pa = page_to_phys(page) + offset; + dma_addr_t iova = vduse_domain_alloc_iova(iovad, size, limit); + + if (!iova) + return DMA_MAPPING_ERROR; + + if (vduse_domain_init_bounce_map(domain)) + goto err; + + if (vduse_domain_map_bounce_page(domain, (u64)iova, (u64)size, pa)) + goto err; + + if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL) + vduse_domain_bounce(domain, iova, size, DMA_TO_DEVICE); + + return iova; +err: + vduse_domain_free_iova(iovad, iova, size); + return DMA_MAPPING_ERROR; +} + +void vduse_domain_unmap_page(struct vduse_iova_domain *domain, + dma_addr_t dma_addr, size_t size, + enum dma_data_direction dir, unsigned long attrs) +{ + struct iova_domain *iovad = &domain->stream_iovad; + + if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL) + vduse_domain_bounce(domain, dma_addr, size, DMA_FROM_DEVICE); + + vduse_domain_unmap_bounce_page(domain, (u64)dma_addr, (u64)size); + vduse_domain_free_iova(iovad, dma_addr, size); +} + +void *vduse_domain_alloc_coherent(struct vduse_iova_domain *domain, + size_t size, dma_addr_t *dma_addr, + gfp_t flag, unsigned long attrs) +{ + struct iova_domain *iovad = &domain->consistent_iovad; + unsigned long limit = domain->iova_limit; + dma_addr_t iova = 
vduse_domain_alloc_iova(iovad, size, limit); + void *orig = alloc_pages_exact(size, flag); + + if (!iova || !orig) + goto err; + + spin_lock(&domain->iotlb_lock); + if (vduse_iotlb_add_range(domain, (u64)iova, (u64)iova + size - 1, + virt_to_phys(orig), VHOST_MAP_RW, + domain->file, (u64)iova)) { + spin_unlock(&domain->iotlb_lock); + goto err; + } + spin_unlock(&domain->iotlb_lock); + + *dma_addr = iova; + + return orig; +err: + *dma_addr = DMA_MAPPING_ERROR; + if (orig) + free_pages_exact(orig, size); + if (iova) + vduse_domain_free_iova(iovad, iova, size); + + return NULL; +} + +void vduse_domain_free_coherent(struct vduse_iova_domain *domain, size_t size, + void *vaddr, dma_addr_t dma_addr, + unsigned long attrs) +{ + struct iova_domain *iovad = &domain->consistent_iovad; + struct vhost_iotlb_map *map; + struct vdpa_map_file *map_file; + phys_addr_t pa; + + spin_lock(&domain->iotlb_lock); + map = vhost_iotlb_itree_first(domain->iotlb, (u64)dma_addr, + (u64)dma_addr + size - 1); + if (WARN_ON(!map)) { + spin_unlock(&domain->iotlb_lock); + return; + } + map_file = (struct vdpa_map_file *)map->opaque; + fput(map_file->file); + kfree(map_file); + pa = map->addr; + vhost_iotlb_map_free(domain->iotlb, map); + spin_unlock(&domain->iotlb_lock); + + vduse_domain_free_iova(iovad, dma_addr, size); + free_pages_exact(phys_to_virt(pa), size); +} + +static vm_fault_t vduse_domain_mmap_fault(struct vm_fault *vmf) +{ + struct vduse_iova_domain *domain = vmf->vma->vm_private_data; + unsigned long iova = vmf->pgoff << PAGE_SHIFT; + struct page *page; + + if (!domain) + return VM_FAULT_SIGBUS; + + if (iova < domain->bounce_size) + page = vduse_domain_get_bounce_page(domain, iova); + else + page = vduse_domain_get_coherent_page(domain, iova); + + if (!page) + return VM_FAULT_SIGBUS; + + vmf->page = page; + + return 0; +} + +static const struct vm_operations_struct vduse_domain_mmap_ops = { + .fault = vduse_domain_mmap_fault, +}; + +static int vduse_domain_mmap(struct file *file, 
struct vm_area_struct *vma) +{ + struct vduse_iova_domain *domain = file->private_data; + + vma->vm_flags |= VM_DONTDUMP | VM_DONTEXPAND; + vma->vm_private_data = domain; + vma->vm_ops = &vduse_domain_mmap_ops; + + return 0; +} + +static int vduse_domain_release(struct inode *inode, struct file *file) +{ + struct vduse_iova_domain *domain = file->private_data; + + spin_lock(&domain->iotlb_lock); + vduse_iotlb_del_range(domain, 0, ULLONG_MAX); + vduse_domain_free_bounce_pages(domain); + spin_unlock(&domain->iotlb_lock); + put_iova_domain(&domain->stream_iovad); + put_iova_domain(&domain->consistent_iovad); + vhost_iotlb_free(domain->iotlb); + vfree(domain->bounce_maps); + kfree(domain); + + return 0; +} + +static const struct file_operations vduse_domain_fops = { + .owner = THIS_MODULE, + .mmap = vduse_domain_mmap, + .release = vduse_domain_release, +}; + +void vduse_domain_destroy(struct vduse_iova_domain *domain) +{ + fput(domain->file); +} + +struct vduse_iova_domain * +vduse_domain_create(unsigned long iova_limit, size_t bounce_size) +{ + struct vduse_iova_domain *domain; + struct file *file; + struct vduse_bounce_map *map; + unsigned long pfn, bounce_pfns; + + bounce_pfns = PAGE_ALIGN(bounce_size) >> PAGE_SHIFT; + if (iova_limit <= bounce_size) + return NULL; + + domain = kzalloc(sizeof(*domain), GFP_KERNEL); + if (!domain) + return NULL; + + domain->iotlb = vhost_iotlb_alloc(0, 0); + if (!domain->iotlb) + goto err_iotlb; + + domain->iova_limit = iova_limit; + domain->bounce_size = PAGE_ALIGN(bounce_size); + domain->bounce_maps = vzalloc(bounce_pfns * + sizeof(struct vduse_bounce_map)); + if (!domain->bounce_maps) + goto err_map; + + for (pfn = 0; pfn < bounce_pfns; pfn++) { + map = &domain->bounce_maps[pfn]; + map->orig_phys = INVALID_PHYS_ADDR; + } + file = anon_inode_getfile("[vduse-domain]", &vduse_domain_fops, + domain, O_RDWR); + if (IS_ERR(file)) + goto err_file; + + domain->file = file; + spin_lock_init(&domain->iotlb_lock); + 
init_iova_domain(&domain->stream_iovad, + PAGE_SIZE, IOVA_START_PFN); + init_iova_domain(&domain->consistent_iovad, + PAGE_SIZE, bounce_pfns); + + return domain; +err_file: + vfree(domain->bounce_maps); +err_map: + vhost_iotlb_free(domain->iotlb); +err_iotlb: + kfree(domain); + return NULL; +} + +int vduse_domain_init(void) +{ + return iova_cache_get(); +} + +void vduse_domain_exit(void) +{ + iova_cache_put(); +} diff --git a/drivers/vdpa/vdpa_user/iova_domain.h b/drivers/vdpa/vdpa_user/iova_domain.h new file mode 100644 index 000000000000..2722d9b8e21a --- /dev/null +++ b/drivers/vdpa/vdpa_user/iova_domain.h @@ -0,0 +1,73 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * MMU-based software IOTLB. + * + * Copyright (C) 2020-2021 Bytedance Inc. and/or its affiliates. All rights reserved. + * + * Author: Xie Yongji <xieyongji@bytedance.com> + * + */ + +#ifndef _VDUSE_IOVA_DOMAIN_H +#define _VDUSE_IOVA_DOMAIN_H + +#include <linux/iova.h> +#include <linux/dma-mapping.h> +#include <linux/vhost_iotlb.h> + +#define IOVA_START_PFN 1 + +#define INVALID_PHYS_ADDR (~(phys_addr_t)0) + +struct vduse_bounce_map { + struct page *bounce_page; + u64 orig_phys; +}; + +struct vduse_iova_domain { + struct iova_domain stream_iovad; + struct iova_domain consistent_iovad; + struct vduse_bounce_map *bounce_maps; + size_t bounce_size; + unsigned long iova_limit; + int bounce_map; + struct vhost_iotlb *iotlb; + spinlock_t iotlb_lock; + struct file *file; +}; + +int vduse_domain_set_map(struct vduse_iova_domain *domain, + struct vhost_iotlb *iotlb); + +void vduse_domain_clear_map(struct vduse_iova_domain *domain, + struct vhost_iotlb *iotlb); + +dma_addr_t vduse_domain_map_page(struct vduse_iova_domain *domain, + struct page *page, unsigned long offset, + size_t size, enum dma_data_direction dir, + unsigned long attrs); + +void vduse_domain_unmap_page(struct vduse_iova_domain *domain, + dma_addr_t dma_addr, size_t size, + enum dma_data_direction dir, unsigned long attrs); + +void 
*vduse_domain_alloc_coherent(struct vduse_iova_domain *domain, + size_t size, dma_addr_t *dma_addr, + gfp_t flag, unsigned long attrs); + +void vduse_domain_free_coherent(struct vduse_iova_domain *domain, size_t size, + void *vaddr, dma_addr_t dma_addr, + unsigned long attrs); + +void vduse_domain_reset_bounce_map(struct vduse_iova_domain *domain); + +void vduse_domain_destroy(struct vduse_iova_domain *domain); + +struct vduse_iova_domain *vduse_domain_create(unsigned long iova_limit, + size_t bounce_size); + +int vduse_domain_init(void); + +void vduse_domain_exit(void); + +#endif /* _VDUSE_IOVA_DOMAIN_H */ diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c new file mode 100644 index 000000000000..29a38ecba19e --- /dev/null +++ b/drivers/vdpa/vdpa_user/vduse_dev.c @@ -0,0 +1,1641 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * VDUSE: vDPA Device in Userspace + * + * Copyright (C) 2020-2021 Bytedance Inc. and/or its affiliates. All rights reserved. + * + * Author: Xie Yongji <xieyongji@bytedance.com> + * + */ + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/cdev.h> +#include <linux/device.h> +#include <linux/eventfd.h> +#include <linux/slab.h> +#include <linux/wait.h> +#include <linux/dma-map-ops.h> +#include <linux/poll.h> +#include <linux/file.h> +#include <linux/uio.h> +#include <linux/vdpa.h> +#include <linux/nospec.h> +#include <uapi/linux/vduse.h> +#include <uapi/linux/vdpa.h> +#include <uapi/linux/virtio_config.h> +#include <uapi/linux/virtio_ids.h> +#include <uapi/linux/virtio_blk.h> +#include <linux/mod_devicetable.h> + +#include "iova_domain.h" + +#define DRV_AUTHOR "Yongji Xie <xieyongji@bytedance.com>" +#define DRV_DESC "vDPA Device in Userspace" +#define DRV_LICENSE "GPL v2" + +#define VDUSE_DEV_MAX (1U << MINORBITS) +#define VDUSE_BOUNCE_SIZE (64 * 1024 * 1024) +#define VDUSE_IOVA_SIZE (128 * 1024 * 1024) +#define VDUSE_MSG_DEFAULT_TIMEOUT 30 + +struct vduse_virtqueue { + u16 index; + u16 
num_max; + u32 num; + u64 desc_addr; + u64 driver_addr; + u64 device_addr; + struct vdpa_vq_state state; + bool ready; + bool kicked; + spinlock_t kick_lock; + spinlock_t irq_lock; + struct eventfd_ctx *kickfd; + struct vdpa_callback cb; + struct work_struct inject; + struct work_struct kick; +}; + +struct vduse_dev; + +struct vduse_vdpa { + struct vdpa_device vdpa; + struct vduse_dev *dev; +}; + +struct vduse_dev { + struct vduse_vdpa *vdev; + struct device *dev; + struct vduse_virtqueue *vqs; + struct vduse_iova_domain *domain; + char *name; + struct mutex lock; + spinlock_t msg_lock; + u64 msg_unique; + u32 msg_timeout; + wait_queue_head_t waitq; + struct list_head send_list; + struct list_head recv_list; + struct vdpa_callback config_cb; + struct work_struct inject; + spinlock_t irq_lock; + int minor; + bool broken; + bool connected; + u64 api_version; + u64 device_features; + u64 driver_features; + u32 device_id; + u32 vendor_id; + u32 generation; + u32 config_size; + void *config; + u8 status; + u32 vq_num; + u32 vq_align; +}; + +struct vduse_dev_msg { + struct vduse_dev_request req; + struct vduse_dev_response resp; + struct list_head list; + wait_queue_head_t waitq; + bool completed; +}; + +struct vduse_control { + u64 api_version; +}; + +static DEFINE_MUTEX(vduse_lock); +static DEFINE_IDR(vduse_idr); + +static dev_t vduse_major; +static struct class *vduse_class; +static struct cdev vduse_ctrl_cdev; +static struct cdev vduse_cdev; +static struct workqueue_struct *vduse_irq_wq; + +static u32 allowed_device_id[] = { + VIRTIO_ID_BLOCK, +}; + +static inline struct vduse_dev *vdpa_to_vduse(struct vdpa_device *vdpa) +{ + struct vduse_vdpa *vdev = container_of(vdpa, struct vduse_vdpa, vdpa); + + return vdev->dev; +} + +static inline struct vduse_dev *dev_to_vduse(struct device *dev) +{ + struct vdpa_device *vdpa = dev_to_vdpa(dev); + + return vdpa_to_vduse(vdpa); +} + +static struct vduse_dev_msg *vduse_find_msg(struct list_head *head, + uint32_t request_id) +{ + 
struct vduse_dev_msg *msg; + + list_for_each_entry(msg, head, list) { + if (msg->req.request_id == request_id) { + list_del(&msg->list); + return msg; + } + } + + return NULL; +} + +static struct vduse_dev_msg *vduse_dequeue_msg(struct list_head *head) +{ + struct vduse_dev_msg *msg = NULL; + + if (!list_empty(head)) { + msg = list_first_entry(head, struct vduse_dev_msg, list); + list_del(&msg->list); + } + + return msg; +} + +static void vduse_enqueue_msg(struct list_head *head, + struct vduse_dev_msg *msg) +{ + list_add_tail(&msg->list, head); +} + +static void vduse_dev_broken(struct vduse_dev *dev) +{ + struct vduse_dev_msg *msg, *tmp; + + if (unlikely(dev->broken)) + return; + + list_splice_init(&dev->recv_list, &dev->send_list); + list_for_each_entry_safe(msg, tmp, &dev->send_list, list) { + list_del(&msg->list); + msg->completed = 1; + msg->resp.result = VDUSE_REQ_RESULT_FAILED; + wake_up(&msg->waitq); + } + dev->broken = true; + wake_up(&dev->waitq); +} + +static int vduse_dev_msg_sync(struct vduse_dev *dev, + struct vduse_dev_msg *msg) +{ + int ret; + + if (unlikely(dev->broken)) + return -EIO; + + init_waitqueue_head(&msg->waitq); + spin_lock(&dev->msg_lock); + if (unlikely(dev->broken)) { + spin_unlock(&dev->msg_lock); + return -EIO; + } + msg->req.request_id = dev->msg_unique++; + vduse_enqueue_msg(&dev->send_list, msg); + wake_up(&dev->waitq); + spin_unlock(&dev->msg_lock); + if (dev->msg_timeout) + ret = wait_event_killable_timeout(msg->waitq, msg->completed, + (long)dev->msg_timeout * HZ); + else + ret = wait_event_killable(msg->waitq, msg->completed); + + spin_lock(&dev->msg_lock); + if (!msg->completed) { + list_del(&msg->list); + msg->resp.result = VDUSE_REQ_RESULT_FAILED; + /* Mark the device as malfunction when there is a timeout */ + if (!ret) + vduse_dev_broken(dev); + } + ret = (msg->resp.result == VDUSE_REQ_RESULT_OK) ? 
0 : -EIO; + spin_unlock(&dev->msg_lock); + + return ret; +} + +static int vduse_dev_get_vq_state_packed(struct vduse_dev *dev, + struct vduse_virtqueue *vq, + struct vdpa_vq_state_packed *packed) +{ + struct vduse_dev_msg msg = { 0 }; + int ret; + + msg.req.type = VDUSE_GET_VQ_STATE; + msg.req.vq_state.index = vq->index; + + ret = vduse_dev_msg_sync(dev, &msg); + if (ret) + return ret; + + packed->last_avail_counter = + msg.resp.vq_state.packed.last_avail_counter & 0x0001; + packed->last_avail_idx = + msg.resp.vq_state.packed.last_avail_idx & 0x7FFF; + packed->last_used_counter = + msg.resp.vq_state.packed.last_used_counter & 0x0001; + packed->last_used_idx = + msg.resp.vq_state.packed.last_used_idx & 0x7FFF; + + return 0; +} + +static int vduse_dev_get_vq_state_split(struct vduse_dev *dev, + struct vduse_virtqueue *vq, + struct vdpa_vq_state_split *split) +{ + struct vduse_dev_msg msg = { 0 }; + int ret; + + msg.req.type = VDUSE_GET_VQ_STATE; + msg.req.vq_state.index = vq->index; + + ret = vduse_dev_msg_sync(dev, &msg); + if (ret) + return ret; + + split->avail_index = msg.resp.vq_state.split.avail_index; + + return 0; +} + +static int vduse_dev_set_status(struct vduse_dev *dev, u8 status) +{ + struct vduse_dev_msg msg = { 0 }; + + msg.req.type = VDUSE_SET_STATUS; + msg.req.s.status = status; + + return vduse_dev_msg_sync(dev, &msg); +} + +static int vduse_dev_update_iotlb(struct vduse_dev *dev, + u64 start, u64 last) +{ + struct vduse_dev_msg msg = { 0 }; + + if (last < start) + return -EINVAL; + + msg.req.type = VDUSE_UPDATE_IOTLB; + msg.req.iova.start = start; + msg.req.iova.last = last; + + return vduse_dev_msg_sync(dev, &msg); +} + +static ssize_t vduse_dev_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct file *file = iocb->ki_filp; + struct vduse_dev *dev = file->private_data; + struct vduse_dev_msg *msg; + int size = sizeof(struct vduse_dev_request); + ssize_t ret; + + if (iov_iter_count(to) < size) + return -EINVAL; + + 
spin_lock(&dev->msg_lock); + while (1) { + msg = vduse_dequeue_msg(&dev->send_list); + if (msg) + break; + + ret = -EAGAIN; + if (file->f_flags & O_NONBLOCK) + goto unlock; + + spin_unlock(&dev->msg_lock); + ret = wait_event_interruptible_exclusive(dev->waitq, + !list_empty(&dev->send_list)); + if (ret) + return ret; + + spin_lock(&dev->msg_lock); + } + spin_unlock(&dev->msg_lock); + ret = copy_to_iter(&msg->req, size, to); + spin_lock(&dev->msg_lock); + if (ret != size) { + ret = -EFAULT; + vduse_enqueue_msg(&dev->send_list, msg); + goto unlock; + } + vduse_enqueue_msg(&dev->recv_list, msg); +unlock: + spin_unlock(&dev->msg_lock); + + return ret; +} + +static bool is_mem_zero(const char *ptr, int size) +{ + int i; + + for (i = 0; i < size; i++) { + if (ptr[i]) + return false; + } + return true; +} + +static ssize_t vduse_dev_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct vduse_dev *dev = file->private_data; + struct vduse_dev_response resp; + struct vduse_dev_msg *msg; + size_t ret; + + ret = copy_from_iter(&resp, sizeof(resp), from); + if (ret != sizeof(resp)) + return -EINVAL; + + if (!is_mem_zero((const char *)resp.reserved, sizeof(resp.reserved))) + return -EINVAL; + + spin_lock(&dev->msg_lock); + msg = vduse_find_msg(&dev->recv_list, resp.request_id); + if (!msg) { + ret = -ENOENT; + goto unlock; + } + + memcpy(&msg->resp, &resp, sizeof(resp)); + msg->completed = 1; + wake_up(&msg->waitq); +unlock: + spin_unlock(&dev->msg_lock); + + return ret; +} + +static __poll_t vduse_dev_poll(struct file *file, poll_table *wait) +{ + struct vduse_dev *dev = file->private_data; + __poll_t mask = 0; + + poll_wait(file, &dev->waitq, wait); + + spin_lock(&dev->msg_lock); + + if (unlikely(dev->broken)) + mask |= EPOLLERR; + if (!list_empty(&dev->send_list)) + mask |= EPOLLIN | EPOLLRDNORM; + if (!list_empty(&dev->recv_list)) + mask |= EPOLLOUT | EPOLLWRNORM; + + spin_unlock(&dev->msg_lock); + + return mask; +} + +static 
void vduse_dev_reset(struct vduse_dev *dev) +{ + int i; + struct vduse_iova_domain *domain = dev->domain; + + /* The coherent mappings are handled in vduse_dev_free_coherent() */ + if (domain->bounce_map) + vduse_domain_reset_bounce_map(domain); + + dev->status = 0; + dev->driver_features = 0; + dev->generation++; + spin_lock(&dev->irq_lock); + dev->config_cb.callback = NULL; + dev->config_cb.private = NULL; + spin_unlock(&dev->irq_lock); + flush_work(&dev->inject); + + for (i = 0; i < dev->vq_num; i++) { + struct vduse_virtqueue *vq = &dev->vqs[i]; + + vq->ready = false; + vq->desc_addr = 0; + vq->driver_addr = 0; + vq->device_addr = 0; + vq->num = 0; + memset(&vq->state, 0, sizeof(vq->state)); + + spin_lock(&vq->kick_lock); + vq->kicked = false; + if (vq->kickfd) + eventfd_ctx_put(vq->kickfd); + vq->kickfd = NULL; + spin_unlock(&vq->kick_lock); + + spin_lock(&vq->irq_lock); + vq->cb.callback = NULL; + vq->cb.private = NULL; + spin_unlock(&vq->irq_lock); + flush_work(&vq->inject); + flush_work(&vq->kick); + } +} + +static int vduse_vdpa_set_vq_address(struct vdpa_device *vdpa, u16 idx, + u64 desc_area, u64 driver_area, + u64 device_area) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + struct vduse_virtqueue *vq = &dev->vqs[idx]; + + vq->desc_addr = desc_area; + vq->driver_addr = driver_area; + vq->device_addr = device_area; + + return 0; +} + +static void vduse_vq_kick(struct vduse_virtqueue *vq) +{ + spin_lock(&vq->kick_lock); + if (!vq->ready) + goto unlock; + + if (vq->kickfd) + eventfd_signal(vq->kickfd, 1); + else + vq->kicked = true; +unlock: + spin_unlock(&vq->kick_lock); +} + +static void vduse_vq_kick_work(struct work_struct *work) +{ + struct vduse_virtqueue *vq = container_of(work, + struct vduse_virtqueue, kick); + + vduse_vq_kick(vq); +} + +static void vduse_vdpa_kick_vq(struct vdpa_device *vdpa, u16 idx) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + struct vduse_virtqueue *vq = &dev->vqs[idx]; + + if (!eventfd_signal_allowed()) { + 
schedule_work(&vq->kick); + return; + } + vduse_vq_kick(vq); +} + +static void vduse_vdpa_set_vq_cb(struct vdpa_device *vdpa, u16 idx, + struct vdpa_callback *cb) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + struct vduse_virtqueue *vq = &dev->vqs[idx]; + + spin_lock(&vq->irq_lock); + vq->cb.callback = cb->callback; + vq->cb.private = cb->private; + spin_unlock(&vq->irq_lock); +} + +static void vduse_vdpa_set_vq_num(struct vdpa_device *vdpa, u16 idx, u32 num) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + struct vduse_virtqueue *vq = &dev->vqs[idx]; + + vq->num = num; +} + +static void vduse_vdpa_set_vq_ready(struct vdpa_device *vdpa, + u16 idx, bool ready) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + struct vduse_virtqueue *vq = &dev->vqs[idx]; + + vq->ready = ready; +} + +static bool vduse_vdpa_get_vq_ready(struct vdpa_device *vdpa, u16 idx) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + struct vduse_virtqueue *vq = &dev->vqs[idx]; + + return vq->ready; +} + +static int vduse_vdpa_set_vq_state(struct vdpa_device *vdpa, u16 idx, + const struct vdpa_vq_state *state) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + struct vduse_virtqueue *vq = &dev->vqs[idx]; + + if (dev->driver_features & BIT_ULL(VIRTIO_F_RING_PACKED)) { + vq->state.packed.last_avail_counter = + state->packed.last_avail_counter; + vq->state.packed.last_avail_idx = state->packed.last_avail_idx; + vq->state.packed.last_used_counter = + state->packed.last_used_counter; + vq->state.packed.last_used_idx = state->packed.last_used_idx; + } else + vq->state.split.avail_index = state->split.avail_index; + + return 0; +} + +static int vduse_vdpa_get_vq_state(struct vdpa_device *vdpa, u16 idx, + struct vdpa_vq_state *state) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + struct vduse_virtqueue *vq = &dev->vqs[idx]; + + if (dev->driver_features & BIT_ULL(VIRTIO_F_RING_PACKED)) + return vduse_dev_get_vq_state_packed(dev, vq, &state->packed); + + return 
vduse_dev_get_vq_state_split(dev, vq, &state->split); +} + +static u32 vduse_vdpa_get_vq_align(struct vdpa_device *vdpa) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + + return dev->vq_align; +} + +static u64 vduse_vdpa_get_features(struct vdpa_device *vdpa) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + + return dev->device_features; +} + +static int vduse_vdpa_set_features(struct vdpa_device *vdpa, u64 features) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + + dev->driver_features = features; + return 0; +} + +static void vduse_vdpa_set_config_cb(struct vdpa_device *vdpa, + struct vdpa_callback *cb) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + + spin_lock(&dev->irq_lock); + dev->config_cb.callback = cb->callback; + dev->config_cb.private = cb->private; + spin_unlock(&dev->irq_lock); +} + +static u16 vduse_vdpa_get_vq_num_max(struct vdpa_device *vdpa) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + u16 num_max = 0; + int i; + + for (i = 0; i < dev->vq_num; i++) + if (num_max < dev->vqs[i].num_max) + num_max = dev->vqs[i].num_max; + + return num_max; +} + +static u32 vduse_vdpa_get_device_id(struct vdpa_device *vdpa) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + + return dev->device_id; +} + +static u32 vduse_vdpa_get_vendor_id(struct vdpa_device *vdpa) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + + return dev->vendor_id; +} + +static u8 vduse_vdpa_get_status(struct vdpa_device *vdpa) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + + return dev->status; +} + +static void vduse_vdpa_set_status(struct vdpa_device *vdpa, u8 status) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + + if (vduse_dev_set_status(dev, status)) + return; + + dev->status = status; +} + +static size_t vduse_vdpa_get_config_size(struct vdpa_device *vdpa) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + + return dev->config_size; +} + +static void vduse_vdpa_get_config(struct vdpa_device *vdpa, unsigned int offset, + void *buf, unsigned int len) +{ 
+ struct vduse_dev *dev = vdpa_to_vduse(vdpa); + + /* + * Zero the destination first so the caller never sees stale memory + * when the request is rejected or only partly inside the config space. + */ + memset(buf, 0, len); + + /* + * Test the offset before subtracting: the arithmetic is unsigned, so + * config_size - offset would wrap for an offset past the end and the + * length check below would wrongly pass. + */ + if (offset > dev->config_size) + return; + + /* Copy only the bytes that really exist in the config space */ + if (len > dev->config_size - offset) + len = dev->config_size - offset; + + memcpy(buf, dev->config + offset, len); +} + +static void vduse_vdpa_set_config(struct vdpa_device *vdpa, unsigned int offset, + const void *buf, unsigned int len) +{ + /* Now we only support read-only configuration space */ +} + +/* + * Send a status-0 request and, if it succeeds, reset the kernel-side + * device state. Returns -EIO when the request fails. + */ +static int vduse_vdpa_reset(struct vdpa_device *vdpa) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + + if (vduse_dev_set_status(dev, 0)) + return -EIO; + + vduse_dev_reset(dev); + + return 0; +} + +static u32 vduse_vdpa_get_generation(struct vdpa_device *vdpa) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + + return dev->generation; +} + +/* + * Install @iotlb in the IOVA domain, then send an IOTLB update covering + * the whole address range. The domain map is cleared again if that fails. + */ +static int vduse_vdpa_set_map(struct vdpa_device *vdpa, + struct vhost_iotlb *iotlb) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + int ret; + + ret = vduse_domain_set_map(dev->domain, iotlb); + if (ret) + return ret; + + ret = vduse_dev_update_iotlb(dev, 0ULL, ULLONG_MAX); + if (ret) { + vduse_domain_clear_map(dev->domain, iotlb); + return ret; + } + + return 0; +} + +static void vduse_vdpa_free(struct vdpa_device *vdpa) +{ + struct vduse_dev *dev = vdpa_to_vduse(vdpa); + + dev->vdev = NULL; +} + +static const struct vdpa_config_ops vduse_vdpa_config_ops = { + .set_vq_address = vduse_vdpa_set_vq_address, + .kick_vq = vduse_vdpa_kick_vq, + .set_vq_cb = vduse_vdpa_set_vq_cb, + .set_vq_num = vduse_vdpa_set_vq_num, + .set_vq_ready = vduse_vdpa_set_vq_ready, + .get_vq_ready = vduse_vdpa_get_vq_ready, + .set_vq_state = vduse_vdpa_set_vq_state, + .get_vq_state = vduse_vdpa_get_vq_state, + .get_vq_align = vduse_vdpa_get_vq_align, + .get_features = vduse_vdpa_get_features, + .set_features = vduse_vdpa_set_features, + .set_config_cb = vduse_vdpa_set_config_cb, + .get_vq_num_max = vduse_vdpa_get_vq_num_max, + .get_device_id = vduse_vdpa_get_device_id, + .get_vendor_id = vduse_vdpa_get_vendor_id, + .get_status = vduse_vdpa_get_status, + .set_status = vduse_vdpa_set_status, + .get_config_size = 
vduse_vdpa_get_config_size, + .get_config = vduse_vdpa_get_config, + .set_config = vduse_vdpa_set_config, + .get_generation = vduse_vdpa_get_generation, + .reset = vduse_vdpa_reset, + .set_map = vduse_vdpa_set_map, + .free = vduse_vdpa_free, +}; + +static dma_addr_t vduse_dev_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t size, + enum dma_data_direction dir, + unsigned long attrs) +{ + struct vduse_dev *vdev = dev_to_vduse(dev); + struct vduse_iova_domain *domain = vdev->domain; + + return vduse_domain_map_page(domain, page, offset, size, dir, attrs); +} + +static void vduse_dev_unmap_page(struct device *dev, dma_addr_t dma_addr, + size_t size, enum dma_data_direction dir, + unsigned long attrs) +{ + struct vduse_dev *vdev = dev_to_vduse(dev); + struct vduse_iova_domain *domain = vdev->domain; + + vduse_domain_unmap_page(domain, dma_addr, size, dir, attrs); +} + +/* + * Allocate a coherent buffer from the device's IOVA domain. Returns the + * kernel address and stores the IOVA in @dma_addr, or returns NULL with + * @dma_addr set to DMA_MAPPING_ERROR. + */ +static void *vduse_dev_alloc_coherent(struct device *dev, size_t size, + dma_addr_t *dma_addr, gfp_t flag, + unsigned long attrs) +{ + struct vduse_dev *vdev = dev_to_vduse(dev); + struct vduse_iova_domain *domain = vdev->domain; + dma_addr_t iova; + void *addr; + + *dma_addr = DMA_MAPPING_ERROR; + /* + * Hand the callee a real dma_addr_t: it stores through this pointer + * on both success and failure, and dma_addr_t may be wider than + * unsigned long. + */ + addr = vduse_domain_alloc_coherent(domain, size, &iova, flag, attrs); + if (!addr) + return NULL; + + *dma_addr = iova; + + return addr; +} + +static void vduse_dev_free_coherent(struct device *dev, size_t size, + void *vaddr, dma_addr_t dma_addr, + unsigned long attrs) +{ + struct vduse_dev *vdev = dev_to_vduse(dev); + struct vduse_iova_domain *domain = vdev->domain; + + vduse_domain_free_coherent(domain, size, vaddr, dma_addr, attrs); +} + +static size_t vduse_dev_max_mapping_size(struct device *dev) +{ + struct vduse_dev *vdev = dev_to_vduse(dev); + struct vduse_iova_domain *domain = vdev->domain; + + return domain->bounce_size; +} + +static const struct dma_map_ops vduse_dev_dma_ops = { + .map_page = vduse_dev_map_page, + .unmap_page = 
vduse_dev_unmap_page, + .alloc = vduse_dev_alloc_coherent, + .free = vduse_dev_free_coherent, + .max_mapping_size = vduse_dev_max_mapping_size, +}; + +static unsigned int perm_to_file_flags(u8 perm) +{ + unsigned int flags = 0; + + switch (perm) { + case VDUSE_ACCESS_WO: + flags |= O_WRONLY; + break; + case VDUSE_ACCESS_RO: + flags |= O_RDONLY; + break; + case VDUSE_ACCESS_RW: + flags |= O_RDWR; + break; + default: + WARN(1, "invalidate vhost IOTLB permission\n"); + break; + } + + return flags; +} + +static int vduse_kickfd_setup(struct vduse_dev *dev, + struct vduse_vq_eventfd *eventfd) +{ + struct eventfd_ctx *ctx = NULL; + struct vduse_virtqueue *vq; + u32 index; + + if (eventfd->index >= dev->vq_num) + return -EINVAL; + + index = array_index_nospec(eventfd->index, dev->vq_num); + vq = &dev->vqs[index]; + if (eventfd->fd >= 0) { + ctx = eventfd_ctx_fdget(eventfd->fd); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + } else if (eventfd->fd != VDUSE_EVENTFD_DEASSIGN) + return 0; + + spin_lock(&vq->kick_lock); + if (vq->kickfd) + eventfd_ctx_put(vq->kickfd); + vq->kickfd = ctx; + if (vq->ready && vq->kicked && vq->kickfd) { + eventfd_signal(vq->kickfd, 1); + vq->kicked = false; + } + spin_unlock(&vq->kick_lock); + + return 0; +} + +static bool vduse_dev_is_ready(struct vduse_dev *dev) +{ + int i; + + for (i = 0; i < dev->vq_num; i++) + if (!dev->vqs[i].num_max) + return false; + + return true; +} + +static void vduse_dev_irq_inject(struct work_struct *work) +{ + struct vduse_dev *dev = container_of(work, struct vduse_dev, inject); + + spin_lock_irq(&dev->irq_lock); + if (dev->config_cb.callback) + dev->config_cb.callback(dev->config_cb.private); + spin_unlock_irq(&dev->irq_lock); +} + +static void vduse_vq_irq_inject(struct work_struct *work) +{ + struct vduse_virtqueue *vq = container_of(work, + struct vduse_virtqueue, inject); + + spin_lock_irq(&vq->irq_lock); + if (vq->ready && vq->cb.callback) + vq->cb.callback(vq->cb.private); + spin_unlock_irq(&vq->irq_lock); +} + 
+static long vduse_dev_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct vduse_dev *dev = file->private_data; + void __user *argp = (void __user *)arg; + int ret; + + if (unlikely(dev->broken)) + return -EPERM; + + switch (cmd) { + case VDUSE_IOTLB_GET_FD: { + struct vduse_iotlb_entry entry; + struct vhost_iotlb_map *map; + struct vdpa_map_file *map_file; + struct vduse_iova_domain *domain = dev->domain; + struct file *f = NULL; + + ret = -EFAULT; + if (copy_from_user(&entry, argp, sizeof(entry))) + break; + + ret = -EINVAL; + if (entry.start > entry.last) + break; + + spin_lock(&domain->iotlb_lock); + map = vhost_iotlb_itree_first(domain->iotlb, + entry.start, entry.last); + if (map) { + map_file = (struct vdpa_map_file *)map->opaque; + f = get_file(map_file->file); + entry.offset = map_file->offset; + entry.start = map->start; + entry.last = map->last; + entry.perm = map->perm; + } + spin_unlock(&domain->iotlb_lock); + ret = -EINVAL; + if (!f) + break; + + ret = -EFAULT; + if (copy_to_user(argp, &entry, sizeof(entry))) { + fput(f); + break; + } + ret = receive_fd(f, perm_to_file_flags(entry.perm)); + fput(f); + break; + } + case VDUSE_DEV_GET_FEATURES: + /* + * Just mirror what driver wrote here. + * The driver is expected to check FEATURE_OK later. 
+ */ + ret = put_user(dev->driver_features, (u64 __user *)argp); + break; + case VDUSE_DEV_SET_CONFIG: { + struct vduse_config_data config; + unsigned long size = offsetof(struct vduse_config_data, + buffer); + + ret = -EFAULT; + if (copy_from_user(&config, argp, size)) + break; + + ret = -EINVAL; + /* + * Check the offset on its own first: the subtraction below is + * unsigned and would wrap for an offset beyond config_size, + * letting an out-of-range write into dev->config through. + */ + if (config.offset > dev->config_size || + config.length == 0 || + config.length > dev->config_size - config.offset) + break; + + ret = -EFAULT; + if (copy_from_user(dev->config + config.offset, argp + size, + config.length)) + break; + + ret = 0; + break; + } + case VDUSE_DEV_INJECT_CONFIG_IRQ: + ret = 0; + queue_work(vduse_irq_wq, &dev->inject); + break; + case VDUSE_VQ_SETUP: { + struct vduse_vq_config config; + u32 index; + + ret = -EFAULT; + if (copy_from_user(&config, argp, sizeof(config))) + break; + + ret = -EINVAL; + if (config.index >= dev->vq_num) + break; + + if (!is_mem_zero((const char *)config.reserved, + sizeof(config.reserved))) + break; + + index = array_index_nospec(config.index, dev->vq_num); + dev->vqs[index].num_max = config.max_size; + ret = 0; + break; + } + case VDUSE_VQ_GET_INFO: { + struct vduse_vq_info vq_info; + struct vduse_virtqueue *vq; + u32 index; + + ret = -EFAULT; + if (copy_from_user(&vq_info, argp, sizeof(vq_info))) + break; + + ret = -EINVAL; + if (vq_info.index >= dev->vq_num) + break; + + index = array_index_nospec(vq_info.index, dev->vq_num); + vq = &dev->vqs[index]; + vq_info.desc_addr = vq->desc_addr; + vq_info.driver_addr = vq->driver_addr; + vq_info.device_addr = vq->device_addr; + vq_info.num = vq->num; + + if (dev->driver_features & BIT_ULL(VIRTIO_F_RING_PACKED)) { + vq_info.packed.last_avail_counter = + vq->state.packed.last_avail_counter; + vq_info.packed.last_avail_idx = + vq->state.packed.last_avail_idx; + vq_info.packed.last_used_counter = + vq->state.packed.last_used_counter; + vq_info.packed.last_used_idx = + vq->state.packed.last_used_idx; + } else + vq_info.split.avail_index = + vq->state.split.avail_index; + + vq_info.ready = vq->ready; + + ret = 
-EFAULT; + if (copy_to_user(argp, &vq_info, sizeof(vq_info))) + break; + + ret = 0; + break; + } + case VDUSE_VQ_SETUP_KICKFD: { + struct vduse_vq_eventfd eventfd; + + ret = -EFAULT; + if (copy_from_user(&eventfd, argp, sizeof(eventfd))) + break; + + ret = vduse_kickfd_setup(dev, &eventfd); + break; + } + case VDUSE_VQ_INJECT_IRQ: { + u32 index; + + ret = -EFAULT; + if (get_user(index, (u32 __user *)argp)) + break; + + ret = -EINVAL; + if (index >= dev->vq_num) + break; + + ret = 0; + index = array_index_nospec(index, dev->vq_num); + queue_work(vduse_irq_wq, &dev->vqs[index].inject); + break; + } + default: + ret = -ENOIOCTLCMD; + break; + } + + return ret; +} + /* + * Release handler for the device file: moves every message on recv_list + * back to send_list under msg_lock, then clears the connected flag. + */ +static int vduse_dev_release(struct inode *inode, struct file *file) +{ + struct vduse_dev *dev = file->private_data; + + spin_lock(&dev->msg_lock); + /* Make sure the in-flight messages can be processed after reconnection */ + list_splice_init(&dev->recv_list, &dev->send_list); + spin_unlock(&dev->msg_lock); + dev->connected = false; + + return 0; +} + /* Look up the device registered under @minor in vduse_idr, or NULL. */ +static struct vduse_dev *vduse_dev_get_from_minor(int minor) +{ + struct vduse_dev *dev; + + mutex_lock(&vduse_lock); + dev = idr_find(&vduse_idr, minor); + mutex_unlock(&vduse_lock); + + return dev; +} + /* + * Open handler for the device file. Returns -ENODEV when no device is + * registered for the inode's minor and -EBUSY when the device is already + * marked connected; otherwise marks it connected and stores it in + * file->private_data. + */ +static int vduse_dev_open(struct inode *inode, struct file *file) +{ + int ret; + struct vduse_dev *dev = vduse_dev_get_from_minor(iminor(inode)); + + if (!dev) + return -ENODEV; + + ret = -EBUSY; + mutex_lock(&dev->lock); + if (dev->connected) + goto unlock; + + ret = 0; + dev->connected = true; + file->private_data = dev; +unlock: + mutex_unlock(&dev->lock); + + return ret; +} + +static const struct file_operations vduse_dev_fops = { + .owner = THIS_MODULE, + .open = vduse_dev_open, + .release = vduse_dev_release, + .read_iter = vduse_dev_read_iter, + .write_iter = vduse_dev_write_iter, + .poll = vduse_dev_poll, + .unlocked_ioctl = vduse_dev_ioctl, + .compat_ioctl = compat_ptr_ioctl, + .llseek = noop_llseek, +}; + +static struct vduse_dev 
*vduse_dev_create(void) +{ + struct vduse_dev *dev = kzalloc(sizeof(*dev), GFP_KERNEL); + + if (!dev) + return NULL; + + mutex_init(&dev->lock); + spin_lock_init(&dev->msg_lock); + INIT_LIST_HEAD(&dev->send_list); + INIT_LIST_HEAD(&dev->recv_list); + spin_lock_init(&dev->irq_lock); + + INIT_WORK(&dev->inject, vduse_dev_irq_inject); + init_waitqueue_head(&dev->waitq); + + return dev; +} + +static void vduse_dev_destroy(struct vduse_dev *dev) +{ + kfree(dev); +} + +static struct vduse_dev *vduse_find_dev(const char *name) +{ + struct vduse_dev *dev; + int id; + + idr_for_each_entry(&vduse_idr, dev, id) + if (!strcmp(dev->name, name)) + return dev; + + return NULL; +} + +static int vduse_destroy_dev(char *name) +{ + struct vduse_dev *dev = vduse_find_dev(name); + + if (!dev) + return -EINVAL; + + mutex_lock(&dev->lock); + if (dev->vdev || dev->connected) { + mutex_unlock(&dev->lock); + return -EBUSY; + } + dev->connected = true; + mutex_unlock(&dev->lock); + + vduse_dev_reset(dev); + device_destroy(vduse_class, MKDEV(MAJOR(vduse_major), dev->minor)); + idr_remove(&vduse_idr, dev->minor); + kvfree(dev->config); + kfree(dev->vqs); + vduse_domain_destroy(dev->domain); + kfree(dev->name); + vduse_dev_destroy(dev); + module_put(THIS_MODULE); + + return 0; +} + +static bool device_is_allowed(u32 device_id) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(allowed_device_id); i++) + if (allowed_device_id[i] == device_id) + return true; + + return false; +} + +static bool features_is_valid(u64 features) +{ + if (!(features & (1ULL << VIRTIO_F_ACCESS_PLATFORM))) + return false; + + /* Now we only support read-only configuration space */ + if (features & (1ULL << VIRTIO_BLK_F_CONFIG_WCE)) + return false; + + return true; +} + +/* + * Sanity-check a device configuration before a device is created from + * it. Returns false when any field is out of range or unsupported. + */ +static bool vduse_validate_config(struct vduse_dev_config *config) +{ + if (!is_mem_zero((const char *)config->reserved, + sizeof(config->reserved))) + return false; + + if (config->vq_align > PAGE_SIZE) + return false; + + /* + * Virtqueues are indexed with a u16 (vq->index and the vdpa ops), + * so a larger count could never be addressed. + */ + if (config->vq_num > 0xffff) + return false; + + if (config->config_size > PAGE_SIZE) + 
return false; + + if (!device_is_allowed(config->device_id)) + return false; + + if (!features_is_valid(config->features)) + return false; + + return true; +} + +static ssize_t msg_timeout_show(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct vduse_dev *dev = dev_get_drvdata(device); + + return sysfs_emit(buf, "%u\n", dev->msg_timeout); +} + +static ssize_t msg_timeout_store(struct device *device, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct vduse_dev *dev = dev_get_drvdata(device); + int ret; + + ret = kstrtouint(buf, 10, &dev->msg_timeout); + if (ret < 0) + return ret; + + return count; +} + +static DEVICE_ATTR_RW(msg_timeout); + +static struct attribute *vduse_dev_attrs[] = { + &dev_attr_msg_timeout.attr, + NULL +}; + +ATTRIBUTE_GROUPS(vduse_dev); + +static int vduse_create_dev(struct vduse_dev_config *config, + void *config_buf, u64 api_version) +{ + int i, ret; + struct vduse_dev *dev; + + ret = -EEXIST; + if (vduse_find_dev(config->name)) + goto err; + + ret = -ENOMEM; + dev = vduse_dev_create(); + if (!dev) + goto err; + + dev->api_version = api_version; + dev->device_features = config->features; + dev->device_id = config->device_id; + dev->vendor_id = config->vendor_id; + dev->name = kstrdup(config->name, GFP_KERNEL); + if (!dev->name) + goto err_str; + + dev->domain = vduse_domain_create(VDUSE_IOVA_SIZE - 1, + VDUSE_BOUNCE_SIZE); + if (!dev->domain) + goto err_domain; + + dev->config = config_buf; + dev->config_size = config->config_size; + dev->vq_align = config->vq_align; + dev->vq_num = config->vq_num; + dev->vqs = kcalloc(dev->vq_num, sizeof(*dev->vqs), GFP_KERNEL); + if (!dev->vqs) + goto err_vqs; + + for (i = 0; i < dev->vq_num; i++) { + dev->vqs[i].index = i; + INIT_WORK(&dev->vqs[i].inject, vduse_vq_irq_inject); + INIT_WORK(&dev->vqs[i].kick, vduse_vq_kick_work); + spin_lock_init(&dev->vqs[i].kick_lock); + spin_lock_init(&dev->vqs[i].irq_lock); + } + + ret = idr_alloc(&vduse_idr, dev, 
1, VDUSE_DEV_MAX, GFP_KERNEL); + if (ret < 0) + goto err_idr; + + dev->minor = ret; + dev->msg_timeout = VDUSE_MSG_DEFAULT_TIMEOUT; + dev->dev = device_create(vduse_class, NULL, + MKDEV(MAJOR(vduse_major), dev->minor), + dev, "%s", config->name); + if (IS_ERR(dev->dev)) { + ret = PTR_ERR(dev->dev); + goto err_dev; + } + __module_get(THIS_MODULE); + + return 0; +err_dev: + idr_remove(&vduse_idr, dev->minor); +err_idr: + kfree(dev->vqs); +err_vqs: + vduse_domain_destroy(dev->domain); +err_domain: + kfree(dev->name); +err_str: + vduse_dev_destroy(dev); +err: + kvfree(config_buf); + return ret; +} + +static long vduse_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + int ret; + void __user *argp = (void __user *)arg; + struct vduse_control *control = file->private_data; + + mutex_lock(&vduse_lock); + switch (cmd) { + case VDUSE_GET_API_VERSION: + ret = put_user(control->api_version, (u64 __user *)argp); + break; + case VDUSE_SET_API_VERSION: { + u64 api_version; + + ret = -EFAULT; + if (get_user(api_version, (u64 __user *)argp)) + break; + + ret = -EINVAL; + if (api_version > VDUSE_API_VERSION) + break; + + ret = 0; + control->api_version = api_version; + break; + } + case VDUSE_CREATE_DEV: { + struct vduse_dev_config config; + unsigned long size = offsetof(struct vduse_dev_config, config); + void *buf; + + ret = -EFAULT; + if (copy_from_user(&config, argp, size)) + break; + + ret = -EINVAL; + if (vduse_validate_config(&config) == false) + break; + + buf = vmemdup_user(argp + size, config.config_size); + if (IS_ERR(buf)) { + ret = PTR_ERR(buf); + break; + } + config.name[VDUSE_NAME_MAX - 1] = '\0'; + ret = vduse_create_dev(&config, buf, control->api_version); + break; + } + case VDUSE_DESTROY_DEV: { + char name[VDUSE_NAME_MAX]; + + ret = -EFAULT; + if (copy_from_user(name, argp, VDUSE_NAME_MAX)) + break; + + name[VDUSE_NAME_MAX - 1] = '\0'; + ret = vduse_destroy_dev(name); + break; + } + default: + ret = -EINVAL; + break; + } + 
mutex_unlock(&vduse_lock); + + return ret; +} + +static int vduse_release(struct inode *inode, struct file *file) +{ + struct vduse_control *control = file->private_data; + + kfree(control); + return 0; +} + +static int vduse_open(struct inode *inode, struct file *file) +{ + struct vduse_control *control; + + control = kmalloc(sizeof(struct vduse_control), GFP_KERNEL); + if (!control) + return -ENOMEM; + + control->api_version = VDUSE_API_VERSION; + file->private_data = control; + + return 0; +} + +static const struct file_operations vduse_ctrl_fops = { + .owner = THIS_MODULE, + .open = vduse_open, + .release = vduse_release, + .unlocked_ioctl = vduse_ioctl, + .compat_ioctl = compat_ptr_ioctl, + .llseek = noop_llseek, +}; + +static char *vduse_devnode(struct device *dev, umode_t *mode) +{ + return kasprintf(GFP_KERNEL, "vduse/%s", dev_name(dev)); +} + +static void vduse_mgmtdev_release(struct device *dev) +{ +} + +static struct device vduse_mgmtdev = { + .init_name = "vduse", + .release = vduse_mgmtdev_release, +}; + +static struct vdpa_mgmt_dev mgmt_dev; + +static int vduse_dev_init_vdpa(struct vduse_dev *dev, const char *name) +{ + struct vduse_vdpa *vdev; + int ret; + + if (dev->vdev) + return -EEXIST; + + vdev = vdpa_alloc_device(struct vduse_vdpa, vdpa, dev->dev, + &vduse_vdpa_config_ops, name, true); + if (IS_ERR(vdev)) + return PTR_ERR(vdev); + + dev->vdev = vdev; + vdev->dev = dev; + vdev->vdpa.dev.dma_mask = &vdev->vdpa.dev.coherent_dma_mask; + ret = dma_set_mask_and_coherent(&vdev->vdpa.dev, DMA_BIT_MASK(64)); + if (ret) { + put_device(&vdev->vdpa.dev); + return ret; + } + set_dma_ops(&vdev->vdpa.dev, &vduse_dev_dma_ops); + vdev->vdpa.dma_dev = &vdev->vdpa.dev; + vdev->vdpa.mdev = &mgmt_dev; + + return 0; +} + +static int vdpa_dev_add(struct vdpa_mgmt_dev *mdev, const char *name) +{ + struct vduse_dev *dev; + int ret; + + mutex_lock(&vduse_lock); + dev = vduse_find_dev(name); + if (!dev || !vduse_dev_is_ready(dev)) { + mutex_unlock(&vduse_lock); + return 
-EINVAL; + } + ret = vduse_dev_init_vdpa(dev, name); + mutex_unlock(&vduse_lock); + if (ret) + return ret; + + ret = _vdpa_register_device(&dev->vdev->vdpa, dev->vq_num); + if (ret) { + put_device(&dev->vdev->vdpa.dev); + return ret; + } + + return 0; +} + +static void vdpa_dev_del(struct vdpa_mgmt_dev *mdev, struct vdpa_device *dev) +{ + _vdpa_unregister_device(dev); +} + +static const struct vdpa_mgmtdev_ops vdpa_dev_mgmtdev_ops = { + .dev_add = vdpa_dev_add, + .dev_del = vdpa_dev_del, +}; + +static struct virtio_device_id id_table[] = { + { VIRTIO_ID_BLOCK, VIRTIO_DEV_ANY_ID }, + { 0 }, +}; + +static struct vdpa_mgmt_dev mgmt_dev = { + .device = &vduse_mgmtdev, + .id_table = id_table, + .ops = &vdpa_dev_mgmtdev_ops, +}; + +static int vduse_mgmtdev_init(void) +{ + int ret; + + ret = device_register(&vduse_mgmtdev); + if (ret) + return ret; + + ret = vdpa_mgmtdev_register(&mgmt_dev); + if (ret) + goto err; + + return 0; +err: + device_unregister(&vduse_mgmtdev); + return ret; +} + +static void vduse_mgmtdev_exit(void) +{ + vdpa_mgmtdev_unregister(&mgmt_dev); + device_unregister(&vduse_mgmtdev); +} + +static int vduse_init(void) +{ + int ret; + struct device *dev; + + vduse_class = class_create(THIS_MODULE, "vduse"); + if (IS_ERR(vduse_class)) + return PTR_ERR(vduse_class); + + vduse_class->devnode = vduse_devnode; + vduse_class->dev_groups = vduse_dev_groups; + + ret = alloc_chrdev_region(&vduse_major, 0, VDUSE_DEV_MAX, "vduse"); + if (ret) + goto err_chardev_region; + + /* /dev/vduse/control */ + cdev_init(&vduse_ctrl_cdev, &vduse_ctrl_fops); + vduse_ctrl_cdev.owner = THIS_MODULE; + ret = cdev_add(&vduse_ctrl_cdev, vduse_major, 1); + if (ret) + goto err_ctrl_cdev; + + dev = device_create(vduse_class, NULL, vduse_major, NULL, "control"); + if (IS_ERR(dev)) { + ret = PTR_ERR(dev); + goto err_device; + } + + /* /dev/vduse/$DEVICE */ + cdev_init(&vduse_cdev, &vduse_dev_fops); + vduse_cdev.owner = THIS_MODULE; + ret = cdev_add(&vduse_cdev, MKDEV(MAJOR(vduse_major), 
1), + VDUSE_DEV_MAX - 1); + if (ret) + goto err_cdev; + + vduse_irq_wq = alloc_workqueue("vduse-irq", + WQ_HIGHPRI | WQ_SYSFS | WQ_UNBOUND, 0); + if (!vduse_irq_wq) + goto err_wq; + + ret = vduse_domain_init(); + if (ret) + goto err_domain; + + ret = vduse_mgmtdev_init(); + if (ret) + goto err_mgmtdev; + + return 0; +err_mgmtdev: + vduse_domain_exit(); +err_domain: + destroy_workqueue(vduse_irq_wq); +err_wq: + cdev_del(&vduse_cdev); +err_cdev: + device_destroy(vduse_class, vduse_major); +err_device: + cdev_del(&vduse_ctrl_cdev); +err_ctrl_cdev: + unregister_chrdev_region(vduse_major, VDUSE_DEV_MAX); +err_chardev_region: + class_destroy(vduse_class); + return ret; +} +module_init(vduse_init); + +static void vduse_exit(void) +{ + vduse_mgmtdev_exit(); + vduse_domain_exit(); + destroy_workqueue(vduse_irq_wq); + cdev_del(&vduse_cdev); + device_destroy(vduse_class, vduse_major); + cdev_del(&vduse_ctrl_cdev); + unregister_chrdev_region(vduse_major, VDUSE_DEV_MAX); + class_destroy(vduse_class); +} +module_exit(vduse_exit); + +MODULE_LICENSE(DRV_LICENSE); +MODULE_AUTHOR(DRV_AUTHOR); +MODULE_DESCRIPTION(DRV_DESC); diff --git a/drivers/vdpa/virtio_pci/vp_vdpa.c b/drivers/vdpa/virtio_pci/vp_vdpa.c index fe0527329857..5bcd00246d2e 100644 --- a/drivers/vdpa/virtio_pci/vp_vdpa.c +++ b/drivers/vdpa/virtio_pci/vp_vdpa.c @@ -189,10 +189,20 @@ static void vp_vdpa_set_status(struct vdpa_device *vdpa, u8 status) } vp_modern_set_status(mdev, status); +} - if (!(status & VIRTIO_CONFIG_S_DRIVER_OK) && - (s & VIRTIO_CONFIG_S_DRIVER_OK)) +static int vp_vdpa_reset(struct vdpa_device *vdpa) +{ + struct vp_vdpa *vp_vdpa = vdpa_to_vp(vdpa); + struct virtio_pci_modern_device *mdev = &vp_vdpa->mdev; + u8 s = vp_vdpa_get_status(vdpa); + + vp_modern_set_status(mdev, 0); + + if (s & VIRTIO_CONFIG_S_DRIVER_OK) vp_vdpa_free_irq(vp_vdpa); + + return 0; } static u16 vp_vdpa_get_vq_num_max(struct vdpa_device *vdpa) @@ -398,6 +408,7 @@ static const struct vdpa_config_ops vp_vdpa_ops = { .set_features = 
vp_vdpa_set_features, .get_status = vp_vdpa_get_status, .set_status = vp_vdpa_set_status, + .reset = vp_vdpa_reset, .get_vq_num_max = vp_vdpa_get_vq_num_max, .get_vq_state = vp_vdpa_get_vq_state, .get_vq_notification = vp_vdpa_get_vq_notification, @@ -435,7 +446,7 @@ static int vp_vdpa_probe(struct pci_dev *pdev, const struct pci_device_id *id) return ret; vp_vdpa = vdpa_alloc_device(struct vp_vdpa, vdpa, - dev, &vp_vdpa_ops, NULL); + dev, &vp_vdpa_ops, NULL, false); if (IS_ERR(vp_vdpa)) { dev_err(dev, "vp_vdpa: Failed to allocate vDPA structure\n"); return PTR_ERR(vp_vdpa); |