From 5a2a5b65d5d67279be9e1f0e4b9baf39ee594cb1 Mon Sep 17 00:00:00 2001 From: Mark Zhang Date: Mon, 16 Jun 2025 11:26:20 +0300 Subject: RDMA/core: Add driver APIs pre_destroy_cq() and post_destroy_cq() Currently in ib_free_cq, it disables IRQ or cancel the CQ work before driver destroy_cq. This isn't good as a new IRQ or a CQ work can be submitted immediately after disabling IRQ or canceling CQ work, which may run concurrently with destroy_cq and cause crashes. The right flow should be: 1. Driver disables CQ to make sure no new CQ event will be submitted; 2. Disables IRQ or Cancels CQ work in core layer, to make sure no CQ polling work is running; 3. Free all resources to destroy the CQ. This patch adds 2 driver APIs: - pre_destroy_cq(): Disable a CQ to prevent it from generating any new work completions, but not free any kernel resources; - post_destroy_cq(): Free all kernel resources. In ib_free_cq, the IRQ is disabled or CQ work is canceled after pre_destroy_cq, and before post_destroy_cq. Fixes: 14d3a3b2498e ("IB: add a proper completion queue abstraction") Signed-off-by: Mark Zhang Link: https://patch.msgid.link/b5f7ae3d75f44a3e15ff3f4eb2bbdea13e06b97f.1750062328.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- include/rdma/ib_verbs.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index af43a8d2a74a..38f68d245fa6 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2489,6 +2489,15 @@ struct ib_device_ops { int (*modify_cq)(struct ib_cq *cq, u16 cq_count, u16 cq_period); int (*destroy_cq)(struct ib_cq *cq, struct ib_udata *udata); int (*resize_cq)(struct ib_cq *cq, int cqe, struct ib_udata *udata); + /** + * pre_destroy_cq - Prevent a cq from generating any new work + * completions, but not free any kernel resources + */ + int (*pre_destroy_cq)(struct ib_cq *cq); + /** + * post_destroy_cq - Free all kernel resources + */ + void (*post_destroy_cq)(struct ib_cq *cq); struct ib_mr *(*get_dma_mr)(struct ib_pd *pd, int mr_access_flags); struct ib_mr *(*reg_user_mr)(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int mr_access_flags, -- cgit v1.2.3 From 16e2707cf15e09234445d40ddd76f11240be8767 Mon Sep 17 00:00:00 2001 From: "Yury Norov [NVIDIA]" Date: Wed, 4 Jun 2025 15:39:37 -0400 Subject: cpumask: add cpumask_clear_cpus() When user wants to clear a range in cpumask, the only option the API provides now is a for-loop, like: for_each_cpu_from(cpu, mask) { if (cpu >= ncpus) break; __cpumask_clear_cpu(cpu, mask); } In the bitmap API we have bitmap_clear() for that, which is significantly faster than a for-loop. Propagate it to cpumasks. Signed-off-by: Yury Norov [NVIDIA] Link: https://patch.msgid.link/20250604193947.11834-2-yury.norov@gmail.com Signed-off-by: Leon Romanovsky --- include/linux/cpumask.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 7ae80a7ca81e..ede95bbe8b80 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -609,6 +609,18 @@ void __cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp) __set_bit(cpumask_check(cpu), cpumask_bits(dstp)); } +/** + * cpumask_clear_cpus - clear cpus in a cpumask + * @dstp: the cpumask pointer + * @cpu: cpu number (< nr_cpu_ids) + * @ncpus: number of cpus to clear (< nr_cpu_ids) + */ +static __always_inline void cpumask_clear_cpus(struct cpumask *dstp, + unsigned int cpu, unsigned int ncpus) +{ + cpumask_check(cpu + ncpus - 1); + bitmap_clear(cpumask_bits(dstp), cpumask_check(cpu), ncpus); +} /** * cpumask_clear_cpu - clear a cpu in a cpumask -- cgit v1.2.3 From 8cffca866ba86cbf0d097e56521b17d830956d4a Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Tue, 17 Jun 2025 11:44:01 +0300 Subject: RDMA/core: Extend RDMA device registration to be net namespace aware Presently, RDMA devices are always registered within the init network namespace, even if the associated devlink device's namespace was changed via a devlink reload. This mismatch leads to discrepancies between the network namespace of the devlink device and that of the RDMA device. Therefore, extend the RDMA device allocation API to optionally take the net namespace. This isn't limited to devices that support devlink but allows all users to provide the network namespace if they need to do so. If a network namespace is provided during device allocation, it's up to the caller to make sure the namespace stays valid until ib_register_device() is called. Signed-off-by: Shay Drory Signed-off-by: Mark Bloch Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/device.c | 14 ++++++++++++-- drivers/infiniband/sw/rdmavt/vt.c | 2 +- include/rdma/ib_verbs.h | 11 +++++++++-- 3 files changed, 22 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 468ed6bd4722..c0f8b8cba7c0 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -557,6 +557,8 @@ static void rdma_init_coredev(struct ib_core_device *coredev, /** * _ib_alloc_device - allocate an IB device struct * @size:size of structure to allocate + * @net: network namespace device should be located in, namespace + * must stay valid until ib_register_device() is completed. * * Low-level drivers should use ib_alloc_device() to allocate &struct * ib_device. @size is the size of the structure to be allocated, @@ -564,7 +566,7 @@ static void rdma_init_coredev(struct ib_core_device *coredev, * ib_dealloc_device() must be used to free structures allocated with * ib_alloc_device(). */ -struct ib_device *_ib_alloc_device(size_t size) +struct ib_device *_ib_alloc_device(size_t size, struct net *net) { struct ib_device *device; unsigned int i; @@ -581,7 +583,15 @@ struct ib_device *_ib_alloc_device(size_t size) return NULL; } - rdma_init_coredev(&device->coredev, device, &init_net); + /* ib_devices_shared_netns can't change while we have active namespaces + * in the system which means either init_net is passed or the user has + * no idea what they are doing. + * + * To avoid breaking backward compatibility, when in shared mode, + * force to init the device in the init_net. + */ + net = ib_devices_shared_netns ? &init_net : net; + rdma_init_coredev(&device->coredev, device, net); INIT_LIST_HEAD(&device->event_handler_list); spin_lock_init(&device->qp_open_list_lock); diff --git a/drivers/infiniband/sw/rdmavt/vt.c b/drivers/infiniband/sw/rdmavt/vt.c index 5499025e8a0a..d22d610c2696 100644 --- a/drivers/infiniband/sw/rdmavt/vt.c +++ b/drivers/infiniband/sw/rdmavt/vt.c @@ -49,7 +49,7 @@ struct rvt_dev_info *rvt_alloc_device(size_t size, int nports) { struct rvt_dev_info *rdi; - rdi = container_of(_ib_alloc_device(size), struct rvt_dev_info, ibdev); + rdi = container_of(_ib_alloc_device(size, &init_net), struct rvt_dev_info, ibdev); if (!rdi) return rdi; diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 38f68d245fa6..b91a81234832 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2914,11 +2914,18 @@ struct ib_block_iter { unsigned int __pg_bit; /* alignment of current block */ }; -struct ib_device *_ib_alloc_device(size_t size); +struct ib_device *_ib_alloc_device(size_t size, struct net *net); #define ib_alloc_device(drv_struct, member) \ container_of(_ib_alloc_device(sizeof(struct drv_struct) + \ BUILD_BUG_ON_ZERO(offsetof( \ - struct drv_struct, member))), \ + struct drv_struct, member)), \ + &init_net), \ + struct drv_struct, member) + +#define ib_alloc_device_with_net(drv_struct, member, net) \ + container_of(_ib_alloc_device(sizeof(struct drv_struct) + \ + BUILD_BUG_ON_ZERO(offsetof( \ + struct drv_struct, member)), net), \ struct drv_struct, member) void ib_dealloc_device(struct ib_device *device); -- cgit v1.2.3 From 611d08207d313500d010d8792424346ce70d0cfb Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Tue, 17 Jun 2025 11:44:02 +0300 Subject: RDMA/mlx5: Allocate IB device with net namespace supplied from core dev Use the new ib_alloc_device_with_net() API to allocate the IB device so that it is properly bound to the network namespace obtained via mlx5_core_net(). This change ensures correct namespace association (e.g., for containerized setups). Additionally, expose mlx5_core_net so that RDMA driver can use it. Signed-off-by: Shay Drory Signed-off-by: Mark Bloch Reviewed-by: Parav Pandit Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mlx5/ib_rep.c | 3 ++- drivers/infiniband/hw/mlx5/main.c | 6 ++++-- drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h | 5 ----- include/linux/mlx5/driver.h | 5 +++++ 4 files changed, 11 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/mlx5/ib_rep.c b/drivers/infiniband/hw/mlx5/ib_rep.c index 49af1cfbe6d1..cc8859d3c2f5 100644 --- a/drivers/infiniband/hw/mlx5/ib_rep.c +++ b/drivers/infiniband/hw/mlx5/ib_rep.c @@ -88,7 +88,8 @@ mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep) else return mlx5_ib_set_vport_rep(lag_master, rep, vport_index); - ibdev = ib_alloc_device(mlx5_ib_dev, ib_dev); + ibdev = ib_alloc_device_with_net(mlx5_ib_dev, ib_dev, + mlx5_core_net(lag_master)); if (!ibdev) return -ENOMEM; diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index efea3ffd9715..c521bce2eeff 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -4793,7 +4793,8 @@ static struct ib_device *mlx5_ib_add_sub_dev(struct ib_device *parent, !MLX5_CAP_GEN_2(mparent->mdev, multiplane_qp_ud)) return ERR_PTR(-EOPNOTSUPP); - mplane = ib_alloc_device(mlx5_ib_dev, ib_dev); + mplane = ib_alloc_device_with_net(mlx5_ib_dev, ib_dev, + mlx5_core_net(mparent->mdev)); if (!mplane) return ERR_PTR(-ENOMEM); @@ -4907,7 +4908,8 @@ static int mlx5r_probe(struct auxiliary_device *adev, num_ports = max(MLX5_CAP_GEN(mdev, num_ports), MLX5_CAP_GEN(mdev, num_vhca_ports)); - dev = ib_alloc_device(mlx5_ib_dev, ib_dev); + dev = ib_alloc_device_with_net(mlx5_ib_dev, ib_dev, + mlx5_core_net(mdev)); if (!dev) return -ENOMEM; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h index 37d5f445598c..b111ccd03b02 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h @@ -45,11 +45,6 @@ int mlx5_crdump_enable(struct mlx5_core_dev *dev); void mlx5_crdump_disable(struct mlx5_core_dev *dev); int mlx5_crdump_collect(struct mlx5_core_dev *dev, u32 *cr_data); -static inline struct net *mlx5_core_net(struct mlx5_core_dev *dev) -{ - return devlink_net(priv_to_devlink(dev)); -} - static inline struct net_device *mlx5_uplink_netdev_get(struct mlx5_core_dev *mdev) { return mdev->mlx5e_res.uplink_netdev; diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index e6ba8f4f4bd1..3475d33c75f4 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1349,4 +1349,9 @@ enum { }; bool mlx5_wc_support_get(struct mlx5_core_dev *mdev); + +static inline struct net *mlx5_core_net(struct mlx5_core_dev *dev) +{ + return devlink_net(priv_to_devlink(dev)); +} #endif /* MLX5_DRIVER_H */ -- cgit v1.2.3 From f1208b05574f63c52e88109d8c75afdf4fc6bf42 Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Tue, 17 Jun 2025 11:44:03 +0300 Subject: RDMA/ipoib: Use parent rdma device net namespace Use the net namespace of the underlying rdma device. After honoring the rdma device's namespace, the ipoib netdev now also runs in the same net namespace of the rdma device. Add an API to read the net namespace of the rdma device so that ULP such as IPoIB can use it to initialize its netdev. Signed-off-by: Parav Pandit Signed-off-by: Mark Bloch Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/ipoib/ipoib_main.c | 2 ++ include/rdma/ib_verbs.h | 5 +++++ 2 files changed, 7 insertions(+) (limited to 'include') diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index f2f5465f2a90..7acafc5c0e09 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -2577,6 +2577,8 @@ static struct net_device *ipoib_add_port(const char *format, ndev->rtnl_link_ops = ipoib_get_link_ops(); + dev_net_set(ndev, rdma_dev_net(hca)); + result = register_netdev(ndev); if (result) { pr_warn("%s: couldn't register ipoib port %d; error %d\n", diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index b91a81234832..7da27f01eeb6 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -4871,6 +4871,11 @@ static inline int ibdev_to_node(struct ib_device *ibdev) bool rdma_dev_access_netns(const struct ib_device *device, const struct net *net); +static inline struct net *rdma_dev_net(struct ib_device *device) +{ + return read_pnet(&device->coredev.rdma_net); +} + #define IB_ROCE_UDP_ENCAP_VALID_PORT_MIN (0xC000) #define IB_ROCE_UDP_ENCAP_VALID_PORT_MAX (0xFFFF) #define IB_GRH_FLOWLABEL_MASK (0x000FFFFF) -- cgit v1.2.3 From f458ccd2aa2c5a6f0129a9b1548f2825071fdc6b Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Thu, 26 Jun 2025 21:58:04 +0300 Subject: RDMA/uverbs: Check CAP_NET_RAW in user namespace for flow create Currently, the capability check is done in the default init_user_ns user namespace. When a process runs in a non default user namespace, such check fails. Due to this when a process is running using Podman, it fails to create the flow resource. Since the RDMA device is a resource within a network namespace, use the network namespace associated with the RDMA device to determine its owning user namespace. Fixes: 436f2ad05a0b ("IB/core: Export ib_create/destroy_flow through uverbs") Signed-off-by: Parav Pandit Suggested-by: Eric W. Biederman Link: https://patch.msgid.link/6df6f2f24627874c4f6d041c19dc1f6f29f68f84.1750963874.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/device.c | 27 +++++++++++++++++++++++++++ drivers/infiniband/core/rdma_core.c | 29 +++++++++++++++++++++++++++++ drivers/infiniband/core/uverbs_cmd.c | 2 +- include/rdma/ib_verbs.h | 3 +++ 4 files changed, 60 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index c0f8b8cba7c0..1ca6a9b7ba1a 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -145,6 +145,33 @@ bool rdma_dev_access_netns(const struct ib_device *dev, const struct net *net) } EXPORT_SYMBOL(rdma_dev_access_netns); +/** + * rdma_dev_has_raw_cap() - Returns whether a specified rdma device has + * CAP_NET_RAW capability or not. + * + * @dev: Pointer to rdma device whose capability to be checked + * + * Returns true if a rdma device's owning user namespace has CAP_NET_RAW + * capability, otherwise false. When rdma subsystem is in legacy shared network, + * namespace mode, the default net namespace is considered. + */ +bool rdma_dev_has_raw_cap(const struct ib_device *dev) +{ + const struct net *net; + + /* Network namespace is the resource whose user namespace + * to be considered. When in shared mode, there is no reliable + * network namespace resource, so consider the default net namespace. + */ + if (ib_devices_shared_netns) + net = &init_net; + else + net = read_pnet(&dev->coredev.rdma_net); + + return ns_capable(net->user_ns, CAP_NET_RAW); +} +EXPORT_SYMBOL(rdma_dev_has_raw_cap); + /* * xarray has this behavior where it won't iterate over NULL values stored in * allocated arrays. So we need our own iterator to see all values stored in diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c index 90c177edf9b0..18918f463361 100644 --- a/drivers/infiniband/core/rdma_core.c +++ b/drivers/infiniband/core/rdma_core.c @@ -1019,3 +1019,32 @@ void uverbs_finalize_object(struct ib_uobject *uobj, WARN_ON(true); } } + +/** + * rdma_uattrs_has_raw_cap() - Returns whether a rdma device linked to the + * uverbs attributes file has CAP_NET_RAW + * capability or not. + * + * @attrs: Pointer to uverbs attributes + * + * Returns true if a rdma device's owning user namespace has CAP_NET_RAW + * capability, otherwise false. + */ +bool rdma_uattrs_has_raw_cap(const struct uverbs_attr_bundle *attrs) +{ + struct ib_uverbs_file *ufile = attrs->ufile; + struct ib_ucontext *ucontext; + bool has_cap = false; + int srcu_key; + + srcu_key = srcu_read_lock(&ufile->device->disassociate_srcu); + ucontext = ib_uverbs_get_ucontext_file(ufile); + if (IS_ERR(ucontext)) + goto out; + has_cap = rdma_dev_has_raw_cap(ucontext->device); + +out: + srcu_read_unlock(&ufile->device->disassociate_srcu, srcu_key); + return has_cap; +} +EXPORT_SYMBOL(rdma_uattrs_has_raw_cap); diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index bc9fe3ceca4d..6700c2c66167 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -3225,7 +3225,7 @@ static int ib_uverbs_ex_create_flow(struct uverbs_attr_bundle *attrs) if (cmd.comp_mask) return -EINVAL; - if (!capable(CAP_NET_RAW)) + if (!rdma_uattrs_has_raw_cap(attrs)) return -EPERM; if (cmd.flow_attr.flags >= IB_FLOW_ATTR_FLAGS_RESERVED) diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 7da27f01eeb6..010594dc755b 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -4817,6 +4817,8 @@ static inline int uverbs_destroy_def_handler(struct uverbs_attr_bundle *attrs) } #endif +bool rdma_uattrs_has_raw_cap(const struct uverbs_attr_bundle *attrs); + struct net_device *rdma_alloc_netdev(struct ib_device *device, u32 port_num, enum rdma_netdev_t type, const char *name, unsigned char name_assign_type, @@ -4871,6 +4873,7 @@ static inline int ibdev_to_node(struct ib_device *ibdev) bool rdma_dev_access_netns(const struct ib_device *device, const struct net *net); +bool rdma_dev_has_raw_cap(const struct ib_device *dev); static inline struct net *rdma_dev_net(struct ib_device *device) { return read_pnet(&device->coredev.rdma_net); -- cgit v1.2.3 From 98269398c02ab20eb9ed6d77416023a2627049d8 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 8 Jul 2025 12:31:39 +0300 Subject: RDMA/uverbs: Add empty rdma_uattrs_has_raw_cap() declaration The call to rdma_uattrs_has_raw_cap() is placed in mlx5 fs.c file, which is compiled without relation to CONFIG_INFINIBAND_USER_ACCESS. Despite the check is used only in flows with CONFIG_INFINIBAND_USER_ACCESS=y|m, the compilers generate the following error for CONFIG_INFINIBAND_USER_ACCESS=n builds. >> ERROR: modpost: "rdma_uattrs_has_raw_cap" [drivers/infiniband/hw/mlx5/mlx5_ib.ko] undefined! Fixes: f458ccd2aa2c ("RDMA/uverbs: Check CAP_NET_RAW in user namespace for flow create") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202507080725.bh7xrhpg-lkp@intel.com/ Link: https://patch.msgid.link/72dee6b379bd709255a5d8e8010b576d50e47170.1751967071.git.leon@kernel.org Reviewed-by: Zhu Yanjun Reviewed-by: Parav Pandit Signed-off-by: Leon Romanovsky --- include/rdma/ib_verbs.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 010594dc755b..1d123812a1f9 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -4810,15 +4810,19 @@ struct ib_ucontext *ib_uverbs_get_ucontext_file(struct ib_uverbs_file *ufile); #if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS) int uverbs_destroy_def_handler(struct uverbs_attr_bundle *attrs); +bool rdma_uattrs_has_raw_cap(const struct uverbs_attr_bundle *attrs); #else static inline int uverbs_destroy_def_handler(struct uverbs_attr_bundle *attrs) { return 0; } +static inline bool +rdma_uattrs_has_raw_cap(const struct uverbs_attr_bundle *attrs) +{ + return false; +} #endif -bool rdma_uattrs_has_raw_cap(const struct uverbs_attr_bundle *attrs); - struct net_device *rdma_alloc_netdev(struct ib_device *device, u32 port_num, enum rdma_netdev_t type, const char *name, unsigned char name_assign_type, -- cgit v1.2.3 From 1a40c362ae265ca4004f7373b34c22af6810f6cb Mon Sep 17 00:00:00 2001 From: Michael Margolin Date: Tue, 8 Jul 2025 20:23:06 +0000 Subject: RDMA/uverbs: Add a common way to create CQ with umem Add ioctl command attributes and a common handling for the option to create CQs with memory buffers passed from userspace. When required attributes are supplied, create umem and provide it for driver's use. The extension enables creation of CQs on top of preallocated CPU virtual or device memory buffers, by supplying VA or dmabuf fd, in a common way. Drivers can support this flow by initializing a new create_cq_umem fp field in their ops struct, with a function that can handle the new parameter. Signed-off-by: Michael Margolin Link: https://patch.msgid.link/20250708202308.24783-2-mrgolin@amazon.com Reviewed-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/device.c | 1 + drivers/infiniband/core/uverbs_std_types_cq.c | 87 +++++++++++++++++++++++++-- include/rdma/ib_verbs.h | 4 ++ include/uapi/rdma/ib_user_ioctl_cmds.h | 4 ++ 4 files changed, 90 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 1ca6a9b7ba1a..f301cdce1728 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -2728,6 +2728,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, create_ah); SET_DEVICE_OP(dev_ops, create_counters); SET_DEVICE_OP(dev_ops, create_cq); + SET_DEVICE_OP(dev_ops, create_cq_umem); SET_DEVICE_OP(dev_ops, create_flow); SET_DEVICE_OP(dev_ops, create_qp); SET_DEVICE_OP(dev_ops, create_rwq_ind_table); diff --git a/drivers/infiniband/core/uverbs_std_types_cq.c b/drivers/infiniband/core/uverbs_std_types_cq.c index 432054f0a8a4..37cd37556510 100644 --- a/drivers/infiniband/core/uverbs_std_types_cq.c +++ b/drivers/infiniband/core/uverbs_std_types_cq.c @@ -64,15 +64,21 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)( struct ib_ucq_object *obj = container_of( uverbs_attr_get_uobject(attrs, UVERBS_ATTR_CREATE_CQ_HANDLE), typeof(*obj), uevent.uobject); + struct ib_uverbs_completion_event_file *ev_file = NULL; struct ib_device *ib_dev = attrs->context->device; - int ret; - u64 user_handle; + struct ib_umem_dmabuf *umem_dmabuf; struct ib_cq_init_attr attr = {}; - struct ib_cq *cq; - struct ib_uverbs_completion_event_file *ev_file = NULL; struct ib_uobject *ev_file_uobj; + struct ib_umem *umem = NULL; + u64 buffer_length; + u64 buffer_offset; + struct ib_cq *cq; + u64 user_handle; + u64 buffer_va; + int buffer_fd; + int ret; - if (!ib_dev->ops.create_cq || !ib_dev->ops.destroy_cq) + if ((!ib_dev->ops.create_cq && !ib_dev->ops.create_cq_umem) || !ib_dev->ops.destroy_cq) return -EOPNOTSUPP; ret = uverbs_copy_from(&attr.comp_vector, attrs, @@ -112,9 +118,66 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)( INIT_LIST_HEAD(&obj->comp_list); INIT_LIST_HEAD(&obj->uevent.event_list); + if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_VA)) { + + ret = uverbs_copy_from(&buffer_va, attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_VA); + if (ret) + goto err_event_file; + + ret = uverbs_copy_from(&buffer_length, attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_LENGTH); + if (ret) + goto err_event_file; + + if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_FD) || + uverbs_attr_is_valid(attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_OFFSET) || + !ib_dev->ops.create_cq_umem) { + ret = -EINVAL; + goto err_event_file; + } + + umem = ib_umem_get(ib_dev, buffer_va, buffer_length, IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(umem)) { + ret = PTR_ERR(umem); + goto err_event_file; + } + } else if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_FD)) { + + ret = uverbs_get_raw_fd(&buffer_fd, attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_FD); + if (ret) + goto err_event_file; + + ret = uverbs_copy_from(&buffer_offset, attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_OFFSET); + if (ret) + goto err_event_file; + + ret = uverbs_copy_from(&buffer_length, attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_LENGTH); + if (ret) + goto err_event_file; + + if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_VA) || + !ib_dev->ops.create_cq_umem) { + ret = -EINVAL; + goto err_event_file; + } + + umem_dmabuf = ib_umem_dmabuf_get_pinned(ib_dev, buffer_offset, buffer_length, + buffer_fd, IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(umem_dmabuf)) { + ret = PTR_ERR(umem_dmabuf); + goto err_event_file; + } + umem = &umem_dmabuf->umem; + } else if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_OFFSET) || + uverbs_attr_is_valid(attrs, UVERBS_ATTR_CREATE_CQ_BUFFER_LENGTH) || + !ib_dev->ops.create_cq) { + ret = -EINVAL; + goto err_event_file; + } + cq = rdma_zalloc_drv_obj(ib_dev, ib_cq); if (!cq) { ret = -ENOMEM; + ib_umem_release(umem); goto err_event_file; } @@ -128,7 +191,8 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)( rdma_restrack_new(&cq->res, RDMA_RESTRACK_CQ); rdma_restrack_set_name(&cq->res, NULL); - ret = ib_dev->ops.create_cq(cq, &attr, attrs); + ret = umem ? ib_dev->ops.create_cq_umem(cq, &attr, umem, attrs) : + ib_dev->ops.create_cq(cq, &attr, attrs); if (ret) goto err_free; @@ -180,6 +244,17 @@ DECLARE_UVERBS_NAMED_METHOD( UVERBS_OBJECT_ASYNC_EVENT, UVERBS_ACCESS_READ, UA_OPTIONAL), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_CQ_BUFFER_VA, + UVERBS_ATTR_TYPE(u64), + UA_OPTIONAL), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_CQ_BUFFER_LENGTH, + UVERBS_ATTR_TYPE(u64), + UA_OPTIONAL), + UVERBS_ATTR_RAW_FD(UVERBS_ATTR_CREATE_CQ_BUFFER_FD, + UA_OPTIONAL), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_CQ_BUFFER_OFFSET, + UVERBS_ATTR_TYPE(u64), + UA_OPTIONAL), UVERBS_ATTR_UHW()); static int UVERBS_HANDLER(UVERBS_METHOD_CQ_DESTROY)( diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 1d123812a1f9..3fb1c963eeb0 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2486,6 +2486,10 @@ struct ib_device_ops { int (*destroy_qp)(struct ib_qp *qp, struct ib_udata *udata); int (*create_cq)(struct ib_cq *cq, const struct ib_cq_init_attr *attr, struct uverbs_attr_bundle *attrs); + int (*create_cq_umem)(struct ib_cq *cq, + const struct ib_cq_init_attr *attr, + struct ib_umem *umem, + struct uverbs_attr_bundle *attrs); int (*modify_cq)(struct ib_cq *cq, u16 cq_count, u16 cq_period); int (*destroy_cq)(struct ib_cq *cq, struct ib_udata *udata); int (*resize_cq)(struct ib_cq *cq, int cqe, struct ib_udata *udata); diff --git a/include/uapi/rdma/ib_user_ioctl_cmds.h b/include/uapi/rdma/ib_user_ioctl_cmds.h index ac7b162611ed..5f3e5bee51b2 100644 --- a/include/uapi/rdma/ib_user_ioctl_cmds.h +++ b/include/uapi/rdma/ib_user_ioctl_cmds.h @@ -105,6 +105,10 @@ enum uverbs_attrs_create_cq_cmd_attr_ids { UVERBS_ATTR_CREATE_CQ_FLAGS, UVERBS_ATTR_CREATE_CQ_RESP_CQE, UVERBS_ATTR_CREATE_CQ_EVENT_FD, + UVERBS_ATTR_CREATE_CQ_BUFFER_VA, + UVERBS_ATTR_CREATE_CQ_BUFFER_LENGTH, + UVERBS_ATTR_CREATE_CQ_BUFFER_FD, + UVERBS_ATTR_CREATE_CQ_BUFFER_OFFSET, }; enum uverbs_attrs_destroy_cq_cmd_attr_ids { -- cgit v1.2.3 From c897c2c8b8e82981df10df546c753ac857612937 Mon Sep 17 00:00:00 2001 From: Michael Margolin Date: Tue, 8 Jul 2025 20:23:07 +0000 Subject: RDMA/core: Add umem "is_contiguous" and "start_dma_addr" helpers In some cases drivers may need to check if a given umem is contiguous. Add a helper function in core code so that drivers don't need to deal with umem or scatter-gather lists structure. Additionally add a helper for getting umem's start DMA address and use it in other helper functions that open code it. Signed-off-by: Michael Margolin Link: https://patch.msgid.link/20250708202308.24783-3-mrgolin@amazon.com Reviewed-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky --- include/rdma/ib_umem.h | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h index 7dc7b1cc71b5..0a8e092c0ea8 100644 --- a/include/rdma/ib_umem.h +++ b/include/rdma/ib_umem.h @@ -52,11 +52,15 @@ static inline int ib_umem_offset(struct ib_umem *umem) return umem->address & ~PAGE_MASK; } +static inline dma_addr_t ib_umem_start_dma_addr(struct ib_umem *umem) +{ + return sg_dma_address(umem->sgt_append.sgt.sgl) + ib_umem_offset(umem); +} + static inline unsigned long ib_umem_dma_offset(struct ib_umem *umem, unsigned long pgsz) { - return (sg_dma_address(umem->sgt_append.sgt.sgl) + ib_umem_offset(umem)) & - (pgsz - 1); + return ib_umem_start_dma_addr(umem) & (pgsz - 1); } static inline size_t ib_umem_num_dma_blocks(struct ib_umem *umem, @@ -135,14 +139,27 @@ static inline unsigned long ib_umem_find_best_pgoff(struct ib_umem *umem, unsigned long pgsz_bitmap, u64 pgoff_bitmask) { - struct scatterlist *sg = umem->sgt_append.sgt.sgl; dma_addr_t dma_addr; - dma_addr = sg_dma_address(sg) + (umem->address & ~PAGE_MASK); + dma_addr = ib_umem_start_dma_addr(umem); return ib_umem_find_best_pgsz(umem, pgsz_bitmap, dma_addr & pgoff_bitmask); } +static inline bool ib_umem_is_contiguous(struct ib_umem *umem) +{ + dma_addr_t dma_addr; + unsigned long pgsz; + + /* + * Select the smallest aligned page that can contain the whole umem if + * it was contiguous. + */ + dma_addr = ib_umem_start_dma_addr(umem); + pgsz = roundup_pow_of_two((dma_addr ^ (umem->length - 1 + dma_addr)) + 1); + return !!ib_umem_find_best_pgoff(umem, pgsz, U64_MAX); +} + struct ib_umem_dmabuf *ib_umem_dmabuf_get(struct ib_device *device, unsigned long offset, size_t size, int fd, int access, -- cgit v1.2.3 From 9fb3dd85197f5e5901a81b104a0f8b513148d138 Mon Sep 17 00:00:00 2001 From: Michael Margolin Date: Tue, 8 Jul 2025 20:23:08 +0000 Subject: RDMA/efa: Add CQ with external memory support Add an option to create CQ using external memory instead of allocating in the driver. The memory can be passed from userspace by dmabuf fd and an offset or a VA. One of the possible usages is creating CQs that reside in accelerator memory, allowing low latency asynchronous direct polling from the accelerator device. Add a capability bit to reflect on the feature support. Reviewed-by: Daniel Kranzdorf Reviewed-by: Yonatan Nachum Signed-off-by: Michael Margolin Link: https://patch.msgid.link/20250708202308.24783-4-mrgolin@amazon.com Reviewed-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/efa/efa.h | 3 ++ drivers/infiniband/hw/efa/efa_main.c | 1 + drivers/infiniband/hw/efa/efa_verbs.c | 61 +++++++++++++++++++++++++++-------- include/uapi/rdma/efa-abi.h | 3 +- 4 files changed, 53 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/hw/efa/efa.h b/drivers/infiniband/hw/efa/efa.h index 838182d0409c..3d49c1db928e 100644 --- a/drivers/infiniband/hw/efa/efa.h +++ b/drivers/infiniband/hw/efa/efa.h @@ -107,6 +107,7 @@ struct efa_cq { u16 cq_idx; /* NULL when no interrupts requested */ struct efa_eq *eq; + struct ib_umem *umem; }; struct efa_qp { @@ -162,6 +163,8 @@ int efa_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *init_attr, int efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); int efa_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, struct uverbs_attr_bundle *attrs); +int efa_create_cq_umem(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_umem *umem, struct uverbs_attr_bundle *attrs); struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length, u64 virt_addr, int access_flags, struct ib_udata *udata); diff --git a/drivers/infiniband/hw/efa/efa_main.c b/drivers/infiniband/hw/efa/efa_main.c index 4f03c0ec819f..6c415b9adb5f 100644 --- a/drivers/infiniband/hw/efa/efa_main.c +++ b/drivers/infiniband/hw/efa/efa_main.c @@ -372,6 +372,7 @@ static const struct ib_device_ops efa_dev_ops = { .alloc_pd = efa_alloc_pd, .alloc_ucontext = efa_alloc_ucontext, .create_cq = efa_create_cq, + .create_cq_umem = efa_create_cq_umem, .create_qp = efa_create_qp, .create_user_ah = efa_create_ah, .dealloc_pd = efa_dealloc_pd, diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c index 7c708029b4b4..0f68aec12883 100644 --- a/drivers/infiniband/hw/efa/efa_verbs.c +++ b/drivers/infiniband/hw/efa/efa_verbs.c @@ -254,6 +254,7 @@ int efa_query_device(struct ib_device *ibdev, resp.max_rdma_size = dev_attr->max_rdma_size; resp.device_caps |= EFA_QUERY_DEVICE_CAPS_CQ_WITH_SGID; + resp.device_caps |= EFA_QUERY_DEVICE_CAPS_CQ_WITH_EXT_MEM; if (EFA_DEV_CAP(dev, RDMA_READ)) resp.device_caps |= EFA_QUERY_DEVICE_CAPS_RDMA_READ; @@ -1087,8 +1088,11 @@ int efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) xa_erase(&dev->cqs_xa, cq->cq_idx); synchronize_irq(cq->eq->irq.irqn); } - efa_free_mapped(dev, cq->cpu_addr, cq->dma_addr, cq->size, - DMA_FROM_DEVICE); + + if (cq->umem) + ib_umem_release(cq->umem); + else + efa_free_mapped(dev, cq->cpu_addr, cq->dma_addr, cq->size, DMA_FROM_DEVICE); return 0; } @@ -1127,8 +1131,8 @@ static int cq_mmap_entries_setup(struct efa_dev *dev, struct efa_cq *cq, return 0; } -int efa_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, - struct uverbs_attr_bundle *attrs) +int efa_create_cq_umem(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_umem *umem, struct uverbs_attr_bundle *attrs) { struct ib_udata *udata = &attrs->driver_udata; struct efa_ucontext *ucontext = rdma_udata_to_drv_context( @@ -1207,11 +1211,30 @@ int efa_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, cq->ucontext = ucontext; cq->size = PAGE_ALIGN(cmd.cq_entry_size * entries * cmd.num_sub_cqs); - cq->cpu_addr = efa_zalloc_mapped(dev, &cq->dma_addr, cq->size, - DMA_FROM_DEVICE); - if (!cq->cpu_addr) { - err = -ENOMEM; - goto err_out; + + if (umem) { + if (umem->length < cq->size) { + ibdev_dbg(&dev->ibdev, "External memory too small\n"); + err = -EINVAL; + goto err_free_mem; + } + + if (!ib_umem_is_contiguous(umem)) { + ibdev_dbg(&dev->ibdev, "Non contiguous CQ unsupported\n"); + err = -EINVAL; + goto err_free_mem; + } + + cq->cpu_addr = NULL; + cq->dma_addr = ib_umem_start_dma_addr(umem); + cq->umem = umem; + } else { + cq->cpu_addr = efa_zalloc_mapped(dev, &cq->dma_addr, cq->size, + DMA_FROM_DEVICE); + if (!cq->cpu_addr) { + err = -ENOMEM; + goto err_out; + } } params.uarn = cq->ucontext->uarn; @@ -1228,7 +1251,7 @@ int efa_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, err = efa_com_create_cq(&dev->edev, ¶ms, &result); if (err) - goto err_free_mapped; + goto err_free_mem; resp.db_off = result.db_off; resp.cq_idx = result.cq_idx; @@ -1236,7 +1259,9 @@ int efa_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, cq->ibcq.cqe = result.actual_depth; WARN_ON_ONCE(entries != result.actual_depth); - err = cq_mmap_entries_setup(dev, cq, &resp, result.db_valid); + if (!umem) + err = cq_mmap_entries_setup(dev, cq, &resp, result.db_valid); + if (err) { ibdev_dbg(ibdev, "Could not setup cq[%u] mmap entries\n", cq->cq_idx); @@ -1274,15 +1299,23 @@ err_remove_mmap: efa_cq_user_mmap_entries_remove(cq); err_destroy_cq: efa_destroy_cq_idx(dev, cq->cq_idx); -err_free_mapped: - efa_free_mapped(dev, cq->cpu_addr, cq->dma_addr, cq->size, - DMA_FROM_DEVICE); +err_free_mem: + if (umem) + ib_umem_release(umem); + else + efa_free_mapped(dev, cq->cpu_addr, cq->dma_addr, cq->size, DMA_FROM_DEVICE); err_out: atomic64_inc(&dev->stats.create_cq_err); return err; } +int efa_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct uverbs_attr_bundle *attrs) +{ + return efa_create_cq_umem(ibcq, attr, NULL, attrs); +} + static int umem_to_page_list(struct efa_dev *dev, struct ib_umem *umem, u64 *page_list, diff --git a/include/uapi/rdma/efa-abi.h b/include/uapi/rdma/efa-abi.h index 11b94b0b035b..98b71b9979f8 100644 --- a/include/uapi/rdma/efa-abi.h +++ b/include/uapi/rdma/efa-abi.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-2-Clause) */ /* - * Copyright 2018-2024 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2018-2025 Amazon.com, Inc. or its affiliates. All rights reserved. */ #ifndef EFA_ABI_USER_H @@ -131,6 +131,7 @@ enum { EFA_QUERY_DEVICE_CAPS_DATA_POLLING_128 = 1 << 4, EFA_QUERY_DEVICE_CAPS_RDMA_WRITE = 1 << 5, EFA_QUERY_DEVICE_CAPS_UNSOLICITED_WRITE_RECV = 1 << 6, + EFA_QUERY_DEVICE_CAPS_CQ_WITH_EXT_MEM = 1 << 7, }; struct efa_ibv_ex_query_device_resp { -- cgit v1.2.3 From 0a61ec9cc51b0e43981222005444508437e95b33 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 17 Jul 2025 15:17:25 +0300 Subject: PCI/TPH: Expose pcie_tph_get_st_table_size() Expose pcie_tph_get_st_table_size() to be used by drivers as will be done in the next patch from the series. Signed-off-by: Yishai Hadas Acked-by: Bjorn Helgaas Link: https://patch.msgid.link/9ae851e0ee42cc56d2a30276e116b65091030ceb.1752752567.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/pci/tph.c | 11 ++++++----- include/linux/pci-tph.h | 1 + 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/pci/tph.c b/drivers/pci/tph.c index 77fce5e1b830..cc64f93709a4 100644 --- a/drivers/pci/tph.c +++ b/drivers/pci/tph.c @@ -168,7 +168,7 @@ static u32 get_st_table_loc(struct pci_dev *pdev) * Return the size of ST table. If ST table is not in TPH Requester Extended * Capability space, return 0. Otherwise return the ST Table Size + 1. */ -static u16 get_st_table_size(struct pci_dev *pdev) +u16 pcie_tph_get_st_table_size(struct pci_dev *pdev) { u32 reg; u32 loc; @@ -185,6 +185,7 @@ static u16 get_st_table_size(struct pci_dev *pdev) return FIELD_GET(PCI_TPH_CAP_ST_MASK, reg) + 1; } +EXPORT_SYMBOL(pcie_tph_get_st_table_size); /* Return device's Root Port completer capability */ static u8 get_rp_completer_type(struct pci_dev *pdev) @@ -211,7 +212,7 @@ static int write_tag_to_st_table(struct pci_dev *pdev, int index, u16 tag) int offset; /* Check if index is out of bound */ - st_table_size = get_st_table_size(pdev); + st_table_size = pcie_tph_get_st_table_size(pdev); if (index >= st_table_size) return -ENXIO; @@ -443,7 +444,7 @@ void pci_restore_tph_state(struct pci_dev *pdev) pci_write_config_dword(pdev, pdev->tph_cap + PCI_TPH_CTRL, *cap++); st_entry = (u16 *)cap; offset = PCI_TPH_BASE_SIZEOF; - num_entries = get_st_table_size(pdev); + num_entries = pcie_tph_get_st_table_size(pdev); for (i = 0; i < num_entries; i++) { pci_write_config_word(pdev, pdev->tph_cap + offset, *st_entry++); @@ -475,7 +476,7 @@ void pci_save_tph_state(struct pci_dev *pdev) /* Save all ST entries in extended capability structure */ st_entry = (u16 *)cap; offset = PCI_TPH_BASE_SIZEOF; - num_entries = get_st_table_size(pdev); + num_entries = pcie_tph_get_st_table_size(pdev); for (i = 0; i < num_entries; i++) { pci_read_config_word(pdev, pdev->tph_cap + offset, st_entry++); @@ -499,7 +500,7 @@ void pci_tph_init(struct pci_dev *pdev) if (!pdev->tph_cap) return; - num_entries = get_st_table_size(pdev); + num_entries = pcie_tph_get_st_table_size(pdev); save_size = sizeof(u32) + num_entries * sizeof(u16); pci_add_ext_cap_save_buffer(pdev, PCI_EXT_CAP_ID_TPH, save_size); } diff --git a/include/linux/pci-tph.h b/include/linux/pci-tph.h index c3e806c13d64..9e4e331b1603 100644 --- a/include/linux/pci-tph.h +++ b/include/linux/pci-tph.h @@ -28,6 +28,7 @@ int pcie_tph_get_cpu_st(struct pci_dev *dev, unsigned int cpu_uid, u16 *tag); void pcie_disable_tph(struct pci_dev *pdev); int pcie_enable_tph(struct pci_dev *pdev, int mode); +u16 pcie_tph_get_st_table_size(struct pci_dev *pdev); #else static inline int pcie_tph_set_st_entry(struct pci_dev *pdev, unsigned int index, u16 tag) -- cgit v1.2.3 From 5f9ec7880e6b3c4d0cf242fe28506d0b084328b1 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 17 Jul 2025 15:17:26 +0300 Subject: net/mlx5: Expose IFC bits for TPH Expose IFC bits for the TPH functionality. Signed-off-by: Yishai Hadas Reviewed-by: Edward Srouji Reviewed-by: Moshe Shemesh Link: https://patch.msgid.link/38ea3a0d56551364214e8edf359c9c77c9a3b71b.1752752567.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- include/linux/mlx5/mlx5_ifc.h | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index ed4130e49c27..8360d9011d4f 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -1871,7 +1871,9 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_at_280[0x10]; u8 max_wqe_sz_sq[0x10]; - u8 reserved_at_2a0[0xb]; + u8 reserved_at_2a0[0x7]; + u8 mkey_pcie_tph[0x1]; + u8 reserved_at_2a8[0x3]; u8 shampo[0x1]; u8 reserved_at_2ac[0x4]; u8 max_wqe_sz_rq[0x10]; @@ -4418,6 +4420,10 @@ enum { MLX5_MKC_ACCESS_MODE_CROSSING = 0x6, }; +enum { + MLX5_MKC_PCIE_TPH_NO_STEERING_TAG_INDEX = 0, +}; + struct mlx5_ifc_mkc_bits { u8 reserved_at_0[0x1]; u8 free[0x1]; @@ -4469,7 +4475,11 @@ struct mlx5_ifc_mkc_bits { u8 relaxed_ordering_read[0x1]; u8 log_page_size[0x6]; - u8 reserved_at_1e0[0x20]; + u8 reserved_at_1e0[0x5]; + u8 pcie_tph_en[0x1]; + u8 pcie_tph_ph[0x2]; + u8 pcie_tph_steering_tag_index[0x8]; + u8 reserved_at_1f0[0x10]; }; struct mlx5_ifc_pkey_bits { -- cgit v1.2.3 From 888a7776f4fb04c19bec70c737c61c2f383c6b1e Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 17 Jul 2025 15:17:27 +0300 Subject: net/mlx5: Add support for device steering tag Background, from PCIe specification 6.2. TLP Processing Hints (TPH) -------------------------- TLP Processing Hints is an optional feature that provides hints in Request TLP headers to facilitate optimized processing of Requests that target Memory Space. These Processing Hints enable the system hardware (e.g., the Root Complex and/or Endpoints) to optimize platform resources such as system and memory interconnect on a per TLP basis. Steering Tags are system-specific values used to identify a processing resource that a Requester explicitly targets. System software discovers and identifies TPH capabilities to determine the Steering Tag allocation for each Function that supports TPH. This patch adds steering tag support for mlx5 based NICs by: - Enabling the TPH functionality over PCI if both FW and OS support it. - Managing steering tags and their matching steering indexes by writing a ST to an ST index over the PCI configuration space. - Exposing APIs to upper layers (e.g.,mlx5_ib) to allow usage of the PCI TPH infrastructure. Further details: - Upon probing of a device, the feature will be enabled based on both capability detection and OS support. - It will retrieve the appropriate ST for a given CPU ID and memory type using the pcie_tph_get_cpu_st() API. - It will track available ST indices according to the configuration space table size (expected to be 63 entries), reserving index 0 to indicate non-TPH use. - It will assign a free ST index with a ST using the pcie_tph_set_st_entry() API. - It will reuse the same index for identical (CPU ID + memory type) combinations by maintaining a reference count per entry. - It will expose APIs to upper layers (e.g., mlx5_ib) to allow usage of the PCI TPH infrastructure. - SF will use its parent PF stuff. Signed-off-by: Yishai Hadas Link: https://patch.msgid.link/de1ae7398e9e34eacd8c10845683df44fc9e32f8.1752752567.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/mellanox/mlx5/core/Makefile | 5 + drivers/net/ethernet/mellanox/mlx5/core/lib/st.c | 164 +++++++++++++++++++++ drivers/net/ethernet/mellanox/mlx5/core/main.c | 2 + .../net/ethernet/mellanox/mlx5/core/mlx5_core.h | 9 ++ include/linux/mlx5/driver.h | 20 +++ 5 files changed, 200 insertions(+) create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/lib/st.c (limited to 'include') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile index d292e6a9e22c..bd9d46c6719f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile +++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile @@ -167,5 +167,10 @@ mlx5_core-$(CONFIG_MLX5_SF) += sf/vhca_event.o sf/dev/dev.o sf/dev/driver.o irq_ # mlx5_core-$(CONFIG_MLX5_SF_MANAGER) += sf/cmd.o sf/hw_table.o sf/devlink.o +# +# TPH support +# +mlx5_core-$(CONFIG_PCIE_TPH) += lib/st.o + obj-$(CONFIG_MLX5_DPLL) += mlx5_dpll.o mlx5_dpll-y := dpll.o diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/st.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/st.c new file mode 100644 index 000000000000..47fe215f66bf --- /dev/null +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/st.c @@ -0,0 +1,164 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved + */ + +#include +#include + +#include "mlx5_core.h" +#include "lib/mlx5.h" + +struct mlx5_st_idx_data { + refcount_t usecount; + u16 tag; +}; + +struct mlx5_st { + /* serialize access upon alloc/free flows */ + struct mutex lock; + struct xa_limit index_limit; + struct xarray idx_xa; /* key == index, value == struct mlx5_st_idx_data */ +}; + +struct mlx5_st *mlx5_st_create(struct mlx5_core_dev *dev) +{ + struct pci_dev *pdev = dev->pdev; + struct mlx5_st *st; + u16 num_entries; + int ret; + + if (!MLX5_CAP_GEN(dev, mkey_pcie_tph)) + return NULL; + +#ifdef CONFIG_MLX5_SF + if (mlx5_core_is_sf(dev)) + return dev->priv.parent_mdev->st; +#endif + + /* Checking whether the device is capable */ + if (!pdev->tph_cap) + return NULL; + + num_entries = pcie_tph_get_st_table_size(pdev); + /* We need a reserved entry for non TPH cases */ + if (num_entries < 2) + return NULL; + + /* The OS doesn't support ST */ + ret = pcie_enable_tph(pdev, PCI_TPH_ST_DS_MODE); + if (ret) + return NULL; + + st = kzalloc(sizeof(*st), GFP_KERNEL); + if (!st) + goto end; + + mutex_init(&st->lock); + xa_init_flags(&st->idx_xa, XA_FLAGS_ALLOC); + /* entry 0 is reserved for non TPH cases */ + st->index_limit.min = MLX5_MKC_PCIE_TPH_NO_STEERING_TAG_INDEX + 1; + st->index_limit.max = num_entries - 1; + + return st; + +end: + pcie_disable_tph(dev->pdev); + return NULL; +} + +void mlx5_st_destroy(struct mlx5_core_dev *dev) +{ + struct mlx5_st *st = dev->st; + + if (mlx5_core_is_sf(dev) || !st) + return; + + pcie_disable_tph(dev->pdev); + WARN_ON_ONCE(!xa_empty(&st->idx_xa)); + kfree(st); +} + +int mlx5_st_alloc_index(struct mlx5_core_dev *dev, enum tph_mem_type mem_type, + unsigned int cpu_uid, u16 *st_index) +{ + struct mlx5_st_idx_data *idx_data; + struct mlx5_st *st = dev->st; + unsigned long index; + u32 xa_id; + u16 tag; + int ret; + + if (!st) + return -EOPNOTSUPP; + + ret = pcie_tph_get_cpu_st(dev->pdev, mem_type, cpu_uid, &tag); + if (ret) + return ret; + + mutex_lock(&st->lock); + + xa_for_each(&st->idx_xa, index, idx_data) { + if (tag == idx_data->tag) { + refcount_inc(&idx_data->usecount); + *st_index = index; + goto end; + } + } + + idx_data = kzalloc(sizeof(*idx_data), GFP_KERNEL); + if (!idx_data) { + ret = -ENOMEM; + goto end; + } + + refcount_set(&idx_data->usecount, 1); + idx_data->tag = tag; + + ret = xa_alloc(&st->idx_xa, &xa_id, idx_data, st->index_limit, GFP_KERNEL); + if (ret) + goto clean_idx_data; + + ret = pcie_tph_set_st_entry(dev->pdev, xa_id, tag); + if (ret) + goto clean_idx_xa; + + *st_index = xa_id; + goto end; + +clean_idx_xa: + xa_erase(&st->idx_xa, xa_id); +clean_idx_data: + kfree(idx_data); +end: + mutex_unlock(&st->lock); + return ret; +} +EXPORT_SYMBOL_GPL(mlx5_st_alloc_index); + +int mlx5_st_dealloc_index(struct mlx5_core_dev *dev, u16 st_index) +{ + struct mlx5_st_idx_data *idx_data; + struct mlx5_st *st = dev->st; + int ret = 0; + + if (!st) + return -EOPNOTSUPP; + + mutex_lock(&st->lock); + idx_data = xa_load(&st->idx_xa, st_index); + if (WARN_ON_ONCE(!idx_data)) { + ret = -EINVAL; + goto end; + } + + if (refcount_dec_and_test(&idx_data->usecount)) { + xa_erase(&st->idx_xa, st_index); + /* We leave PCI config space as was before, no mkey will refer to it */ + } + +end: + mutex_unlock(&st->lock); + return ret; +} +EXPORT_SYMBOL_GPL(mlx5_st_dealloc_index); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index b0043cfee29b..be3be043134f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1102,6 +1102,7 @@ static int mlx5_init_once(struct mlx5_core_dev *dev) } dev->dm = mlx5_dm_create(dev); + dev->st = mlx5_st_create(dev); dev->tracer = mlx5_fw_tracer_create(dev); dev->hv_vhca = mlx5_hv_vhca_create(dev); dev->rsc_dump = mlx5_rsc_dump_create(dev); @@ -1150,6 +1151,7 @@ static void mlx5_cleanup_once(struct mlx5_core_dev *dev) mlx5_rsc_dump_destroy(dev); mlx5_hv_vhca_destroy(dev->hv_vhca); mlx5_fw_tracer_destroy(dev->tracer); + mlx5_st_destroy(dev); mlx5_dm_cleanup(dev); mlx5_fs_core_free(dev); mlx5_sf_table_cleanup(dev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h index 2e02bdea8361..1cada2f87acf 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h @@ -300,6 +300,15 @@ int mlx5_set_mtppse(struct mlx5_core_dev *mdev, u8 pin, u8 arm, u8 mode); struct mlx5_dm *mlx5_dm_create(struct mlx5_core_dev *dev); void mlx5_dm_cleanup(struct mlx5_core_dev *dev); +#ifdef CONFIG_PCIE_TPH +struct mlx5_st *mlx5_st_create(struct mlx5_core_dev *dev); +void mlx5_st_destroy(struct mlx5_core_dev *dev); +#else +static inline struct mlx5_st * +mlx5_st_create(struct mlx5_core_dev *dev) { return NULL; } +static inline void mlx5_st_destroy(struct mlx5_core_dev *dev) { return; } +#endif + void mlx5_toggle_port_link(struct mlx5_core_dev *dev); int mlx5_set_port_admin_status(struct mlx5_core_dev *dev, enum mlx5_port_status status); diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index e6ba8f4f4bd1..104d4921c032 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -688,6 +689,7 @@ struct mlx5_fw_tracer; struct mlx5_vxlan; struct mlx5_geneve; struct mlx5_hv_vhca; +struct mlx5_st; #define MLX5_LOG_SW_ICM_BLOCK_SIZE(dev) (MLX5_CAP_DEV_MEM(dev, log_sw_icm_alloc_granularity)) #define MLX5_SW_ICM_BLOCK_SIZE(dev) (1 << MLX5_LOG_SW_ICM_BLOCK_SIZE(dev)) @@ -757,6 +759,7 @@ struct mlx5_core_dev { u32 issi; struct mlx5e_resources mlx5e_res; struct mlx5_dm *dm; + struct mlx5_st *st; struct mlx5_vxlan *vxlan; struct mlx5_geneve *geneve; struct { @@ -1160,6 +1163,23 @@ int mlx5_dm_sw_icm_alloc(struct mlx5_core_dev *dev, enum mlx5_sw_icm_type type, int mlx5_dm_sw_icm_dealloc(struct mlx5_core_dev *dev, enum mlx5_sw_icm_type type, u64 length, u16 uid, phys_addr_t addr, u32 obj_id); +#ifdef CONFIG_PCIE_TPH +int mlx5_st_alloc_index(struct mlx5_core_dev *dev, enum tph_mem_type mem_type, + unsigned int cpu_uid, u16 *st_index); +int mlx5_st_dealloc_index(struct mlx5_core_dev *dev, u16 st_index); +#else +static inline int mlx5_st_alloc_index(struct mlx5_core_dev *dev, + enum tph_mem_type mem_type, + unsigned int cpu_uid, u16 *st_index) +{ + return -EOPNOTSUPP; +} +static inline int mlx5_st_dealloc_index(struct mlx5_core_dev *dev, u16 st_index) +{ + return -EOPNOTSUPP; +} +#endif + struct mlx5_core_dev *mlx5_vf_get_core_dev(struct pci_dev *pdev); void mlx5_vf_put_core_dev(struct mlx5_core_dev *mdev); -- cgit v1.2.3 From 5b2e45049dc06a876bc6b138218ddeb0814502ef Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 17 Jul 2025 15:17:28 +0300 Subject: IB/core: Add UVERBS_METHOD_REG_MR on the MR object This new method enables us to use a single ioctl from user space which supports the below variants of reg_mr [1]. The method will be extended in the next patches from the series with an extra attribute to let us pass DMA handle to be used as part of the registration. [1] ibv_reg_mr(), ibv_reg_mr_iova(), ibv_reg_mr_iova2(), ibv_reg_dmabuf_mr(). Signed-off-by: Yishai Hadas Reviewed-by: Edward Srouji Link: https://patch.msgid.link/5a3822ceef084efe967c9752e89c58d8250337c7.1752752567.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/uverbs_std_types_mr.c | 153 +++++++++++++++++++++++++- include/uapi/rdma/ib_user_ioctl_cmds.h | 14 +++ 2 files changed, 166 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/infiniband/core/uverbs_std_types_mr.c b/drivers/infiniband/core/uverbs_std_types_mr.c index 7ebc7bd3caae..1bd4b17b5515 100644 --- a/drivers/infiniband/core/uverbs_std_types_mr.c +++ b/drivers/infiniband/core/uverbs_std_types_mr.c @@ -266,6 +266,122 @@ static int UVERBS_HANDLER(UVERBS_METHOD_REG_DMABUF_MR)( return ret; } +static int UVERBS_HANDLER(UVERBS_METHOD_REG_MR)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_uobject *uobj = + uverbs_attr_get_uobject(attrs, UVERBS_ATTR_REG_MR_HANDLE); + struct ib_pd *pd = + uverbs_attr_get_obj(attrs, UVERBS_ATTR_REG_MR_PD_HANDLE); + u32 valid_access_flags = IB_ACCESS_SUPPORTED; + u64 length, iova, fd_offset = 0, addr = 0; + struct ib_device *ib_dev = pd->device; + bool has_fd_offset = false; + bool has_addr = false; + bool has_fd = false; + u32 access_flags; + struct ib_mr *mr; + int fd; + int ret; + + ret = uverbs_copy_from(&iova, attrs, UVERBS_ATTR_REG_MR_IOVA); + if (ret) + return ret; + + ret = uverbs_copy_from(&length, attrs, UVERBS_ATTR_REG_MR_LENGTH); + if (ret) + return ret; + + if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_REG_MR_ADDR)) { + ret = uverbs_copy_from(&addr, attrs, + UVERBS_ATTR_REG_MR_ADDR); + if (ret) + return ret; + has_addr = true; + } + + if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_REG_MR_FD_OFFSET)) { + ret = uverbs_copy_from(&fd_offset, attrs, + UVERBS_ATTR_REG_MR_FD_OFFSET); + if (ret) + return ret; + has_fd_offset = true; + } + + if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_REG_MR_FD)) { + ret = uverbs_get_raw_fd(&fd, attrs, + UVERBS_ATTR_REG_MR_FD); + if (ret) + return ret; + has_fd = true; + } + + if (has_fd) { + if (!ib_dev->ops.reg_user_mr_dmabuf) + return -EOPNOTSUPP; + + /* FD requires offset and can't come with addr */ + if (!has_fd_offset || has_addr) + return -EINVAL; + + if ((fd_offset & ~PAGE_MASK) != (iova & ~PAGE_MASK)) + return -EINVAL; + + valid_access_flags = IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_READ | + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_ATOMIC | + IB_ACCESS_RELAXED_ORDERING; + } else { + if (!has_addr || has_fd_offset) + return -EINVAL; + + if ((addr & ~PAGE_MASK) != (iova & ~PAGE_MASK)) + return -EINVAL; + } + + ret = uverbs_get_flags32(&access_flags, attrs, + UVERBS_ATTR_REG_MR_ACCESS_FLAGS, + valid_access_flags); + if (ret) + return ret; + + ret = ib_check_mr_access(ib_dev, access_flags); + if (ret) + return ret; + + if (has_fd) + mr = pd->device->ops.reg_user_mr_dmabuf(pd, fd_offset, length, iova, + fd, access_flags, attrs); + else + mr = pd->device->ops.reg_user_mr(pd, addr, length, + iova, access_flags, NULL); + + if (IS_ERR(mr)) + return PTR_ERR(mr); + + mr->device = pd->device; + mr->pd = pd; + mr->type = IB_MR_TYPE_USER; + mr->uobject = uobj; + atomic_inc(&pd->usecnt); + rdma_restrack_new(&mr->res, RDMA_RESTRACK_MR); + rdma_restrack_set_name(&mr->res, NULL); + rdma_restrack_add(&mr->res); + uobj->object = mr; + + uverbs_finalize_uobj_create(attrs, UVERBS_ATTR_REG_MR_HANDLE); + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_REG_MR_RESP_LKEY, + &mr->lkey, sizeof(mr->lkey)); + if (ret) + return ret; + + ret = uverbs_copy_to(attrs, UVERBS_ATTR_REG_MR_RESP_RKEY, + &mr->rkey, sizeof(mr->rkey)); + return ret; +} + DECLARE_UVERBS_NAMED_METHOD( UVERBS_METHOD_ADVISE_MR, UVERBS_ATTR_IDR(UVERBS_ATTR_ADVISE_MR_PD_HANDLE, @@ -362,6 +478,40 @@ DECLARE_UVERBS_NAMED_METHOD( UVERBS_ATTR_TYPE(u32), UA_MANDATORY)); +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_REG_MR, + UVERBS_ATTR_IDR(UVERBS_ATTR_REG_MR_HANDLE, + UVERBS_OBJECT_MR, + UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_IDR(UVERBS_ATTR_REG_MR_PD_HANDLE, + UVERBS_OBJECT_PD, + UVERBS_ACCESS_READ, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_REG_MR_IOVA, + UVERBS_ATTR_TYPE(u64), + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_REG_MR_LENGTH, + UVERBS_ATTR_TYPE(u64), + UA_MANDATORY), + UVERBS_ATTR_FLAGS_IN(UVERBS_ATTR_REG_MR_ACCESS_FLAGS, + enum ib_access_flags, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_REG_MR_ADDR, + UVERBS_ATTR_TYPE(u64), + UA_OPTIONAL), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_REG_MR_FD_OFFSET, + UVERBS_ATTR_TYPE(u64), + UA_OPTIONAL), + UVERBS_ATTR_RAW_FD(UVERBS_ATTR_REG_MR_FD, + UA_OPTIONAL), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_REG_MR_RESP_LKEY, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY), + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_REG_MR_RESP_RKEY, + UVERBS_ATTR_TYPE(u32), + UA_MANDATORY)); + DECLARE_UVERBS_NAMED_METHOD_DESTROY( UVERBS_METHOD_MR_DESTROY, UVERBS_ATTR_IDR(UVERBS_ATTR_DESTROY_MR_HANDLE, @@ -376,7 +526,8 @@ DECLARE_UVERBS_NAMED_OBJECT( &UVERBS_METHOD(UVERBS_METHOD_DM_MR_REG), &UVERBS_METHOD(UVERBS_METHOD_MR_DESTROY), &UVERBS_METHOD(UVERBS_METHOD_QUERY_MR), - &UVERBS_METHOD(UVERBS_METHOD_REG_DMABUF_MR)); + &UVERBS_METHOD(UVERBS_METHOD_REG_DMABUF_MR), + &UVERBS_METHOD(UVERBS_METHOD_REG_MR)); const struct uapi_definition uverbs_def_obj_mr[] = { UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_MR, diff --git a/include/uapi/rdma/ib_user_ioctl_cmds.h b/include/uapi/rdma/ib_user_ioctl_cmds.h index 5f3e5bee51b2..ece923ab48a0 100644 --- a/include/uapi/rdma/ib_user_ioctl_cmds.h +++ b/include/uapi/rdma/ib_user_ioctl_cmds.h @@ -257,6 +257,7 @@ enum uverbs_methods_mr { UVERBS_METHOD_ADVISE_MR, UVERBS_METHOD_QUERY_MR, UVERBS_METHOD_REG_DMABUF_MR, + UVERBS_METHOD_REG_MR, }; enum uverbs_attrs_mr_destroy_ids { @@ -290,6 +291,19 @@ enum uverbs_attrs_reg_dmabuf_mr_cmd_attr_ids { UVERBS_ATTR_REG_DMABUF_MR_RESP_RKEY, }; +enum uverbs_attrs_reg_mr_cmd_attr_ids { + UVERBS_ATTR_REG_MR_HANDLE, + UVERBS_ATTR_REG_MR_PD_HANDLE, + UVERBS_ATTR_REG_MR_IOVA, + UVERBS_ATTR_REG_MR_ADDR, + UVERBS_ATTR_REG_MR_LENGTH, + UVERBS_ATTR_REG_MR_ACCESS_FLAGS, + UVERBS_ATTR_REG_MR_FD, + UVERBS_ATTR_REG_MR_FD_OFFSET, + UVERBS_ATTR_REG_MR_RESP_LKEY, + UVERBS_ATTR_REG_MR_RESP_RKEY, +}; + enum uverbs_attrs_create_counters_cmd_attr_ids { UVERBS_ATTR_CREATE_COUNTERS_HANDLE, }; -- cgit v1.2.3 From d83edab562a496a42720902a1d2effccd05c37c5 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 17 Jul 2025 15:17:29 +0300 Subject: RDMA/core: Introduce a DMAH object and its alloc/free APIs Introduce a new DMA handle (DMAH) object along with its corresponding allocation and deallocation APIs. This DMAH object encapsulates attributes intended for use in DMA transactions. While its initial purpose is to support TPH functionality, it is designed to be extensible for future features such as DMA PCI multipath, PCI UIO configurations, PCI traffic class selection, and more. Further details: ---------------- We ensure that a caller requesting a DMA handle for a specific CPU ID is permitted to be scheduled on it. This prevent a potential security issue where a non privilege user may trigger DMA operations toward a CPU that it's not allowed to run on. We manage reference counting for the DMAH object and its consumers (e.g., memory regions) as will be detailed in subsequent patches in the series. Signed-off-by: Yishai Hadas Reviewed-by: Edward Srouji Link: https://patch.msgid.link/2cad097e849597e49d6b61e6865dba878257f371.1752752567.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/Makefile | 1 + drivers/infiniband/core/device.c | 3 + drivers/infiniband/core/rdma_core.h | 1 + drivers/infiniband/core/restrack.c | 2 + drivers/infiniband/core/uverbs_std_types_dmah.c | 145 ++++++++++++++++++++++++ drivers/infiniband/core/uverbs_uapi.c | 1 + include/rdma/ib_verbs.h | 26 +++++ include/rdma/restrack.h | 4 + include/uapi/rdma/ib_user_ioctl_cmds.h | 17 +++ 9 files changed, 200 insertions(+) create mode 100644 drivers/infiniband/core/uverbs_std_types_dmah.c (limited to 'include') diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index d49ded7e95f0..f483e0c12444 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -33,6 +33,7 @@ ib_umad-y := user_mad.o ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_marshall.o \ rdma_core.o uverbs_std_types.o uverbs_ioctl.o \ uverbs_std_types_cq.o \ + uverbs_std_types_dmah.o \ uverbs_std_types_flow_action.o uverbs_std_types_dm.o \ uverbs_std_types_mr.o uverbs_std_types_counters.o \ uverbs_uapi.o uverbs_std_types_device.o \ diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index f301cdce1728..3145cb34a1d2 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -2708,6 +2708,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, add_sub_dev); SET_DEVICE_OP(dev_ops, advise_mr); SET_DEVICE_OP(dev_ops, alloc_dm); + SET_DEVICE_OP(dev_ops, alloc_dmah); SET_DEVICE_OP(dev_ops, alloc_hw_device_stats); SET_DEVICE_OP(dev_ops, alloc_hw_port_stats); SET_DEVICE_OP(dev_ops, alloc_mr); @@ -2736,6 +2737,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_DEVICE_OP(dev_ops, create_user_ah); SET_DEVICE_OP(dev_ops, create_wq); SET_DEVICE_OP(dev_ops, dealloc_dm); + SET_DEVICE_OP(dev_ops, dealloc_dmah); SET_DEVICE_OP(dev_ops, dealloc_driver); SET_DEVICE_OP(dev_ops, dealloc_mw); SET_DEVICE_OP(dev_ops, dealloc_pd); @@ -2833,6 +2835,7 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) SET_OBJ_SIZE(dev_ops, ib_ah); SET_OBJ_SIZE(dev_ops, ib_counters); SET_OBJ_SIZE(dev_ops, ib_cq); + SET_OBJ_SIZE(dev_ops, ib_dmah); SET_OBJ_SIZE(dev_ops, ib_mw); SET_OBJ_SIZE(dev_ops, ib_pd); SET_OBJ_SIZE(dev_ops, ib_qp); diff --git a/drivers/infiniband/core/rdma_core.h b/drivers/infiniband/core/rdma_core.h index 33706dad6c0f..a59b087611cb 100644 --- a/drivers/infiniband/core/rdma_core.h +++ b/drivers/infiniband/core/rdma_core.h @@ -156,6 +156,7 @@ extern const struct uapi_definition uverbs_def_obj_counters[]; extern const struct uapi_definition uverbs_def_obj_cq[]; extern const struct uapi_definition uverbs_def_obj_device[]; extern const struct uapi_definition uverbs_def_obj_dm[]; +extern const struct uapi_definition uverbs_def_obj_dmah[]; extern const struct uapi_definition uverbs_def_obj_flow_action[]; extern const struct uapi_definition uverbs_def_obj_intf[]; extern const struct uapi_definition uverbs_def_obj_mr[]; diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c index 3313410014cd..a7de6f403fca 100644 --- a/drivers/infiniband/core/restrack.c +++ b/drivers/infiniband/core/restrack.c @@ -100,6 +100,8 @@ static struct ib_device *res_to_dev(struct rdma_restrack_entry *res) return container_of(res, struct rdma_counter, res)->device; case RDMA_RESTRACK_SRQ: return container_of(res, struct ib_srq, res)->device; + case RDMA_RESTRACK_DMAH: + return container_of(res, struct ib_dmah, res)->device; default: WARN_ONCE(true, "Wrong resource tracking type %u\n", res->type); return NULL; diff --git a/drivers/infiniband/core/uverbs_std_types_dmah.c b/drivers/infiniband/core/uverbs_std_types_dmah.c new file mode 100644 index 000000000000..453ce656c6f2 --- /dev/null +++ b/drivers/infiniband/core/uverbs_std_types_dmah.c @@ -0,0 +1,145 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved + */ + +#include "rdma_core.h" +#include "uverbs.h" +#include +#include "restrack.h" + +static int uverbs_free_dmah(struct ib_uobject *uobject, + enum rdma_remove_reason why, + struct uverbs_attr_bundle *attrs) +{ + struct ib_dmah *dmah = uobject->object; + int ret; + + if (atomic_read(&dmah->usecnt)) + return -EBUSY; + + ret = dmah->device->ops.dealloc_dmah(dmah, attrs); + if (ret) + return ret; + + rdma_restrack_del(&dmah->res); + kfree(dmah); + return 0; +} + +static int UVERBS_HANDLER(UVERBS_METHOD_DMAH_ALLOC)( + struct uverbs_attr_bundle *attrs) +{ + struct ib_uobject *uobj = + uverbs_attr_get(attrs, UVERBS_ATTR_ALLOC_DMAH_HANDLE) + ->obj_attr.uobject; + struct ib_device *ib_dev = attrs->context->device; + struct ib_dmah *dmah; + int ret; + + dmah = rdma_zalloc_drv_obj(ib_dev, ib_dmah); + if (!dmah) + return -ENOMEM; + + if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_ALLOC_DMAH_CPU_ID)) { + ret = uverbs_copy_from(&dmah->cpu_id, attrs, + UVERBS_ATTR_ALLOC_DMAH_CPU_ID); + if (ret) + goto err; + + if (!cpumask_test_cpu(dmah->cpu_id, current->cpus_ptr)) { + ret = -EPERM; + goto err; + } + + dmah->valid_fields |= BIT(IB_DMAH_CPU_ID_EXISTS); + } + + if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_ALLOC_DMAH_TPH_MEM_TYPE)) { + dmah->mem_type = uverbs_attr_get_enum_id(attrs, + UVERBS_ATTR_ALLOC_DMAH_TPH_MEM_TYPE); + dmah->valid_fields |= BIT(IB_DMAH_MEM_TYPE_EXISTS); + } + + if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_ALLOC_DMAH_PH)) { + ret = uverbs_copy_from(&dmah->ph, attrs, + UVERBS_ATTR_ALLOC_DMAH_PH); + if (ret) + goto err; + + /* Per PCIe spec 6.2-1.0, only the lowest two bits are applicable */ + if (dmah->ph & 0xFC) { + ret = -EINVAL; + goto err; + } + + dmah->valid_fields |= BIT(IB_DMAH_PH_EXISTS); + } + + dmah->device = ib_dev; + dmah->uobject = uobj; + atomic_set(&dmah->usecnt, 0); + + rdma_restrack_new(&dmah->res, RDMA_RESTRACK_DMAH); + rdma_restrack_set_name(&dmah->res, NULL); + + ret = ib_dev->ops.alloc_dmah(dmah, attrs); + if (ret) { + rdma_restrack_put(&dmah->res); + goto err; + } + + uobj->object = dmah; + rdma_restrack_add(&dmah->res); + uverbs_finalize_uobj_create(attrs, UVERBS_ATTR_ALLOC_DMAH_HANDLE); + return 0; +err: + kfree(dmah); + return ret; +} + +static const struct uverbs_attr_spec uverbs_dmah_mem_type[] = { + [TPH_MEM_TYPE_VM] = { + .type = UVERBS_ATTR_TYPE_PTR_IN, + UVERBS_ATTR_NO_DATA(), + }, + [TPH_MEM_TYPE_PM] = { + .type = UVERBS_ATTR_TYPE_PTR_IN, + UVERBS_ATTR_NO_DATA(), + }, +}; + +DECLARE_UVERBS_NAMED_METHOD( + UVERBS_METHOD_DMAH_ALLOC, + UVERBS_ATTR_IDR(UVERBS_ATTR_ALLOC_DMAH_HANDLE, + UVERBS_OBJECT_DMAH, + UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_ALLOC_DMAH_CPU_ID, + UVERBS_ATTR_TYPE(u32), + UA_OPTIONAL), + UVERBS_ATTR_ENUM_IN(UVERBS_ATTR_ALLOC_DMAH_TPH_MEM_TYPE, + uverbs_dmah_mem_type, + UA_OPTIONAL), + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_ALLOC_DMAH_PH, + UVERBS_ATTR_TYPE(u8), + UA_OPTIONAL)); + +DECLARE_UVERBS_NAMED_METHOD_DESTROY( + UVERBS_METHOD_DMAH_FREE, + UVERBS_ATTR_IDR(UVERBS_ATTR_FREE_DMA_HANDLE, + UVERBS_OBJECT_DMAH, + UVERBS_ACCESS_DESTROY, + UA_MANDATORY)); + +DECLARE_UVERBS_NAMED_OBJECT(UVERBS_OBJECT_DMAH, + UVERBS_TYPE_ALLOC_IDR(uverbs_free_dmah), + &UVERBS_METHOD(UVERBS_METHOD_DMAH_ALLOC), + &UVERBS_METHOD(UVERBS_METHOD_DMAH_FREE)); + +const struct uapi_definition uverbs_def_obj_dmah[] = { + UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_DMAH, + UAPI_DEF_OBJ_NEEDS_FN(dealloc_dmah), + UAPI_DEF_OBJ_NEEDS_FN(alloc_dmah)), + {} +}; diff --git a/drivers/infiniband/core/uverbs_uapi.c b/drivers/infiniband/core/uverbs_uapi.c index a02916a3a79c..e00ea63175bd 100644 --- a/drivers/infiniband/core/uverbs_uapi.c +++ b/drivers/infiniband/core/uverbs_uapi.c @@ -631,6 +631,7 @@ static const struct uapi_definition uverbs_core_api[] = { UAPI_DEF_CHAIN(uverbs_def_obj_cq), UAPI_DEF_CHAIN(uverbs_def_obj_device), UAPI_DEF_CHAIN(uverbs_def_obj_dm), + UAPI_DEF_CHAIN(uverbs_def_obj_dmah), UAPI_DEF_CHAIN(uverbs_def_obj_flow_action), UAPI_DEF_CHAIN(uverbs_def_obj_intf), UAPI_DEF_CHAIN(uverbs_def_obj_mr), diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 3fb1c963eeb0..9ad253687935 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -42,6 +42,7 @@ #include #include #include +#include #define IB_FW_VERSION_NAME_MAX ETHTOOL_FWVERS_LEN @@ -1846,6 +1847,27 @@ struct ib_dm { atomic_t usecnt; }; +/* bit values to mark existence of ib_dmah fields */ +enum { + IB_DMAH_CPU_ID_EXISTS, + IB_DMAH_MEM_TYPE_EXISTS, + IB_DMAH_PH_EXISTS, +}; + +struct ib_dmah { + struct ib_device *device; + struct ib_uobject *uobject; + /* + * Implementation details of the RDMA core, don't use in drivers: + */ + struct rdma_restrack_entry res; + u32 cpu_id; + enum tph_mem_type mem_type; + atomic_t usecnt; + u8 ph; + u8 valid_fields; /* use IB_DMAH_XXX_EXISTS */ +}; + struct ib_mr { struct ib_device *device; struct ib_pd *pd; @@ -2573,6 +2595,9 @@ struct ib_device_ops { struct ib_dm_alloc_attr *attr, struct uverbs_attr_bundle *attrs); int (*dealloc_dm)(struct ib_dm *dm, struct uverbs_attr_bundle *attrs); + int (*alloc_dmah)(struct ib_dmah *ibdmah, + struct uverbs_attr_bundle *attrs); + int (*dealloc_dmah)(struct ib_dmah *dmah, struct uverbs_attr_bundle *attrs); struct ib_mr *(*reg_dm_mr)(struct ib_pd *pd, struct ib_dm *dm, struct ib_dm_mr_attr *attr, struct uverbs_attr_bundle *attrs); @@ -2730,6 +2755,7 @@ struct ib_device_ops { DECLARE_RDMA_OBJ_SIZE(ib_ah); DECLARE_RDMA_OBJ_SIZE(ib_counters); DECLARE_RDMA_OBJ_SIZE(ib_cq); + DECLARE_RDMA_OBJ_SIZE(ib_dmah); DECLARE_RDMA_OBJ_SIZE(ib_mw); DECLARE_RDMA_OBJ_SIZE(ib_pd); DECLARE_RDMA_OBJ_SIZE(ib_qp); diff --git a/include/rdma/restrack.h b/include/rdma/restrack.h index 0d69ded73bf2..8a9bcf77dace 100644 --- a/include/rdma/restrack.h +++ b/include/rdma/restrack.h @@ -56,6 +56,10 @@ enum rdma_restrack_type { * @RDMA_RESTRACK_SRQ: Shared receive queue (SRQ) */ RDMA_RESTRACK_SRQ, + /** + * @RDMA_RESTRACK_DMAH: DMA handle + */ + RDMA_RESTRACK_DMAH, /** * @RDMA_RESTRACK_MAX: Last entry, used for array dclarations */ diff --git a/include/uapi/rdma/ib_user_ioctl_cmds.h b/include/uapi/rdma/ib_user_ioctl_cmds.h index ece923ab48a0..3bb72a259c29 100644 --- a/include/uapi/rdma/ib_user_ioctl_cmds.h +++ b/include/uapi/rdma/ib_user_ioctl_cmds.h @@ -55,6 +55,7 @@ enum uverbs_default_objects { UVERBS_OBJECT_DM, UVERBS_OBJECT_COUNTERS, UVERBS_OBJECT_ASYNC_EVENT, + UVERBS_OBJECT_DMAH, }; enum { @@ -240,6 +241,22 @@ enum uverbs_methods_dm { UVERBS_METHOD_DM_FREE, }; +enum uverbs_attrs_alloc_dmah_cmd_attr_ids { + UVERBS_ATTR_ALLOC_DMAH_HANDLE, + UVERBS_ATTR_ALLOC_DMAH_CPU_ID, + UVERBS_ATTR_ALLOC_DMAH_TPH_MEM_TYPE, + UVERBS_ATTR_ALLOC_DMAH_PH, +}; + +enum uverbs_attrs_free_dmah_cmd_attr_ids { + UVERBS_ATTR_FREE_DMA_HANDLE, +}; + +enum uverbs_methods_dmah { + UVERBS_METHOD_DMAH_ALLOC, + UVERBS_METHOD_DMAH_FREE, +}; + enum uverbs_attrs_reg_dm_mr_cmd_attr_ids { UVERBS_ATTR_REG_DM_MR_HANDLE, UVERBS_ATTR_REG_DM_MR_OFFSET, -- cgit v1.2.3 From a272019a46c918575f10cc529c893585d46b3b55 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 17 Jul 2025 15:17:31 +0300 Subject: IB: Extend UVERBS_METHOD_REG_MR to get DMAH Extend UVERBS_METHOD_REG_MR to get DMAH and pass it to all drivers. It will be used in mlx5 driver as part of the next patch from the series. Signed-off-by: Yishai Hadas Reviewed-by: Edward Srouji Link: https://patch.msgid.link/2ae1e628c0675db81f092cc00d3ad6fbf6139405.1752752567.git.leon@kernel.org Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/uverbs_cmd.c | 2 +- drivers/infiniband/core/uverbs_std_types_mr.c | 27 ++++++++++++++++++++----- drivers/infiniband/core/verbs.c | 5 ++++- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 8 ++++++++ drivers/infiniband/hw/bnxt_re/ib_verbs.h | 2 ++ drivers/infiniband/hw/cxgb4/iw_cxgb4.h | 1 + drivers/infiniband/hw/cxgb4/mem.c | 6 +++++- drivers/infiniband/hw/efa/efa.h | 2 ++ drivers/infiniband/hw/efa/efa_verbs.c | 12 +++++++++++ drivers/infiniband/hw/erdma/erdma_verbs.c | 6 +++++- drivers/infiniband/hw/erdma/erdma_verbs.h | 3 ++- drivers/infiniband/hw/hns/hns_roce_device.h | 1 + drivers/infiniband/hw/hns/hns_roce_mr.c | 6 ++++++ drivers/infiniband/hw/irdma/verbs.c | 9 +++++++++ drivers/infiniband/hw/mana/mana_ib.h | 2 ++ drivers/infiniband/hw/mana/mr.c | 8 ++++++++ drivers/infiniband/hw/mlx4/mlx4_ib.h | 1 + drivers/infiniband/hw/mlx4/mr.c | 4 ++++ drivers/infiniband/hw/mlx5/mlx5_ib.h | 2 ++ drivers/infiniband/hw/mlx5/mr.c | 8 +++++--- drivers/infiniband/hw/mthca/mthca_provider.c | 6 +++++- drivers/infiniband/hw/ocrdma/ocrdma_verbs.c | 6 +++++- drivers/infiniband/hw/ocrdma/ocrdma_verbs.h | 3 ++- drivers/infiniband/hw/qedr/verbs.c | 6 +++++- drivers/infiniband/hw/qedr/verbs.h | 3 ++- drivers/infiniband/hw/usnic/usnic_ib_verbs.c | 4 ++++ drivers/infiniband/hw/usnic/usnic_ib_verbs.h | 1 + drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c | 5 +++++ drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h | 1 + drivers/infiniband/sw/rdmavt/mr.c | 5 +++++ drivers/infiniband/sw/rdmavt/mr.h | 1 + drivers/infiniband/sw/rxe/rxe_verbs.c | 4 ++++ drivers/infiniband/sw/siw/siw_verbs.c | 7 ++++++- drivers/infiniband/sw/siw/siw_verbs.h | 3 ++- include/rdma/ib_verbs.h | 3 +++ include/uapi/rdma/ib_user_ioctl_cmds.h | 1 + 36 files changed, 154 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 88aa8d4599df..ce16404cdfb8 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -741,7 +741,7 @@ static int ib_uverbs_reg_mr(struct uverbs_attr_bundle *attrs) } mr = pd->device->ops.reg_user_mr(pd, cmd.start, cmd.length, cmd.hca_va, - cmd.access_flags, + cmd.access_flags, NULL, &attrs->driver_udata); if (IS_ERR(mr)) { ret = PTR_ERR(mr); diff --git a/drivers/infiniband/core/uverbs_std_types_mr.c b/drivers/infiniband/core/uverbs_std_types_mr.c index 1bd4b17b5515..570b9656801d 100644 --- a/drivers/infiniband/core/uverbs_std_types_mr.c +++ b/drivers/infiniband/core/uverbs_std_types_mr.c @@ -238,7 +238,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_REG_DMABUF_MR)( return ret; mr = pd->device->ops.reg_user_mr_dmabuf(pd, offset, length, iova, fd, - access_flags, + access_flags, NULL, attrs); if (IS_ERR(mr)) return PTR_ERR(mr); @@ -276,6 +276,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_REG_MR)( u32 valid_access_flags = IB_ACCESS_SUPPORTED; u64 length, iova, fd_offset = 0, addr = 0; struct ib_device *ib_dev = pd->device; + struct ib_dmah *dmah = NULL; bool has_fd_offset = false; bool has_addr = false; bool has_fd = false; @@ -340,6 +341,13 @@ static int UVERBS_HANDLER(UVERBS_METHOD_REG_MR)( return -EINVAL; } + if (uverbs_attr_is_valid(attrs, UVERBS_ATTR_REG_MR_DMA_HANDLE)) { + dmah = uverbs_attr_get_obj(attrs, + UVERBS_ATTR_REG_MR_DMA_HANDLE); + if (IS_ERR(dmah)) + return PTR_ERR(dmah); + } + ret = uverbs_get_flags32(&access_flags, attrs, UVERBS_ATTR_REG_MR_ACCESS_FLAGS, valid_access_flags); @@ -351,11 +359,12 @@ static int UVERBS_HANDLER(UVERBS_METHOD_REG_MR)( return ret; if (has_fd) - mr = pd->device->ops.reg_user_mr_dmabuf(pd, fd_offset, length, iova, - fd, access_flags, attrs); + mr = pd->device->ops.reg_user_mr_dmabuf(pd, fd_offset, length, + iova, fd, access_flags, + dmah, attrs); else - mr = pd->device->ops.reg_user_mr(pd, addr, length, - iova, access_flags, NULL); + mr = pd->device->ops.reg_user_mr(pd, addr, length, iova, + access_flags, dmah, NULL); if (IS_ERR(mr)) return PTR_ERR(mr); @@ -365,6 +374,10 @@ static int UVERBS_HANDLER(UVERBS_METHOD_REG_MR)( mr->type = IB_MR_TYPE_USER; mr->uobject = uobj; atomic_inc(&pd->usecnt); + if (dmah) { + mr->dmah = dmah; + atomic_inc(&dmah->usecnt); + } rdma_restrack_new(&mr->res, RDMA_RESTRACK_MR); rdma_restrack_set_name(&mr->res, NULL); rdma_restrack_add(&mr->res); @@ -488,6 +501,10 @@ DECLARE_UVERBS_NAMED_METHOD( UVERBS_OBJECT_PD, UVERBS_ACCESS_READ, UA_MANDATORY), + UVERBS_ATTR_IDR(UVERBS_ATTR_REG_MR_DMA_HANDLE, + UVERBS_OBJECT_DMAH, + UVERBS_ACCESS_READ, + UA_OPTIONAL), UVERBS_ATTR_PTR_IN(UVERBS_ATTR_REG_MR_IOVA, UVERBS_ATTR_TYPE(u64), UA_MANDATORY), diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 75fde0fe9989..3a5f81402d2f 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -2223,7 +2223,7 @@ struct ib_mr *ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, } mr = pd->device->ops.reg_user_mr(pd, start, length, virt_addr, - access_flags, NULL); + access_flags, NULL, NULL); if (IS_ERR(mr)) return mr; @@ -2262,6 +2262,7 @@ int ib_dereg_mr_user(struct ib_mr *mr, struct ib_udata *udata) { struct ib_pd *pd = mr->pd; struct ib_dm *dm = mr->dm; + struct ib_dmah *dmah = mr->dmah; struct ib_sig_attrs *sig_attrs = mr->sig_attrs; int ret; @@ -2272,6 +2273,8 @@ int ib_dereg_mr_user(struct ib_mr *mr, struct ib_udata *udata) atomic_dec(&pd->usecnt); if (dm) atomic_dec(&dm->usecnt); + if (dmah) + atomic_dec(&dmah->usecnt); kfree(sig_attrs); } diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index 3a627acb82ce..37c2bc3bdba5 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -4235,6 +4235,7 @@ free_mr: struct ib_mr *bnxt_re_reg_user_mr(struct ib_pd *ib_pd, u64 start, u64 length, u64 virt_addr, int mr_access_flags, + struct ib_dmah *dmah, struct ib_udata *udata) { struct bnxt_re_pd *pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd); @@ -4242,6 +4243,9 @@ struct ib_mr *bnxt_re_reg_user_mr(struct ib_pd *ib_pd, u64 start, u64 length, struct ib_umem *umem; struct ib_mr *ib_mr; + if (dmah) + return ERR_PTR(-EOPNOTSUPP); + umem = ib_umem_get(&rdev->ibdev, start, length, mr_access_flags); if (IS_ERR(umem)) return ERR_CAST(umem); @@ -4255,6 +4259,7 @@ struct ib_mr *bnxt_re_reg_user_mr(struct ib_pd *ib_pd, u64 start, u64 length, struct ib_mr *bnxt_re_reg_user_mr_dmabuf(struct ib_pd *ib_pd, u64 start, u64 length, u64 virt_addr, int fd, int mr_access_flags, + struct ib_dmah *dmah, struct uverbs_attr_bundle *attrs) { struct bnxt_re_pd *pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd); @@ -4263,6 +4268,9 @@ struct ib_mr *bnxt_re_reg_user_mr_dmabuf(struct ib_pd *ib_pd, u64 start, struct ib_umem *umem; struct ib_mr *ib_mr; + if (dmah) + return ERR_PTR(-EOPNOTSUPP); + umem_dmabuf = ib_umem_dmabuf_get_pinned(&rdev->ibdev, start, length, fd, mr_access_flags); if (IS_ERR(umem_dmabuf)) diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.h b/drivers/infiniband/hw/bnxt_re/ib_verbs.h index 22c9eb8e9cfc..fe00ab691a51 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.h +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.h @@ -258,10 +258,12 @@ struct ib_mw *bnxt_re_alloc_mw(struct ib_pd *ib_pd, enum ib_mw_type type, int bnxt_re_dealloc_mw(struct ib_mw *mw); struct ib_mr *bnxt_re_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int mr_access_flags, + struct ib_dmah *dmah, struct ib_udata *udata); struct ib_mr *bnxt_re_reg_user_mr_dmabuf(struct ib_pd *ib_pd, u64 start, u64 length, u64 virt_addr, int fd, int mr_access_flags, + struct ib_dmah *dmah, struct uverbs_attr_bundle *attrs); int bnxt_re_alloc_ucontext(struct ib_ucontext *ctx, struct ib_udata *udata); void bnxt_re_dealloc_ucontext(struct ib_ucontext *context); diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h index 5b3007acaa1f..e17c1252536b 100644 --- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h +++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h @@ -1006,6 +1006,7 @@ int c4iw_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, void c4iw_dealloc(struct uld_ctx *ctx); struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt, int acc, + struct ib_dmah *dmah, struct ib_udata *udata); struct ib_mr *c4iw_get_dma_mr(struct ib_pd *pd, int acc); int c4iw_dereg_mr(struct ib_mr *ib_mr, struct ib_udata *udata); diff --git a/drivers/infiniband/hw/cxgb4/mem.c b/drivers/infiniband/hw/cxgb4/mem.c index a2c71a1d93d5..dcdfe250bdbe 100644 --- a/drivers/infiniband/hw/cxgb4/mem.c +++ b/drivers/infiniband/hw/cxgb4/mem.c @@ -489,7 +489,8 @@ err_free_mhp: } struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, - u64 virt, int acc, struct ib_udata *udata) + u64 virt, int acc, struct ib_dmah *dmah, + struct ib_udata *udata) { __be64 *pages; int shift, n, i; @@ -501,6 +502,9 @@ struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, pr_debug("ib_pd %p\n", pd); + if (dmah) + return ERR_PTR(-EOPNOTSUPP); + if (length == ~0ULL) return ERR_PTR(-EINVAL); diff --git a/drivers/infiniband/hw/efa/efa.h b/drivers/infiniband/hw/efa/efa.h index 3d49c1db928e..96f9c3bc98b2 100644 --- a/drivers/infiniband/hw/efa/efa.h +++ b/drivers/infiniband/hw/efa/efa.h @@ -167,10 +167,12 @@ int efa_create_cq_umem(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, struct ib_umem *umem, struct uverbs_attr_bundle *attrs); struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length, u64 virt_addr, int access_flags, + struct ib_dmah *dmah, struct ib_udata *udata); struct ib_mr *efa_reg_user_mr_dmabuf(struct ib_pd *ibpd, u64 start, u64 length, u64 virt_addr, int fd, int access_flags, + struct ib_dmah *dmah, struct uverbs_attr_bundle *attrs); int efa_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata); int efa_get_port_immutable(struct ib_device *ibdev, u32 port_num, diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c index 0f68aec12883..886923d5fe50 100644 --- a/drivers/infiniband/hw/efa/efa_verbs.c +++ b/drivers/infiniband/hw/efa/efa_verbs.c @@ -1765,6 +1765,7 @@ static int efa_register_mr(struct ib_pd *ibpd, struct efa_mr *mr, u64 start, struct ib_mr *efa_reg_user_mr_dmabuf(struct ib_pd *ibpd, u64 start, u64 length, u64 virt_addr, int fd, int access_flags, + struct ib_dmah *dmah, struct uverbs_attr_bundle *attrs) { struct efa_dev *dev = to_edev(ibpd->device); @@ -1772,6 +1773,11 @@ struct ib_mr *efa_reg_user_mr_dmabuf(struct ib_pd *ibpd, u64 start, struct efa_mr *mr; int err; + if (dmah) { + err = -EOPNOTSUPP; + goto err_out; + } + mr = efa_alloc_mr(ibpd, access_flags, &attrs->driver_udata); if (IS_ERR(mr)) { err = PTR_ERR(mr); @@ -1804,12 +1810,18 @@ err_out: struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length, u64 virt_addr, int access_flags, + struct ib_dmah *dmah, struct ib_udata *udata) { struct efa_dev *dev = to_edev(ibpd->device); struct efa_mr *mr; int err; + if (dmah) { + err = -EOPNOTSUPP; + goto err_out; + } + mr = efa_alloc_mr(ibpd, access_flags, udata); if (IS_ERR(mr)) { err = PTR_ERR(mr); diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.c b/drivers/infiniband/hw/erdma/erdma_verbs.c index ec0ad4086066..94c211df09d8 100644 --- a/drivers/infiniband/hw/erdma/erdma_verbs.c +++ b/drivers/infiniband/hw/erdma/erdma_verbs.c @@ -1200,13 +1200,17 @@ int erdma_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, } struct ib_mr *erdma_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 len, - u64 virt, int access, struct ib_udata *udata) + u64 virt, int access, struct ib_dmah *dmah, + struct ib_udata *udata) { struct erdma_mr *mr = NULL; struct erdma_dev *dev = to_edev(ibpd->device); u32 stag; int ret; + if (dmah) + return ERR_PTR(-EOPNOTSUPP); + if (!len || len > dev->attrs.max_mr_size) return ERR_PTR(-EINVAL); diff --git a/drivers/infiniband/hw/erdma/erdma_verbs.h b/drivers/infiniband/hw/erdma/erdma_verbs.h index f9408ccc8bad..ef411b81fbd7 100644 --- a/drivers/infiniband/hw/erdma/erdma_verbs.h +++ b/drivers/infiniband/hw/erdma/erdma_verbs.h @@ -452,7 +452,8 @@ int erdma_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); void erdma_disassociate_ucontext(struct ib_ucontext *ibcontext); int erdma_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags); struct ib_mr *erdma_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 len, - u64 virt, int access, struct ib_udata *udata); + u64 virt, int access, struct ib_dmah *dmah, + struct ib_udata *udata); struct ib_mr *erdma_get_dma_mr(struct ib_pd *ibpd, int rights); int erdma_dereg_mr(struct ib_mr *ibmr, struct ib_udata *data); int erdma_mmap(struct ib_ucontext *ctx, struct vm_area_struct *vma); diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 25f77b1fa773..78ee04a48a74 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -1219,6 +1219,7 @@ int hns_roce_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata); struct ib_mr *hns_roce_get_dma_mr(struct ib_pd *pd, int acc); struct ib_mr *hns_roce_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int access_flags, + struct ib_dmah *dmah, struct ib_udata *udata); struct ib_mr *hns_roce_rereg_user_mr(struct ib_mr *mr, int flags, u64 start, u64 length, u64 virt_addr, diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c index ebef93559225..0f037e545520 100644 --- a/drivers/infiniband/hw/hns/hns_roce_mr.c +++ b/drivers/infiniband/hw/hns/hns_roce_mr.c @@ -231,12 +231,18 @@ err_free: struct ib_mr *hns_roce_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int access_flags, + struct ib_dmah *dmah, struct ib_udata *udata) { struct hns_roce_dev *hr_dev = to_hr_dev(pd->device); struct hns_roce_mr *mr; int ret; + if (dmah) { + ret = -EOPNOTSUPP; + goto err_out; + } + mr = kzalloc(sizeof(*mr), GFP_KERNEL); if (!mr) { ret = -ENOMEM; diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c index 1e8c92826de2..da5a41b275d8 100644 --- a/drivers/infiniband/hw/irdma/verbs.c +++ b/drivers/infiniband/hw/irdma/verbs.c @@ -3013,10 +3013,12 @@ static int irdma_reg_user_mr_type_cq(struct irdma_mem_reg_req req, * @len: length of mr * @virt: virtual address * @access: access of mr + * @dmah: dma handle * @udata: user data */ static struct ib_mr *irdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 len, u64 virt, int access, + struct ib_dmah *dmah, struct ib_udata *udata) { #define IRDMA_MEM_REG_MIN_REQ_LEN offsetofend(struct irdma_mem_reg_req, sq_pages) @@ -3026,6 +3028,9 @@ static struct ib_mr *irdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 len, struct irdma_mr *iwmr = NULL; int err; + if (dmah) + return ERR_PTR(-EOPNOTSUPP); + if (len > iwdev->rf->sc_dev.hw_attrs.max_mr_size) return ERR_PTR(-EINVAL); @@ -3085,6 +3090,7 @@ error: static struct ib_mr *irdma_reg_user_mr_dmabuf(struct ib_pd *pd, u64 start, u64 len, u64 virt, int fd, int access, + struct ib_dmah *dmah, struct uverbs_attr_bundle *attrs) { struct irdma_device *iwdev = to_iwdev(pd->device); @@ -3092,6 +3098,9 @@ static struct ib_mr *irdma_reg_user_mr_dmabuf(struct ib_pd *pd, u64 start, struct irdma_mr *iwmr; int err; + if (dmah) + return ERR_PTR(-EOPNOTSUPP); + if (len > iwdev->rf->sc_dev.hw_attrs.max_mr_size) return ERR_PTR(-EINVAL); diff --git a/drivers/infiniband/hw/mana/mana_ib.h b/drivers/infiniband/hw/mana/mana_ib.h index 369825fdeff8..60a53f1958cf 100644 --- a/drivers/infiniband/hw/mana/mana_ib.h +++ b/drivers/infiniband/hw/mana/mana_ib.h @@ -632,6 +632,7 @@ struct ib_mr *mana_ib_get_dma_mr(struct ib_pd *ibpd, int access_flags); struct ib_mr *mana_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 iova, int access_flags, + struct ib_dmah *dmah, struct ib_udata *udata); int mana_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata); @@ -721,5 +722,6 @@ int mana_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags); struct ib_mr *mana_ib_reg_user_mr_dmabuf(struct ib_pd *ibpd, u64 start, u64 length, u64 iova, int fd, int mr_access_flags, + struct ib_dmah *dmah, struct uverbs_attr_bundle *attrs); #endif diff --git a/drivers/infiniband/hw/mana/mr.c b/drivers/infiniband/hw/mana/mr.c index 6d974d0a8400..55701046ffba 100644 --- a/drivers/infiniband/hw/mana/mr.c +++ b/drivers/infiniband/hw/mana/mr.c @@ -106,6 +106,7 @@ static int mana_ib_gd_destroy_mr(struct mana_ib_dev *dev, u64 mr_handle) struct ib_mr *mana_ib_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 length, u64 iova, int access_flags, + struct ib_dmah *dmah, struct ib_udata *udata) { struct mana_ib_pd *pd = container_of(ibpd, struct mana_ib_pd, ibpd); @@ -116,6 +117,9 @@ struct ib_mr *mana_ib_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 length, u64 dma_region_handle; int err; + if (dmah) + return ERR_PTR(-EOPNOTSUPP); + dev = container_of(ibdev, struct mana_ib_dev, ib_dev); ibdev_dbg(ibdev, @@ -188,6 +192,7 @@ err_free: struct ib_mr *mana_ib_reg_user_mr_dmabuf(struct ib_pd *ibpd, u64 start, u64 length, u64 iova, int fd, int access_flags, + struct ib_dmah *dmah, struct uverbs_attr_bundle *attrs) { struct mana_ib_pd *pd = container_of(ibpd, struct mana_ib_pd, ibpd); @@ -199,6 +204,9 @@ struct ib_mr *mana_ib_reg_user_mr_dmabuf(struct ib_pd *ibpd, u64 start, u64 leng u64 dma_region_handle; int err; + if (dmah) + return ERR_PTR(-EOPNOTSUPP); + dev = container_of(ibdev, struct mana_ib_dev, ib_dev); access_flags &= ~IB_ACCESS_OPTIONAL; diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index f53b1846594c..5df5b955114e 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -759,6 +759,7 @@ int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt, struct ib_umem *umem); struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int access_flags, + struct ib_dmah *dmah, struct ib_udata *udata); int mlx4_ib_dereg_mr(struct ib_mr *mr, struct ib_udata *udata); int mlx4_ib_alloc_mw(struct ib_mw *mw, struct ib_udata *udata); diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c index e77645a673fb..94464f1694d9 100644 --- a/drivers/infiniband/hw/mlx4/mr.c +++ b/drivers/infiniband/hw/mlx4/mr.c @@ -139,6 +139,7 @@ static struct ib_umem *mlx4_get_umem_mr(struct ib_device *device, u64 start, struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int access_flags, + struct ib_dmah *dmah, struct ib_udata *udata) { struct mlx4_ib_dev *dev = to_mdev(pd->device); @@ -147,6 +148,9 @@ struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, int err; int n; + if (dmah) + return ERR_PTR(-EOPNOTSUPP); + mr = kzalloc(sizeof(*mr), GFP_KERNEL); if (!mr) return ERR_PTR(-ENOMEM); diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index ff6026d2ed7c..dfd231333509 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -1370,10 +1370,12 @@ int mlx5_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata); struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc); struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int access_flags, + struct ib_dmah *dmah, struct ib_udata *udata); struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int fd, int access_flags, + struct ib_dmah *dmah, struct uverbs_attr_bundle *attrs); int mlx5_ib_advise_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice, diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 50f7a882efde..69f1403cc578 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -1529,13 +1529,14 @@ err_dereg_mr: struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 iova, int access_flags, + struct ib_dmah *dmah, struct ib_udata *udata) { struct mlx5_ib_dev *dev = to_mdev(pd->device); struct ib_umem *umem; int err; - if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM)) + if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) || dmah) return ERR_PTR(-EOPNOTSUPP); mlx5_ib_dbg(dev, "start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n", @@ -1689,6 +1690,7 @@ end: struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset, u64 length, u64 virt_addr, int fd, int access_flags, + struct ib_dmah *dmah, struct uverbs_attr_bundle *attrs) { struct mlx5_ib_dev *dev = to_mdev(pd->device); @@ -1696,7 +1698,7 @@ struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset, int err; if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) || - !IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) + !IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) || dmah) return ERR_PTR(-EOPNOTSUPP); if (uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_REG_DMABUF_MR_ACCESS_FLAGS)) { @@ -1903,7 +1905,7 @@ struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, */ recreate: return mlx5_ib_reg_user_mr(new_pd, start, length, iova, - new_access_flags, udata); + new_access_flags, NULL, udata); } static int diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c index 6a1e2e79ddc3..dd572d76866c 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.c +++ b/drivers/infiniband/hw/mthca/mthca_provider.c @@ -825,7 +825,8 @@ static struct ib_mr *mthca_get_dma_mr(struct ib_pd *pd, int acc) } static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, - u64 virt, int acc, struct ib_udata *udata) + u64 virt, int acc, struct ib_dmah *dmah, + struct ib_udata *udata) { struct mthca_dev *dev = to_mdev(pd->device); struct ib_block_iter biter; @@ -838,6 +839,9 @@ static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, int err = 0; int write_mtt_size; + if (dmah) + return ERR_PTR(-EOPNOTSUPP); + if (udata->inlen < sizeof ucmd) { if (!context->reg_mr_warned) { mthca_warn(dev, "Process '%s' did not pass in MR attrs.\n", diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c index 979de8f8df14..46d911fd38de 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c @@ -847,13 +847,17 @@ static void build_user_pbes(struct ocrdma_dev *dev, struct ocrdma_mr *mr) } struct ib_mr *ocrdma_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 len, - u64 usr_addr, int acc, struct ib_udata *udata) + u64 usr_addr, int acc, struct ib_dmah *dmah, + struct ib_udata *udata) { int status = -ENOMEM; struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device); struct ocrdma_mr *mr; struct ocrdma_pd *pd; + if (dmah) + return ERR_PTR(-EOPNOTSUPP); + pd = get_ocrdma_pd(ibpd); if (acc & IB_ACCESS_REMOTE_WRITE && !(acc & IB_ACCESS_LOCAL_WRITE)) diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h index 0644346d8d98..6c5c3755b8a9 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h @@ -98,7 +98,8 @@ int ocrdma_post_srq_recv(struct ib_srq *, const struct ib_recv_wr *, int ocrdma_dereg_mr(struct ib_mr *ib_mr, struct ib_udata *udata); struct ib_mr *ocrdma_get_dma_mr(struct ib_pd *, int acc); struct ib_mr *ocrdma_reg_user_mr(struct ib_pd *, u64 start, u64 length, - u64 virt, int acc, struct ib_udata *); + u64 virt, int acc, struct ib_dmah *dmah, + struct ib_udata *); struct ib_mr *ocrdma_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, u32 max_num_sg); int ocrdma_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c index 568a5b18803f..ab9bf0922979 100644 --- a/drivers/infiniband/hw/qedr/verbs.c +++ b/drivers/infiniband/hw/qedr/verbs.c @@ -2953,13 +2953,17 @@ done: } struct ib_mr *qedr_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 len, - u64 usr_addr, int acc, struct ib_udata *udata) + u64 usr_addr, int acc, struct ib_dmah *dmah, + struct ib_udata *udata) { struct qedr_dev *dev = get_qedr_dev(ibpd->device); struct qedr_mr *mr; struct qedr_pd *pd; int rc = -ENOMEM; + if (dmah) + return ERR_PTR(-EOPNOTSUPP); + pd = get_qedr_pd(ibpd); DP_DEBUG(dev, QEDR_MSG_MR, "qedr_register user mr pd = %d start = %lld, len = %lld, usr_addr = %lld, acc = %d\n", diff --git a/drivers/infiniband/hw/qedr/verbs.h b/drivers/infiniband/hw/qedr/verbs.h index 5731458abb06..62420a15101b 100644 --- a/drivers/infiniband/hw/qedr/verbs.h +++ b/drivers/infiniband/hw/qedr/verbs.h @@ -79,7 +79,8 @@ int qedr_dereg_mr(struct ib_mr *ib_mr, struct ib_udata *udata); struct ib_mr *qedr_get_dma_mr(struct ib_pd *, int acc); struct ib_mr *qedr_reg_user_mr(struct ib_pd *, u64 start, u64 length, - u64 virt, int acc, struct ib_udata *); + u64 virt, int acc, struct ib_dmah *dmah, + struct ib_udata *); int qedr_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, unsigned int *sg_offset); diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c index 217af34e82b3..ae5df96589d9 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c @@ -592,6 +592,7 @@ int usnic_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata) struct ib_mr *usnic_ib_reg_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int access_flags, + struct ib_dmah *dmah, struct ib_udata *udata) { struct usnic_ib_mr *mr; @@ -600,6 +601,9 @@ struct ib_mr *usnic_ib_reg_mr(struct ib_pd *pd, u64 start, u64 length, usnic_dbg("start 0x%llx va 0x%llx length 0x%llx\n", start, virt_addr, length); + if (dmah) + return ERR_PTR(-EOPNOTSUPP); + mr = kzalloc(sizeof(*mr), GFP_KERNEL); if (!mr) return ERR_PTR(-ENOMEM); diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.h b/drivers/infiniband/hw/usnic/usnic_ib_verbs.h index 53f53f2d53be..e3031ac32488 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.h +++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.h @@ -60,6 +60,7 @@ int usnic_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, int usnic_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata); struct ib_mr *usnic_ib_reg_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int access_flags, + struct ib_dmah *dmah, struct ib_udata *udata); int usnic_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata); int usnic_ib_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata); diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c index e80848bfb3bd..ec7a00c8285b 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c @@ -104,12 +104,14 @@ struct ib_mr *pvrdma_get_dma_mr(struct ib_pd *pd, int acc) * @length: length of region * @virt_addr: I/O virtual address * @access_flags: access flags for memory region + * @dmah: dma handle * @udata: user data * * @return: ib_mr pointer on success, otherwise returns an errno. */ struct ib_mr *pvrdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int access_flags, + struct ib_dmah *dmah, struct ib_udata *udata) { struct pvrdma_dev *dev = to_vdev(pd->device); @@ -121,6 +123,9 @@ struct ib_mr *pvrdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, struct pvrdma_cmd_create_mr_resp *resp = &rsp.create_mr_resp; int ret, npages; + if (dmah) + return ERR_PTR(-EOPNOTSUPP); + if (length == 0 || length > dev->dsr->caps.max_mr_size) { dev_warn(&dev->pdev->dev, "invalid mem region length\n"); return ERR_PTR(-EINVAL); diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h index fd47b0b1df5c..603e5a9311eb 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h @@ -366,6 +366,7 @@ int pvrdma_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata); struct ib_mr *pvrdma_get_dma_mr(struct ib_pd *pd, int acc); struct ib_mr *pvrdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int access_flags, + struct ib_dmah *dmah, struct ib_udata *udata); int pvrdma_dereg_mr(struct ib_mr *mr, struct ib_udata *udata); struct ib_mr *pvrdma_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, diff --git a/drivers/infiniband/sw/rdmavt/mr.c b/drivers/infiniband/sw/rdmavt/mr.c index 5ed5cfc2b280..86e482593a85 100644 --- a/drivers/infiniband/sw/rdmavt/mr.c +++ b/drivers/infiniband/sw/rdmavt/mr.c @@ -329,12 +329,14 @@ bail: * @length: length of region to register * @virt_addr: associated virtual address * @mr_access_flags: access flags for this memory region + * @dmah: dma handle * @udata: unused by the driver * * Return: the memory region on success, otherwise returns an errno. */ struct ib_mr *rvt_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int mr_access_flags, + struct ib_dmah *dmah, struct ib_udata *udata) { struct rvt_mr *mr; @@ -343,6 +345,9 @@ struct ib_mr *rvt_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, int n, m; struct ib_mr *ret; + if (dmah) + return ERR_PTR(-EOPNOTSUPP); + if (length == 0) return ERR_PTR(-EINVAL); diff --git a/drivers/infiniband/sw/rdmavt/mr.h b/drivers/infiniband/sw/rdmavt/mr.h index 44afe2731741..72dab48307b7 100644 --- a/drivers/infiniband/sw/rdmavt/mr.h +++ b/drivers/infiniband/sw/rdmavt/mr.h @@ -26,6 +26,7 @@ void rvt_mr_exit(struct rvt_dev_info *rdi); struct ib_mr *rvt_get_dma_mr(struct ib_pd *pd, int acc); struct ib_mr *rvt_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int mr_access_flags, + struct ib_dmah *dmah, struct ib_udata *udata); int rvt_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata); struct ib_mr *rvt_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index 2331e698a65b..f48d6e132954 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -1271,6 +1271,7 @@ err_free: static struct ib_mr *rxe_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 length, u64 iova, int access, + struct ib_dmah *dmah, struct ib_udata *udata) { struct rxe_dev *rxe = to_rdev(ibpd->device); @@ -1278,6 +1279,9 @@ static struct ib_mr *rxe_reg_user_mr(struct ib_pd *ibpd, u64 start, struct rxe_mr *mr; int err, cleanup_err; + if (dmah) + return ERR_PTR(-EOPNOTSUPP); + if (access & ~RXE_ACCESS_SUPPORTED_MR) { rxe_err_pd(pd, "access = %#x not supported (%#x)\n", access, RXE_ACCESS_SUPPORTED_MR); diff --git a/drivers/infiniband/sw/siw/siw_verbs.c b/drivers/infiniband/sw/siw/siw_verbs.c index 2b2a7b8e93b0..35c3bde0d00a 100644 --- a/drivers/infiniband/sw/siw/siw_verbs.c +++ b/drivers/infiniband/sw/siw/siw_verbs.c @@ -1321,10 +1321,12 @@ int siw_dereg_mr(struct ib_mr *base_mr, struct ib_udata *udata) * @len: len of MR * @rnic_va: not used by siw * @rights: MR access rights + * @dmah: dma handle * @udata: user buffer to communicate STag and Key. */ struct ib_mr *siw_reg_user_mr(struct ib_pd *pd, u64 start, u64 len, - u64 rnic_va, int rights, struct ib_udata *udata) + u64 rnic_va, int rights, struct ib_dmah *dmah, + struct ib_udata *udata) { struct siw_mr *mr = NULL; struct siw_umem *umem = NULL; @@ -1336,6 +1338,9 @@ struct ib_mr *siw_reg_user_mr(struct ib_pd *pd, u64 start, u64 len, (void *)(uintptr_t)start, (void *)(uintptr_t)rnic_va, (unsigned long long)len); + if (dmah) + return ERR_PTR(-EOPNOTSUPP); + if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) { siw_dbg_pd(pd, "too many mr's\n"); rv = -ENOMEM; diff --git a/drivers/infiniband/sw/siw/siw_verbs.h b/drivers/infiniband/sw/siw/siw_verbs.h index 1f1a305540af..e9f4463aecdc 100644 --- a/drivers/infiniband/sw/siw/siw_verbs.h +++ b/drivers/infiniband/sw/siw/siw_verbs.h @@ -65,7 +65,8 @@ int siw_destroy_cq(struct ib_cq *base_cq, struct ib_udata *udata); int siw_poll_cq(struct ib_cq *base_cq, int num_entries, struct ib_wc *wc); int siw_req_notify_cq(struct ib_cq *base_cq, enum ib_cq_notify_flags flags); struct ib_mr *siw_reg_user_mr(struct ib_pd *base_pd, u64 start, u64 len, - u64 rnic_va, int rights, struct ib_udata *udata); + u64 rnic_va, int rights, struct ib_dmah *dmah, + struct ib_udata *udata); struct ib_mr *siw_alloc_mr(struct ib_pd *base_pd, enum ib_mr_type mr_type, u32 max_sge); struct ib_mr *siw_get_dma_mr(struct ib_pd *base_pd, int rights); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 9ad253687935..6139223e92e4 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1885,6 +1885,7 @@ struct ib_mr { struct ib_dm *dm; struct ib_sig_attrs *sig_attrs; /* only for IB_MR_TYPE_INTEGRITY MRs */ + struct ib_dmah *dmah; /* * Implementation details of the RDMA core, don't use in drivers: */ @@ -2527,10 +2528,12 @@ struct ib_device_ops { struct ib_mr *(*get_dma_mr)(struct ib_pd *pd, int mr_access_flags); struct ib_mr *(*reg_user_mr)(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int mr_access_flags, + struct ib_dmah *dmah, struct ib_udata *udata); struct ib_mr *(*reg_user_mr_dmabuf)(struct ib_pd *pd, u64 offset, u64 length, u64 virt_addr, int fd, int mr_access_flags, + struct ib_dmah *dmah, struct uverbs_attr_bundle *attrs); struct ib_mr *(*rereg_user_mr)(struct ib_mr *mr, int flags, u64 start, u64 length, u64 virt_addr, diff --git a/include/uapi/rdma/ib_user_ioctl_cmds.h b/include/uapi/rdma/ib_user_ioctl_cmds.h index 3bb72a259c29..de6f5a94f1e3 100644 --- a/include/uapi/rdma/ib_user_ioctl_cmds.h +++ b/include/uapi/rdma/ib_user_ioctl_cmds.h @@ -311,6 +311,7 @@ enum uverbs_attrs_reg_dmabuf_mr_cmd_attr_ids { enum uverbs_attrs_reg_mr_cmd_attr_ids { UVERBS_ATTR_REG_MR_HANDLE, UVERBS_ATTR_REG_MR_PD_HANDLE, + UVERBS_ATTR_REG_MR_DMA_HANDLE, UVERBS_ATTR_REG_MR_IOVA, UVERBS_ATTR_REG_MR_ADDR, UVERBS_ATTR_REG_MR_LENGTH, -- cgit v1.2.3