From a789761de3053d25f03787ac40897dbea14ee368 Mon Sep 17 00:00:00 2001 From: Benjamin Welton Date: Mon, 9 Feb 2026 00:42:00 +0800 Subject: amd/amdkfd: Add kfd_ioctl_profiler to contain profiler kernel driver changes kfd_ioctl_profiler takes a similar approach to that of kfd_ioctl_dbg_trap (which contains debugger-related IOCTL services) where kfd_ioctl_profiler will contain all profiler-related IOCTL services. The IOCTL is designed to be expanded as needed to support additional profiler functionality. The current functionality of the IOCTL is to allow for profilers which need PMC counters from GPU devices to both signal to other profilers that may be on the system that the device has active PMC profiling taking place on it (multiple PMC profilers on the same device can result in corrupted counter data) and to set up the device to allow for the collection of SQ PMC data on all queues on the device. For PMC data for the SQ block (such as SQ_WAVES) to be available to a profiler, mmPERFCOUNT_ENABLE must be set on the queues. When profiling a single process, the profiler can inject PM4 packets into each queue to turn on PERFCOUNT_ENABLE. When profiling system-wide, the profiler does not have this option and must have a way to turn on profiling for queues into which it cannot inject packets directly. Accomplishing this requires a few steps: 1. Checking if the user has the necessary permissions to profile system-wide on the device. This check uses the same check that Linux perf uses to determine if a user has the necessary permissions to profile at this scope (primarily if the process has CAP_SYS_PERFMON or is root). 2. Locking the device for profiling. This is done by setting a lock bit on the device struct and storing the process that locked the device. 3. Iterating over all queues on the device and issuing an MQD Update to enable perfcounting on the queues. 4. Cleaning up if the process exits or releases the lock. 
The IOCTL also contains a link to the existing PC Sampling IOCTL. This is per a suggestion that we should potentially remove the PC Sampling IOCTL to have it be a part of the profiler IOCTL. This is a future change. In addition, we do expect to expand the profiler IOCTL to include additional profiler functionality in the future (which necessitates the use of a version number). v2: squash in proper IOCTL number Proposed userspace support: https://github.com/ROCm/rocm-systems/commit/40abc95a6463a61bb318a67efd6d9cc3e5ee8839 Signed-off-by: Benjamin Welton Signed-off-by: Perry Yuan Acked-by: Kent Russell Reviewed-by: Yifan Zhang Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- include/uapi/linux/kfd_ioctl.h | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index e72359370857..cc3ed0765c83 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -1558,6 +1558,29 @@ struct kfd_ioctl_dbg_trap_args { }; }; +#define KFD_IOC_PROFILER_VERSION_NUM 1 +enum kfd_profiler_ops { + KFD_IOC_PROFILER_PMC = 0, + KFD_IOC_PROFILER_VERSION = 2, +}; + +/** + * Enables/Disables GPU Specific profiler settings + */ +struct kfd_ioctl_pmc_settings { + __u32 gpu_id; /* This is the user_gpu_id */ + __u32 lock; /* Lock GPU for Profiling */ + __u32 perfcount_enable; /* Force Perfcount Enable for queues on GPU */ +}; + +struct kfd_ioctl_profiler_args { + __u32 op; /* kfd_profiler_op */ + union { + struct kfd_ioctl_pmc_settings pmc; + __u32 version; /* KFD_IOC_PROFILER_VERSION_NUM */ + }; +}; + #define AMDKFD_IOCTL_BASE 'K' #define AMDKFD_IO(nr) _IO(AMDKFD_IOCTL_BASE, nr) #define AMDKFD_IOR(nr, type) _IOR(AMDKFD_IOCTL_BASE, nr, type) @@ -1681,7 +1704,10 @@ struct kfd_ioctl_dbg_trap_args { #define AMDKFD_IOC_CREATE_PROCESS \ AMDKFD_IO(0x27) +#define AMDKFD_IOC_PROFILER \ + AMDKFD_IOWR(0x28, struct kfd_ioctl_profiler_args) 
+ #define AMDKFD_COMMAND_START 0x01 -#define AMDKFD_COMMAND_END 0x28 +#define AMDKFD_COMMAND_END 0x29 #endif -- cgit v1.2.3 From dd61e27535a6f5cfb32a847b282d2e3d5aebf46f Mon Sep 17 00:00:00 2001 From: Perry Yuan Date: Mon, 9 Feb 2026 00:42:07 +0800 Subject: drm/amdkfd: Add PTL control IOCTL Option and unify refcount logic Introduce a new IOCTL option to allow userspace explicit control over the Peak Tops Limiter (PTL) state for profiling Link: https://github.com/ROCm/rocm-systems/tree/develop/projects/rocprofiler-sdk Signed-off-by: Perry Yuan Reviewed-by: Yifan Zhang Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 2 + drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 102 +++++++++++++++++++++++++++++++ drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 8 +++ drivers/gpu/drm/amd/amdkfd/kfd_process.c | 4 ++ drivers/gpu/drm/amd/include/amdgpu_ptl.h | 2 + include/uapi/linux/kfd_ioctl.h | 7 +++ 6 files changed, 125 insertions(+) (limited to 'include/uapi/linux') diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c index 467a3dbe1bfa..aab6a4de54fa 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c @@ -2400,6 +2400,8 @@ static int gfx_v9_4_3_perf_monitor_ptl_init(struct amdgpu_device *adev, bool ena ptl->hw_supported = true; + atomic_set(&ptl->disable_ref, 0); + return 0; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index fc00d0418684..883de31df04d 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -1774,6 +1774,104 @@ static int kfd_ioctl_svm(struct file *filep, struct kfd_process *p, void *data) } #endif +static int kfd_ptl_control(struct kfd_process_device *pdd, bool enable) +{ + struct amdgpu_device *adev = pdd->dev->adev; + struct amdgpu_ptl *ptl = &adev->psp.ptl; + enum amdgpu_ptl_fmt pref_format1 = ptl->fmt1; + enum amdgpu_ptl_fmt 
pref_format2 = ptl->fmt2; + uint32_t ptl_state = enable ? 1 : 0; + int ret; + + if (!ptl->hw_supported) + return -EOPNOTSUPP; + + if (!pdd->dev->kfd2kgd || !pdd->dev->kfd2kgd->ptl_ctrl) + return -EOPNOTSUPP; + + ret = pdd->dev->kfd2kgd->ptl_ctrl(adev, PSP_PTL_PERF_MON_SET, + &ptl_state, + &pref_format1, + &pref_format2); + return ret; +} + +int kfd_ptl_disable_request(struct kfd_process_device *pdd, + struct kfd_process *p) +{ + struct amdgpu_device *adev = pdd->dev->adev; + struct amdgpu_ptl *ptl = &adev->psp.ptl; + int ret = 0; + + mutex_lock(&ptl->mutex); + + if (pdd->ptl_disable_req) + goto out; + + if (atomic_inc_return(&ptl->disable_ref) == 1) { + ret = kfd_ptl_control(pdd, false); + if (ret) { + atomic_dec(&ptl->disable_ref); + dev_warn(pdd->dev->adev->dev, + "failed to disable PTL\n"); + goto out; + } + } + pdd->ptl_disable_req = true; + +out: + mutex_unlock(&ptl->mutex); + return ret; +} + +int kfd_ptl_disable_release(struct kfd_process_device *pdd, + struct kfd_process *p) +{ + struct amdgpu_device *adev = pdd->dev->adev; + struct amdgpu_ptl *ptl = &adev->psp.ptl; + int ret = 0; + + mutex_lock(&ptl->mutex); + + if (!pdd->ptl_disable_req) + goto out; + + if (atomic_dec_return(&ptl->disable_ref) == 0) { + ret = kfd_ptl_control(pdd, true); + if (ret) { + atomic_inc(&ptl->disable_ref); + dev_warn(adev->dev, "Failed to enable PTL on release: %d\n", ret); + goto out; + } + } + pdd->ptl_disable_req = false; + +out: + mutex_unlock(&ptl->mutex); + return ret; +} + +static int kfd_profiler_ptl_control(struct kfd_process *p, + struct kfd_ioctl_ptl_control *args) +{ + struct kfd_process_device *pdd; + int ret; + + mutex_lock(&p->mutex); + pdd = kfd_process_device_data_by_id(p, args->gpu_id); + mutex_unlock(&p->mutex); + + if (!pdd || !pdd->dev || !pdd->dev->kfd) + return -EINVAL; + + if (args->enable == 0) + ret = kfd_ptl_disable_request(pdd, p); + else + ret = kfd_ptl_disable_release(pdd, p); + + return ret; +} + static int criu_checkpoint_process(struct kfd_process 
*p, uint8_t __user *user_priv_data, uint64_t *priv_offset) @@ -3242,6 +3340,7 @@ static inline uint32_t profile_lock_device(struct kfd_process *p, if (!kfd->profiler_process) { kfd->profiler_process = p; status = 0; + kfd_ptl_disable_request(pdd, p); } else if (kfd->profiler_process == p) { status = -EALREADY; } else { @@ -3250,6 +3349,7 @@ static inline uint32_t profile_lock_device(struct kfd_process *p, } else if (op == 0 && kfd->profiler_process == p) { kfd->profiler_process = NULL; status = 0; + kfd_ptl_disable_release(pdd, p); } mutex_unlock(&kfd->profiler_lock); @@ -3292,6 +3392,8 @@ static int kfd_ioctl_profiler(struct file *filep, struct kfd_process *p, void *d return 0; case KFD_IOC_PROFILER_PMC: return kfd_profiler_pmc(p, &args->pmc); + case KFD_IOC_PROFILER_PTL_CONTROL: + return kfd_profiler_ptl_control(p, &args->ptl); } return -EINVAL; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index 903386e0740b..482bcfa10f82 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -872,6 +872,8 @@ struct kfd_process_device { bool has_reset_queue; u32 pasid; + /* Indicates this process has requested PTL stay disabled */ + bool ptl_disable_req; }; #define qpd_to_pdd(x) container_of(x, struct kfd_process_device, qpd) @@ -1603,6 +1605,12 @@ static inline bool kfd_is_first_node(struct kfd_node *node) return (node == node->kfd->nodes[0]); } +/* PTL support */ +int kfd_ptl_disable_request(struct kfd_process_device *pdd, + struct kfd_process *p); +int kfd_ptl_disable_release(struct kfd_process_device *pdd, + struct kfd_process *p); + /* Debugfs */ #if defined(CONFIG_DEBUG_FS) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index 1a8cb512dfe3..368283d53077 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -1128,6 +1128,10 @@ static void kfd_process_destroy_pdds(struct kfd_process *p) 
pr_debug("Releasing pdd (topology id %d, for pid %d)\n", pdd->dev->id, p->lead_thread->pid); kfd_process_profiler_release(p, pdd); + + if (pdd->ptl_disable_req) + kfd_ptl_disable_release(pdd, p); + kfd_process_device_destroy_cwsr_dgpu(pdd); kfd_process_device_destroy_ib_mem(pdd); diff --git a/drivers/gpu/drm/amd/include/amdgpu_ptl.h b/drivers/gpu/drm/amd/include/amdgpu_ptl.h index ffed443a14ae..9e63a9a9680a 100644 --- a/drivers/gpu/drm/amd/include/amdgpu_ptl.h +++ b/drivers/gpu/drm/amd/include/amdgpu_ptl.h @@ -39,6 +39,8 @@ struct amdgpu_ptl { enum amdgpu_ptl_fmt fmt2; bool enabled; bool hw_supported; + /* PTL disable reference counting */ + atomic_t disable_ref; struct mutex mutex; }; diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index cc3ed0765c83..1a94d512df35 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -1562,6 +1562,7 @@ struct kfd_ioctl_dbg_trap_args { enum kfd_profiler_ops { KFD_IOC_PROFILER_PMC = 0, KFD_IOC_PROFILER_VERSION = 2, + KFD_IOC_PROFILER_PTL_CONTROL = 3, }; /** @@ -1573,10 +1574,16 @@ struct kfd_ioctl_pmc_settings { __u32 perfcount_enable; /* Force Perfcount Enable for queues on GPU */ }; +struct kfd_ioctl_ptl_control { + __u32 gpu_id; /* user_gpu_id */ + __u32 enable; /* set 1 to enable PTL, set 0 to disable PTL */ +}; + struct kfd_ioctl_profiler_args { __u32 op; /* kfd_profiler_op */ union { struct kfd_ioctl_pmc_settings pmc; + struct kfd_ioctl_ptl_control ptl; __u32 version; /* KFD_IOC_PROFILER_VERSION_NUM */ }; }; -- cgit v1.2.3 From c62c076d2d64ead542c961cabed0f9467d7d6026 Mon Sep 17 00:00:00 2001 From: Perry Yuan Date: Wed, 15 Apr 2026 10:34:03 +0800 Subject: drm/amdkfd: bump KFD ioctl minor version to 1.23 Bump `KFD_IOCTL_MINOR_VERSION` from 22 to 23 and document version 1.23 in `kfd_ioctl.h` so userspace can detect profiler ioctl support. 
Signed-off-by: Perry Yuan Suggested-by: Alex Deucher Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- include/uapi/linux/kfd_ioctl.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h index 1a94d512df35..9584b5aab727 100644 --- a/include/uapi/linux/kfd_ioctl.h +++ b/include/uapi/linux/kfd_ioctl.h @@ -48,9 +48,10 @@ * - 1.20 - Trap handler support for expert scheduling mode available * - 1.21 - Debugger support to subscribe to LDS out-of-address exceptions * - 1.22 - Add queue creation with metadata ring base address + * - 1.23 - Add profiler control ioctl to enable/disable profiler on a process */ #define KFD_IOCTL_MAJOR_VERSION 1 -#define KFD_IOCTL_MINOR_VERSION 22 +#define KFD_IOCTL_MINOR_VERSION 23 struct kfd_ioctl_get_version_args { __u32 major_version; /* from KFD */ -- cgit v1.2.3 From 7b5121c3374e24c8f6490b54f347eb06ee16028c Mon Sep 17 00:00:00 2001 From: Sergio Lopez Date: Tue, 28 Apr 2026 21:44:48 +0200 Subject: drm/virtio: support VIRTIO_GPU_F_BLOB_ALIGNMENT Support VIRTIO_GPU_F_BLOB_ALIGNMENT, a feature that indicates the device provides a valid blob_alignment field in its configuration, and that both RESOURCE_CREATE_BLOB and RESOURCE_MAP_BLOB requests must be aligned to that value. 
Signed-off-by: Sergio Lopez Signed-off-by: Dmitry Osipenko Link: https://patch.msgid.link/20260428194450.518296-2-slp@redhat.com --- drivers/gpu/drm/virtio/virtgpu_drv.c | 1 + drivers/gpu/drm/virtio/virtgpu_drv.h | 2 ++ drivers/gpu/drm/virtio/virtgpu_kms.c | 14 +++++++++++--- include/uapi/linux/virtio_gpu.h | 9 +++++++++ 4 files changed, 23 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/gpu/drm/virtio/virtgpu_drv.c b/drivers/gpu/drm/virtio/virtgpu_drv.c index a5ce96fb8a1d..812ee3f5e4aa 100644 --- a/drivers/gpu/drm/virtio/virtgpu_drv.c +++ b/drivers/gpu/drm/virtio/virtgpu_drv.c @@ -163,6 +163,7 @@ static unsigned int features[] = { VIRTIO_GPU_F_RESOURCE_UUID, VIRTIO_GPU_F_RESOURCE_BLOB, VIRTIO_GPU_F_CONTEXT_INIT, + VIRTIO_GPU_F_BLOB_ALIGNMENT, }; static struct virtio_driver virtio_gpu_driver = { .feature_table = features, diff --git a/drivers/gpu/drm/virtio/virtgpu_drv.h b/drivers/gpu/drm/virtio/virtgpu_drv.h index 6f49213e23f8..04fe15d877cd 100644 --- a/drivers/gpu/drm/virtio/virtgpu_drv.h +++ b/drivers/gpu/drm/virtio/virtgpu_drv.h @@ -258,6 +258,7 @@ struct virtio_gpu_device { bool has_resource_blob; bool has_host_visible; bool has_context_init; + bool has_blob_alignment; struct virtio_shm_region host_visible_region; struct drm_mm host_visible_mm; @@ -271,6 +272,7 @@ struct virtio_gpu_device { uint32_t num_capsets; uint64_t capset_id_mask; struct list_head cap_cache; + uint32_t blob_alignment; /* protects uuid state when exporting */ spinlock_t resource_export_lock; diff --git a/drivers/gpu/drm/virtio/virtgpu_kms.c b/drivers/gpu/drm/virtio/virtgpu_kms.c index 80ba69b4860b..cfde9f573df6 100644 --- a/drivers/gpu/drm/virtio/virtgpu_kms.c +++ b/drivers/gpu/drm/virtio/virtgpu_kms.c @@ -124,7 +124,7 @@ int virtio_gpu_init(struct virtio_device *vdev, struct drm_device *dev) struct virtio_gpu_device *vgdev; /* this will expand later */ struct virtqueue *vqs[2]; - u32 num_scanouts, num_capsets; + u32 num_scanouts, num_capsets, 
blob_alignment; int ret = 0; if (!virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) @@ -198,14 +198,22 @@ int virtio_gpu_init(struct virtio_device *vdev, struct drm_device *dev) if (virtio_has_feature(vgdev->vdev, VIRTIO_GPU_F_CONTEXT_INIT)) vgdev->has_context_init = true; + if (virtio_has_feature(vgdev->vdev, VIRTIO_GPU_F_BLOB_ALIGNMENT)) { + vgdev->has_blob_alignment = true; + virtio_cread_le(vgdev->vdev, struct virtio_gpu_config, + blob_alignment, &blob_alignment); + vgdev->blob_alignment = blob_alignment; + } + DRM_INFO("features: %cvirgl %cedid %cresource_blob %chost_visible", vgdev->has_virgl_3d ? '+' : '-', vgdev->has_edid ? '+' : '-', vgdev->has_resource_blob ? '+' : '-', vgdev->has_host_visible ? '+' : '-'); - DRM_INFO("features: %ccontext_init\n", - vgdev->has_context_init ? '+' : '-'); + DRM_INFO("features: %ccontext_init %cblob_alignment\n", + vgdev->has_context_init ? '+' : '-', + vgdev->has_blob_alignment ? '+' : '-'); ret = virtio_find_vqs(vgdev->vdev, 2, vqs, vqs_info, NULL); if (ret) { diff --git a/include/uapi/linux/virtio_gpu.h b/include/uapi/linux/virtio_gpu.h index be109777d10d..4f530d90058c 100644 --- a/include/uapi/linux/virtio_gpu.h +++ b/include/uapi/linux/virtio_gpu.h @@ -64,6 +64,14 @@ * context_init and multiple timelines */ #define VIRTIO_GPU_F_CONTEXT_INIT 4 +/* + * The device provides a valid blob_alignment + * field in its configuration and both + * VIRTIO_GPU_CMD_RESOURCE_CREATE_BLOB and + * VIRTIO_GPU_CMD_RESOURCE_MAP_BLOB requests + * must be aligned to that value. + */ +#define VIRTIO_GPU_F_BLOB_ALIGNMENT 5 enum virtio_gpu_ctrl_type { VIRTIO_GPU_UNDEFINED = 0, @@ -365,6 +373,7 @@ struct virtio_gpu_config { __le32 events_clear; __le32 num_scanouts; __le32 num_capsets; + __le32 blob_alignment; }; /* simple formats for fbcon/X use */ -- cgit v1.2.3