summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
diff options
context:
space:
mode:
authorJesse.Zhang <Jesse.Zhang@amd.com>2025-09-04 04:39:34 +0300
committerAlex Deucher <alexander.deucher@amd.com>2025-09-06 00:38:31 +0300
commit78e1222fbf0026de99420bc35c6c12931e0f36ee (patch)
tree06b6aa848f3ddf39e78311fe2fe315470fb2242e /drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
parent6abd725fdfc418abdf3aae0a1b19cb483305c723 (diff)
downloadlinux-78e1222fbf0026de99420bc35c6c12931e0f36ee.tar.xz
drm/amdgpu/mes: add front end for detect and reset hung queue
Helper function to detect and reset hung queues. MES will return an array of doorbell indices of which queues are hung and were optionally reset. v2: Clear the doorbell array before detection Reviewed-by: Alex Deucher <alexander.deucher@amd.com> Signed-off-by: Jesse Zhang <Jesse.Zhang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c65
1 files changed, 65 insertions, 0 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
index 135598502c8d..5bf9be073cdd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
@@ -191,6 +191,20 @@ int amdgpu_mes_init(struct amdgpu_device *adev)
if (r)
goto error_doorbell;
+ if (adev->mes.hung_queue_db_array_size) {
+ r = amdgpu_bo_create_kernel(adev,
+ adev->mes.hung_queue_db_array_size * sizeof(u32),
+ PAGE_SIZE,
+ AMDGPU_GEM_DOMAIN_GTT,
+ &adev->mes.hung_queue_db_array_gpu_obj,
+ &adev->mes.hung_queue_db_array_gpu_addr,
+ &adev->mes.hung_queue_db_array_cpu_addr);
+ if (r) {
+ dev_warn(adev->dev, "failed to create MES hung db array buffer (%d)", r);
+ goto error_doorbell;
+ }
+ }
+
return 0;
error_doorbell:
@@ -216,6 +230,10 @@ void amdgpu_mes_fini(struct amdgpu_device *adev)
{
int i;
+ amdgpu_bo_free_kernel(&adev->mes.hung_queue_db_array_gpu_obj,
+ &adev->mes.hung_queue_db_array_gpu_addr,
+ &adev->mes.hung_queue_db_array_cpu_addr);
+
amdgpu_bo_free_kernel(&adev->mes.event_log_gpu_obj,
&adev->mes.event_log_gpu_addr,
&adev->mes.event_log_cpu_addr);
@@ -366,6 +384,53 @@ int amdgpu_mes_reset_legacy_queue(struct amdgpu_device *adev,
return r;
}
+int amdgpu_mes_get_hung_queue_db_array_size(struct amdgpu_device *adev)
+{
+ return adev->mes.hung_queue_db_array_size;
+}
+
+int amdgpu_mes_detect_and_reset_hung_queues(struct amdgpu_device *adev,
+ int queue_type,
+ bool detect_only,
+ unsigned int *hung_db_num,
+ u32 *hung_db_array)
+
+{
+ struct mes_detect_and_reset_queue_input input;
+ u32 *db_array = adev->mes.hung_queue_db_array_cpu_addr;
+ int r, i;
+
+ if (!hung_db_num || !hung_db_array)
+ return -EINVAL;
+
+ if ((queue_type != AMDGPU_RING_TYPE_GFX) &&
+ (queue_type != AMDGPU_RING_TYPE_COMPUTE) &&
+ (queue_type != AMDGPU_RING_TYPE_SDMA))
+ return -EINVAL;
+
+ /* Clear the doorbell array before detection */
+ memset(adev->mes.hung_queue_db_array_cpu_addr, 0,
+ adev->mes.hung_queue_db_array_size * sizeof(u32));
+ input.queue_type = queue_type;
+ input.detect_only = detect_only;
+
+ r = adev->mes.funcs->detect_and_reset_hung_queues(&adev->mes,
+ &input);
+ if (r) {
+ dev_err(adev->dev, "failed to detect and reset\n");
+ } else {
+ *hung_db_num = 0;
+ for (i = 0; i < adev->mes.hung_queue_db_array_size; i++) {
+ if (db_array[i] != AMDGPU_MES_INVALID_DB_OFFSET) {
+ hung_db_array[i] = db_array[i];
+ *hung_db_num += 1;
+ }
+ }
+ }
+
+ return r;
+}
+
uint32_t amdgpu_mes_rreg(struct amdgpu_device *adev, uint32_t reg)
{
struct mes_misc_op_input op_input;