summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
diff options
context:
space:
mode:
authorDave Airlie <airlied@redhat.com>2025-03-10 01:19:25 +0300
committerDave Airlie <airlied@redhat.com>2025-03-10 02:04:52 +0300
commit236f475d29f8e585a72fb6fac7f8bb4dc4b162b7 (patch)
treea5b2d1e4bbb4a8f808aa351f386c432e6cb2102a /drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
parentd65a27f95f6ab236b1a788d9bc463d24a8b2aebe (diff)
parentcf6d949a409e09539477d32dbe7c954e4852e744 (diff)
downloadlinux-236f475d29f8e585a72fb6fac7f8bb4dc4b162b7.tar.xz
Merge tag 'amd-drm-next-6.15-2025-03-07' of https://gitlab.freedesktop.org/agd5f/linux into drm-next
amdgpu: - Fix spelling typos - RAS updates - VCN 5.0.1 updates - SubVP fixes - DCN 4.0.1 fixes - MSO DPCD fixes - DIO encoder refactor - PCON fixes - Misc cleanups - DMCUB fixes - USB4 DP fixes - DM cleanups - Backlight cleanups and fixes - Support platform backlight curves - Misc code cleanups - SMU 14 fixes - JPEG 4.0.3 reset updates - SR-IOV fixes - SVM fixes - GC 12 DCC fixes - DC DCE 6.x fix - Hiberation fix amdkfd: - Fix possible NULL pointer in queue validation - Remove unnecessary CP domain validation - SDMA queue reset support - Add per process flags radeon: - Fix spelling typos - RS400 hyperZ fix UAPI: - Add KFD per process flags for setting precision Proposed user space: https://github.com/ROCm/ROCR-Runtime/commit/2a64fa5e06e80e0af36df4ce0c76ae52eeec0a9d Signed-off-by: Dave Airlie <airlied@redhat.com> From: Alex Deucher <alexander.deucher@amd.com> Link: https://patchwork.freedesktop.org/patch/msgid/20250307211051.1880472-1-alexander.deucher@amd.com
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_job.c')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_job.c30
1 files changed, 24 insertions, 6 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index 1899c601c95c..1d26be3c6d9d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -130,29 +130,47 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
amdgpu_vm_put_task_info(ti);
}
- dma_fence_set_error(&s_job->s_fence->finished, -ETIME);
-
/* attempt a per ring reset */
- if (amdgpu_gpu_recovery &&
- ring->funcs->reset) {
+ if (unlikely(adev->debug_disable_gpu_ring_reset)) {
+ dev_err(adev->dev, "Ring reset disabled by debug mask\n");
+ } else if (amdgpu_gpu_recovery && ring->funcs->reset) {
+ bool is_guilty;
+
dev_err(adev->dev, "Starting %s ring reset\n", s_job->sched->name);
/* stop the scheduler, but don't mess with the
* bad job yet because if ring reset fails
* we'll fall back to full GPU reset.
*/
drm_sched_wqueue_stop(&ring->sched);
+
+ /* for engine resets, we need to reset the engine,
+ * but individual queues may be unaffected.
+ * check here to make sure the accounting is correct.
+ */
+ if (ring->funcs->is_guilty)
+ is_guilty = ring->funcs->is_guilty(ring);
+ else
+ is_guilty = true;
+
+ if (is_guilty)
+ dma_fence_set_error(&s_job->s_fence->finished, -ETIME);
+
r = amdgpu_ring_reset(ring, job->vmid);
if (!r) {
if (amdgpu_ring_sched_ready(ring))
drm_sched_stop(&ring->sched, s_job);
- atomic_inc(&ring->adev->gpu_reset_counter);
- amdgpu_fence_driver_force_completion(ring);
+ if (is_guilty) {
+ atomic_inc(&ring->adev->gpu_reset_counter);
+ amdgpu_fence_driver_force_completion(ring);
+ }
if (amdgpu_ring_sched_ready(ring))
drm_sched_start(&ring->sched, 0);
+ dev_err(adev->dev, "Ring %s reset succeeded\n", ring->sched.name);
goto exit;
}
dev_err(adev->dev, "Ring %s reset failure\n", ring->sched.name);
}
+ dma_fence_set_error(&s_job->s_fence->finished, -ETIME);
if (amdgpu_device_should_recover_gpu(ring->adev)) {
struct amdgpu_reset_context reset_context;