diff options
author | Dave Airlie <airlied@redhat.com> | 2025-03-10 01:19:25 +0300 |
---|---|---|
committer | Dave Airlie <airlied@redhat.com> | 2025-03-10 02:04:52 +0300 |
commit | 236f475d29f8e585a72fb6fac7f8bb4dc4b162b7 (patch) | |
tree | a5b2d1e4bbb4a8f808aa351f386c432e6cb2102a /drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | |
parent | d65a27f95f6ab236b1a788d9bc463d24a8b2aebe (diff) | |
parent | cf6d949a409e09539477d32dbe7c954e4852e744 (diff) | |
download | linux-236f475d29f8e585a72fb6fac7f8bb4dc4b162b7.tar.xz |
Merge tag 'amd-drm-next-6.15-2025-03-07' of https://gitlab.freedesktop.org/agd5f/linux into drm-next
amdgpu:
- Fix spelling typos
- RAS updates
- VCN 5.0.1 updates
- SubVP fixes
- DCN 4.0.1 fixes
- MSO DPCD fixes
- DIO encoder refactor
- PCON fixes
- Misc cleanups
- DMCUB fixes
- USB4 DP fixes
- DM cleanups
- Backlight cleanups and fixes
- Support platform backlight curves
- Misc code cleanups
- SMU 14 fixes
- JPEG 4.0.3 reset updates
- SR-IOV fixes
- SVM fixes
- GC 12 DCC fixes
- DC DCE 6.x fix
- Hiberation fix
amdkfd:
- Fix possible NULL pointer in queue validation
- Remove unnecessary CP domain validation
- SDMA queue reset support
- Add per process flags
radeon:
- Fix spelling typos
- RS400 hyperZ fix
UAPI:
- Add KFD per process flags for setting precision
Proposed user space: https://github.com/ROCm/ROCR-Runtime/commit/2a64fa5e06e80e0af36df4ce0c76ae52eeec0a9d
Signed-off-by: Dave Airlie <airlied@redhat.com>
From: Alex Deucher <alexander.deucher@amd.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20250307211051.1880472-1-alexander.deucher@amd.com
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_job.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 30 |
1 files changed, 24 insertions, 6 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index 1899c601c95c..1d26be3c6d9d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -130,29 +130,47 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) amdgpu_vm_put_task_info(ti); } - dma_fence_set_error(&s_job->s_fence->finished, -ETIME); - /* attempt a per ring reset */ - if (amdgpu_gpu_recovery && - ring->funcs->reset) { + if (unlikely(adev->debug_disable_gpu_ring_reset)) { + dev_err(adev->dev, "Ring reset disabled by debug mask\n"); + } else if (amdgpu_gpu_recovery && ring->funcs->reset) { + bool is_guilty; + dev_err(adev->dev, "Starting %s ring reset\n", s_job->sched->name); /* stop the scheduler, but don't mess with the * bad job yet because if ring reset fails * we'll fall back to full GPU reset. */ drm_sched_wqueue_stop(&ring->sched); + + /* for engine resets, we need to reset the engine, + * but individual queues may be unaffected. + * check here to make sure the accounting is correct. + */ + if (ring->funcs->is_guilty) + is_guilty = ring->funcs->is_guilty(ring); + else + is_guilty = true; + + if (is_guilty) + dma_fence_set_error(&s_job->s_fence->finished, -ETIME); + r = amdgpu_ring_reset(ring, job->vmid); if (!r) { if (amdgpu_ring_sched_ready(ring)) drm_sched_stop(&ring->sched, s_job); - atomic_inc(&ring->adev->gpu_reset_counter); - amdgpu_fence_driver_force_completion(ring); + if (is_guilty) { + atomic_inc(&ring->adev->gpu_reset_counter); + amdgpu_fence_driver_force_completion(ring); + } if (amdgpu_ring_sched_ready(ring)) drm_sched_start(&ring->sched, 0); + dev_err(adev->dev, "Ring %s reset succeeded\n", ring->sched.name); goto exit; } dev_err(adev->dev, "Ring %s reset failure\n", ring->sched.name); } + dma_fence_set_error(&s_job->s_fence->finished, -ETIME); if (amdgpu_device_should_recover_gpu(ring->adev)) { struct amdgpu_reset_context reset_context; |