diff options
author | Lijo Lazar <lijo.lazar@amd.com> | 2024-02-22 11:46:57 +0300 |
---|---|---|
committer | Alex Deucher <alexander.deucher@amd.com> | 2024-02-26 19:14:24 +0300 |
commit | 1b6ef74b2b03b54776778476f8adf87dd4f8beb1 (patch) | |
tree | 94795ef19a86666ce2be38638096f85f27aecb2c /drivers/gpu/drm/amd/amdgpu | |
parent | 34b811a281bab42d09592fdaa6885f4f41352bd3 (diff) | |
download | linux-1b6ef74b2b03b54776778476f8adf87dd4f8beb1.tar.xz |
drm/amdgpu: Add fatal error detected flag
For a RAS error that needs a full reset to recover, set the fatal error
status. Clear the status once the device is reset.
Signed-off-by: Lijo Lazar <lijo.lazar@amd.com>
Reviewed-by: Asad Kamal <asad.kamal@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 1 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 32 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 6 |
3 files changed, 39 insertions, 0 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index b0ea4ddc8e72..d88309228a26 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -5321,6 +5321,7 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle, list_for_each_entry(tmp_adev, device_list_handle, reset_list) { if (need_full_reset) { /* post card */ + amdgpu_ras_set_fed(tmp_adev, false); r = amdgpu_device_asic_init(tmp_adev); if (r) { dev_warn(tmp_adev->dev, "asic atom init failed!"); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 46f3d1013e8c..2c94de305c69 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2439,6 +2439,18 @@ static void amdgpu_ras_do_recovery(struct work_struct *work) ras->gpu_reset_flags &= ~AMDGPU_RAS_GPU_RESET_MODE1_RESET; set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); + /* For any RAS error that needs a full reset to + * recover, set the fatal error status + */ + if (hive) { + list_for_each_entry(remote_adev, + &hive->device_list, + gmc.xgmi.head) + amdgpu_ras_set_fed(remote_adev, + true); + } else { + amdgpu_ras_set_fed(adev, true); + } psp_fatal_error_recovery_quirk(&adev->psp); } } @@ -3440,6 +3452,26 @@ int amdgpu_ras_fini(struct amdgpu_device *adev) return 0; } +bool amdgpu_ras_get_fed_status(struct amdgpu_device *adev) +{ + struct amdgpu_ras *ras; + + ras = amdgpu_ras_get_context(adev); + if (!ras) + return false; + + return atomic_read(&ras->fed); +} + +void amdgpu_ras_set_fed(struct amdgpu_device *adev, bool status) +{ + struct amdgpu_ras *ras; + + ras = amdgpu_ras_get_context(adev); + if (ras) + atomic_set(&ras->fed, !!status); +} + void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev) { if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index d10e5bb0e52f..e0f8ce9d8440 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -477,6 +477,8 @@ struct amdgpu_ras { wait_queue_head_t page_retirement_wq; struct mutex page_retirement_lock; atomic_t page_retirement_req_cnt; + /* Fatal error detected flag */ + atomic_t fed; }; struct ras_fs_data { @@ -873,4 +875,8 @@ void amdgpu_ras_add_mca_err_addr(struct ras_err_info *err_info, void amdgpu_ras_del_mca_err_addr(struct ras_err_info *err_info, struct ras_err_addr *mca_err_addr); + +void amdgpu_ras_set_fed(struct amdgpu_device *adev, bool status); +bool amdgpu_ras_get_fed_status(struct amdgpu_device *adev); + #endif |