diff options
author | Dennis Li <Dennis.Li@amd.com> | 2021-03-06 00:30:54 +0300 |
---|---|---|
committer | Alex Deucher <alexander.deucher@amd.com> | 2021-03-24 05:59:52 +0300 |
commit | 88f8575bca5fc70ba8608cfc49811f9b4d1eb6f9 (patch) | |
tree | 18987aeeed92f54b8cba5258eb2542641f667489 /drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c | |
parent | 4abc2567f0ebf1c7113430e25ee960408f5ebcb1 (diff) | |
download | linux-88f8575bca5fc70ba8608cfc49811f9b4d1eb6f9.tar.xz |
drm/amdgpu: enable watchdog feature for SQ of aldebaran
SQ's watchdog timer monitors forward progress, a mask of which waves
caused the watchdog timeout is recorded into ras status registers and
then trigger a system fatal error event.
v2:
1. change *query_timeout_status to *query_sq_timeout_status.
2. move query_sq_timeout_status into amdgpu_ras_do_recovery.
3. add module parameters to enable/disable fatal error event and modify
the watchdog timer.
v3:
1. remove unused parameters of *enable_watchdog_timer
Signed-off-by: Dennis Li <Dennis.Li@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c | 106 |
1 files changed, 106 insertions, 0 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c index b2e2026c3ec7..1faeae14ead9 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c @@ -1129,3 +1129,109 @@ void gfx_v9_4_2_query_ras_error_status(struct amdgpu_device *adev) gfx_v9_4_2_query_ea_err_status(adev); gfx_v9_4_2_query_utc_err_status(adev); } + +void gfx_v9_4_2_enable_watchdog_timer(struct amdgpu_device *adev) +{ + uint32_t i; + uint32_t data; + + data = REG_SET_FIELD(0, SQ_TIMEOUT_CONFIG, TIMEOUT_FATAL_DISABLE, + amdgpu_watchdog_timer.timeout_fatal_disable ? 1 : + 0); + data = REG_SET_FIELD(data, SQ_TIMEOUT_CONFIG, PERIOD_SEL, + amdgpu_watchdog_timer.period); + + mutex_lock(&adev->grbm_idx_mutex); + for (i = 0; i < adev->gfx.config.max_shader_engines; i++) { + gfx_v9_4_2_select_se_sh(adev, i, 0xffffffff, 0xffffffff); + WREG32_SOC15(GC, 0, regSQ_TIMEOUT_CONFIG, data); + } + gfx_v9_4_2_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff); + mutex_unlock(&adev->grbm_idx_mutex); +} + +static uint32_t wave_read_ind(struct amdgpu_device *adev, uint32_t simd, uint32_t wave, uint32_t address) +{ + WREG32_SOC15_RLC_EX(reg, GC, 0, regSQ_IND_INDEX, + (wave << SQ_IND_INDEX__WAVE_ID__SHIFT) | + (simd << SQ_IND_INDEX__SIMD_ID__SHIFT) | + (address << SQ_IND_INDEX__INDEX__SHIFT) | + (SQ_IND_INDEX__FORCE_READ_MASK)); + return RREG32_SOC15(GC, 0, regSQ_IND_DATA); +} + +static void gfx_v9_4_2_log_cu_timeout_status(struct amdgpu_device *adev, + uint32_t status) +{ + struct amdgpu_cu_info *cu_info = &adev->gfx.cu_info; + uint32_t i, simd, wave; + uint32_t wave_status; + uint32_t wave_pc_lo, wave_pc_hi; + uint32_t wave_exec_lo, wave_exec_hi; + uint32_t wave_inst_dw0, wave_inst_dw1; + uint32_t wave_ib_sts; + + for (i = 0; i < 32; i++) { + if (!((i << 1) & status)) + continue; + + simd = i / cu_info->max_waves_per_simd; + wave = i % cu_info->max_waves_per_simd; + + wave_status = wave_read_ind(adev, simd, wave, ixSQ_WAVE_STATUS); + wave_pc_lo = wave_read_ind(adev, simd, wave, ixSQ_WAVE_PC_LO); + wave_pc_hi = wave_read_ind(adev, simd, wave, ixSQ_WAVE_PC_HI); + wave_exec_lo = + wave_read_ind(adev, simd, wave, ixSQ_WAVE_EXEC_LO); + wave_exec_hi = + wave_read_ind(adev, simd, wave, ixSQ_WAVE_EXEC_HI); + wave_inst_dw0 = + wave_read_ind(adev, simd, wave, ixSQ_WAVE_INST_DW0); + wave_inst_dw1 = + wave_read_ind(adev, simd, wave, ixSQ_WAVE_INST_DW1); + wave_ib_sts = wave_read_ind(adev, simd, wave, ixSQ_WAVE_IB_STS); + + dev_info( + adev->dev, + "\t SIMD %d, Wave %d: status 0x%x, pc 0x%llx, exec 0x%llx, inst 0x%llx, ib_sts 0x%x\n", + simd, wave, wave_status, + ((uint64_t)wave_pc_hi << 32 | wave_pc_lo), + ((uint64_t)wave_exec_hi << 32 | wave_exec_lo), + ((uint64_t)wave_inst_dw1 << 32 | wave_inst_dw0), + wave_ib_sts); + } +} + +void gfx_v9_4_2_query_sq_timeout_status(struct amdgpu_device *adev) +{ + uint32_t se_idx, sh_idx, cu_idx; + uint32_t status; + + mutex_lock(&adev->grbm_idx_mutex); + for (se_idx = 0; se_idx < adev->gfx.config.max_shader_engines; + se_idx++) { + for (sh_idx = 0; sh_idx < adev->gfx.config.max_sh_per_se; + sh_idx++) { + for (cu_idx = 0; + cu_idx < adev->gfx.config.max_cu_per_sh; + cu_idx++) { + gfx_v9_4_2_select_se_sh(adev, se_idx, sh_idx, + cu_idx); + status = RREG32_SOC15(GC, 0, + regSQ_TIMEOUT_STATUS); + if (status != 0) { + dev_info( + adev->dev, + "GFX Watchdog Timeout: SE %d, SH %d, CU %d\n", + se_idx, sh_idx, cu_idx); + gfx_v9_4_2_log_cu_timeout_status( + adev, status); + } + /* clear old status */ + WREG32_SOC15(GC, 0, regSQ_TIMEOUT_STATUS, 0); + } + } + } + gfx_v9_4_2_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff); + mutex_unlock(&adev->grbm_idx_mutex); +}
\ No newline at end of file |