summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Hawking Zhang <Hawking.Zhang@amd.com> 2024-04-16 09:25:26 +0300
committer Alex Deucher <alexander.deucher@amd.com> 2024-04-19 06:46:23 +0300
commit 5e984b0a3d2a5e0e27cb6c194058d6d9859911d2 (patch)
tree 083f9d99ce9aea3c96179be6c469b36705032d09
parent 5adcd78fa2bcc458f9786067bcf4a15f9a3f49c9 (diff)
download linux-5e984b0a3d2a5e0e27cb6c194058d6d9859911d2.tar.xz
drm/amdgpu: Use driver mode reset for data poison
mode-2 reset is the only reliable method that can get GC/SDMA back when poison is consumed. mmhub requires mode-1 reset.

Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
-rw-r--r-- drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c 27
1 file changed, 8 insertions, 19 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index c368c70df3f4..c3beb872adf8 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -144,7 +144,7 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
uint16_t pasid, uint16_t client_id)
{
enum amdgpu_ras_block block = 0;
- int old_poison, ret = -EINVAL;
+ int old_poison;
uint32_t reset = 0;
struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
@@ -163,17 +163,13 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
case SOC15_IH_CLIENTID_SE2SH:
case SOC15_IH_CLIENTID_SE3SH:
case SOC15_IH_CLIENTID_UTCL2:
- ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
block = AMDGPU_RAS_BLOCK__GFX;
- if (ret)
- reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
+ reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
break;
case SOC15_IH_CLIENTID_VMC:
case SOC15_IH_CLIENTID_VMC1:
- ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
block = AMDGPU_RAS_BLOCK__MMHUB;
- if (ret)
- reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
+ reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
break;
case SOC15_IH_CLIENTID_SDMA0:
case SOC15_IH_CLIENTID_SDMA1:
@@ -184,22 +180,15 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
break;
default:
- break;
+ dev_warn(dev->adev->dev,
+ "client %d does not support poison consumption\n", client_id);
+ return;
}
kfd_signal_poison_consumed_event(dev, pasid);
- /* resetting queue passes, do page retirement without gpu reset
- * resetting queue fails, fallback to gpu reset solution
- */
- if (!ret)
- dev_warn(dev->adev->dev,
- "RAS poison consumption, unmap queue flow succeeded: client id %d\n",
- client_id);
- else
- dev_warn(dev->adev->dev,
- "RAS poison consumption, fall back to gpu reset flow: client id %d\n",
- client_id);
+ dev_warn(dev->adev->dev,
+ "poison is consumed by client %d, kick off gpu reset flow\n", client_id);
amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, reset);
}