diff options
author | Yunxiang Li <Yunxiang.Li@amd.com> | 2024-04-22 21:44:38 +0300 |
---|---|---|
committer | Alex Deucher <alexander.deucher@amd.com> | 2024-05-02 22:40:50 +0300 |
commit | 25c01191c2555351922e5515b6b6d31357975031 (patch) | |
tree | 6ae9d6296928adda447c2bb6cb2bbf16284559eb | |
parent | f4322b9f8ad5f9f62add288c785d2e10bb6a5efe (diff) | |
download | linux-25c01191c2555351922e5515b6b6d31357975031.tar.xz |
drm/amdgpu: Add reset_context flag for host FLR
There are other reset sources that pass NULL as the job pointer, such as
amdgpu_amdkfd_reset_work. Therefore, using the job pointer to check if
the FLR comes from the host does not work.
Add a flag in reset_context to explicitly mark host triggered reset, and
set this flag when we receive host reset notification.
Signed-off-by: Yunxiang Li <Yunxiang.Li@amd.com>
Reviewed-by: Emily Deng <Emily.Deng@amd.com>
Reviewed-by: Zhigang Luo <zhigang.luo@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 13 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 1 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 1 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 1 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 1 |
5 files changed, 12 insertions, 5 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 8befd10bf007..33c889c027a5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -5055,13 +5055,13 @@ static int amdgpu_device_recover_vram(struct amdgpu_device *adev) * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf * * @adev: amdgpu_device pointer - * @from_hypervisor: request from hypervisor + * @reset_context: amdgpu reset context pointer * * do VF FLR and reinitialize Asic * return 0 means succeeded otherwise failed */ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, - bool from_hypervisor) + struct amdgpu_reset_context *reset_context) { int r; struct amdgpu_hive_info *hive = NULL; @@ -5070,12 +5070,15 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, retry: amdgpu_amdkfd_pre_reset(adev); - if (from_hypervisor) + if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) { + clear_bit(AMDGPU_HOST_FLR, &reset_context->flags); r = amdgpu_virt_request_full_gpu(adev, true); - else + } else { r = amdgpu_virt_reset_gpu(adev); + } if (r) return r; + amdgpu_ras_set_fed(adev, false); amdgpu_irq_gpu_reset_resume_helper(adev); @@ -5826,7 +5829,7 @@ retry: /* Rest of adevs pre asic reset from XGMI hive. */ /* Actual ASIC resets if needed.*/ /* Host driver will handle XGMI hive reset for SRIOV */ if (amdgpu_sriov_vf(adev)) { - r = amdgpu_device_reset_sriov(adev, job ? false : true); + r = amdgpu_device_reset_sriov(adev, reset_context); if (r) adev->asic_reset_res = r; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h index b11d190ece53..5a9cc043b858 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h @@ -33,6 +33,7 @@ enum AMDGPU_RESET_FLAGS { AMDGPU_NEED_FULL_RESET = 0, AMDGPU_SKIP_HW_RESET = 1, AMDGPU_SKIP_COREDUMP = 2, + AMDGPU_HOST_FLR = 3, }; struct amdgpu_reset_context { diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c index c5ba9c4757a8..f4c47492e0cd 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c @@ -292,6 +292,7 @@ flr_done: reset_context.method = AMD_RESET_METHOD_NONE; reset_context.reset_req_dev = adev; clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); + set_bit(AMDGPU_HOST_FLR, &reset_context.flags); amdgpu_device_gpu_recover(adev, NULL, &reset_context); } diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c index fb7cf4214e3a..37b49a5ed2a1 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c @@ -328,6 +328,7 @@ flr_done: reset_context.method = AMD_RESET_METHOD_NONE; reset_context.reset_req_dev = adev; clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); + set_bit(AMDGPU_HOST_FLR, &reset_context.flags); amdgpu_device_gpu_recover(adev, NULL, &reset_context); } diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c index 14a065516ae4..78cd07744ebe 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c @@ -529,6 +529,7 @@ static void xgpu_vi_mailbox_flr_work(struct work_struct *work) reset_context.method = AMD_RESET_METHOD_NONE; reset_context.reset_req_dev = adev; clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); + set_bit(AMDGPU_HOST_FLR, &reset_context.flags); amdgpu_device_gpu_recover(adev, NULL, &reset_context); } |