diff options
author | Yunxiang Li <Yunxiang.Li@amd.com> | 2024-04-22 21:59:02 +0300 |
---|---|---|
committer | Alex Deucher <alexander.deucher@amd.com> | 2024-05-02 22:40:44 +0300 |
commit | f4322b9f8ad5f9f62add288c785d2e10bb6a5efe (patch) | |
tree | 16568819259068d87856a8dc5cde125945735d2d | |
parent | f5007c67fc77ec555cf824fb8c2038a834201b38 (diff) | |
download | linux-f4322b9f8ad5f9f62add288c785d2e10bb6a5efe.tar.xz |
drm/amdgpu: Fix two reset triggered in a row
Some times a hang GPU causes multiple reset sources to schedule resets.
The second source will be able to trigger an unnecessary reset if they
schedule after we call amdgpu_device_stop_pending_resets.
Move amdgpu_device_stop_pending_resets to after the reset is done. Since
at this point the GPU is supposedly in a good state, any reset scheduled
after this point would be a legitimate reset.
Remove unnecessary and incorrect checks for amdgpu_in_reset that was
kinda serving this purpose.
Signed-off-by: Yunxiang Li <Yunxiang.Li@amd.com>
Reviewed-by: Lijo Lazar <lijo.lazar@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 19 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 2 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 2 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 2 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 2 |
5 files changed, 14 insertions, 13 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 861ccff78af9..8befd10bf007 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -5070,8 +5070,6 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, retry: amdgpu_amdkfd_pre_reset(adev); - amdgpu_device_stop_pending_resets(adev); - if (from_hypervisor) r = amdgpu_virt_request_full_gpu(adev, true); else @@ -5823,13 +5821,6 @@ retry: /* Rest of adevs pre asic reset from XGMI hive. */ r, adev_to_drm(tmp_adev)->unique); tmp_adev->asic_reset_res = r; } - - if (!amdgpu_sriov_vf(tmp_adev)) - /* - * Drop all pending non scheduler resets. Scheduler resets - * were already dropped during drm_sched_stop - */ - amdgpu_device_stop_pending_resets(tmp_adev); } /* Actual ASIC resets if needed.*/ @@ -5851,6 +5842,16 @@ retry: /* Rest of adevs pre asic reset from XGMI hive. */ goto retry; } + list_for_each_entry(tmp_adev, device_list_handle, reset_list) { + /* + * Drop any pending non scheduler resets queued before reset is done. + * Any reset scheduled after this point would be valid. Scheduler resets + * were already dropped during drm_sched_stop and no new ones can come + * in before drm_sched_start. + */ + amdgpu_device_stop_pending_resets(tmp_adev); + } + skip_hw_reset: /* Post ASIC reset for all devs .*/ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c index f04b89955fea..d98d619fba97 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c @@ -599,7 +599,7 @@ static void amdgpu_virt_update_vf2pf_work_item(struct work_struct *work) if (ret) { adev->virt.vf2pf_update_retry_cnt++; if ((adev->virt.vf2pf_update_retry_cnt >= AMDGPU_VF2PF_UPDATE_MAX_RETRY_LIMIT) && - amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev)) { + amdgpu_sriov_runtime(adev)) { amdgpu_ras_set_fed(adev, true); if (amdgpu_reset_domain_schedule(adev->reset_domain, &adev->virt.flr_work)) diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c index 0c7275bca8f7..c5ba9c4757a8 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c @@ -319,7 +319,7 @@ static int xgpu_ai_mailbox_rcv_irq(struct amdgpu_device *adev, switch (event) { case IDH_FLR_NOTIFICATION: - if (amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev)) + if (amdgpu_sriov_runtime(adev)) WARN_ONCE(!amdgpu_reset_domain_schedule(adev->reset_domain, &adev->virt.flr_work), "Failed to queue work! at %s", diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c index 1cb98e6c09c8..fb7cf4214e3a 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c @@ -358,7 +358,7 @@ static int xgpu_nv_mailbox_rcv_irq(struct amdgpu_device *adev, switch (event) { case IDH_FLR_NOTIFICATION: - if (amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev)) + if (amdgpu_sriov_runtime(adev)) WARN_ONCE(!amdgpu_reset_domain_schedule(adev->reset_domain, &adev->virt.flr_work), "Failed to queue work! at %s", diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c index 59f53c743362..14a065516ae4 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c @@ -560,7 +560,7 @@ static int xgpu_vi_mailbox_rcv_irq(struct amdgpu_device *adev, r = xgpu_vi_mailbox_rcv_msg(adev, IDH_FLR_NOTIFICATION); /* only handle FLR_NOTIFY now */ - if (!r && !amdgpu_in_reset(adev)) + if (!r) WARN_ONCE(!amdgpu_reset_domain_schedule(adev->reset_domain, &adev->virt.flr_work), "Failed to queue work! at %s", |