summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLijo Lazar <lijo.lazar@amd.com>2026-02-24 07:48:51 +0300
committerAlex Deucher <alexander.deucher@amd.com>2026-02-26 00:56:33 +0300
commitbb71362182e59caa227e4192da5a612b09349696 (patch)
tree8a959209b139d0a54dd77f66db73cc9476234f51
parent9eaaae4c4b7a999d59e534c6e4218b175274ff33 (diff)
downloadlinux-bb71362182e59caa227e4192da5a612b09349696.tar.xz
drm/amdgpu: Fix error handling in slot reset
If the device has not recovered after slot reset is called, it goes to the out label for error handling. There it could make a decision based on an uninitialized hive pointer, which could result in accessing an uninitialized list. Initialize the list and hive properly so that it handles the error situation and also releases the reset domain lock which is acquired during the error_detected callback.

Fixes: 732c6cefc1ec ("drm/amdgpu: Replace tmp_adev with hive in amdgpu_pci_slot_reset")
Signed-off-by: Lijo Lazar <lijo.lazar@amd.com>
Reviewed-by: Ce Sun <cesun102@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_device.c17
1 files changed, 10 insertions, 7 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 0acddcb04730..d5bf62bb4602 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -7043,6 +7043,15 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
dev_info(adev->dev, "PCI error: slot reset callback!!\n");
memset(&reset_context, 0, sizeof(reset_context));
+ INIT_LIST_HEAD(&device_list);
+ hive = amdgpu_get_xgmi_hive(adev);
+ if (hive) {
+ mutex_lock(&hive->hive_lock);
+ list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
+ list_add_tail(&tmp_adev->reset_list, &device_list);
+ } else {
+ list_add_tail(&adev->reset_list, &device_list);
+ }
if (adev->pcie_reset_ctx.swus)
link_dev = adev->pcie_reset_ctx.swus;
@@ -7083,19 +7092,13 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
reset_context.reset_req_dev = adev;
set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);
- INIT_LIST_HEAD(&device_list);
- hive = amdgpu_get_xgmi_hive(adev);
if (hive) {
- mutex_lock(&hive->hive_lock);
reset_context.hive = hive;
- list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
+ list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
tmp_adev->pcie_reset_ctx.in_link_reset = true;
- list_add_tail(&tmp_adev->reset_list, &device_list);
- }
} else {
set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
- list_add_tail(&adev->reset_list, &device_list);
}
r = amdgpu_device_asic_reset(adev, &device_list, &reset_context);