diff options
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 41 |
1 files changed, 23 insertions, 18 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c index 52c16bfeccaa..723c655bb4d5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c @@ -558,16 +558,17 @@ bool amdgpu_ras_eeprom_check_err_threshold(struct amdgpu_device *adev) return false; if (con->eeprom_control.tbl_hdr.header == RAS_TABLE_HDR_BAD) { - if (amdgpu_bad_page_threshold == -1) { + if (con->eeprom_control.ras_num_bad_pages > con->bad_page_cnt_threshold) dev_warn(adev->dev, "RAS records:%d exceed threshold:%d", - con->eeprom_control.ras_num_bad_pages, con->bad_page_cnt_threshold); + con->eeprom_control.ras_num_bad_pages, con->bad_page_cnt_threshold); + if ((amdgpu_bad_page_threshold == -1) || + (amdgpu_bad_page_threshold == -2)) { dev_warn(adev->dev, - "But GPU can be operated due to bad_page_threshold = -1.\n"); + "Please consult AMD Service Action Guide (SAG) for appropriate service procedures.\n"); return false; } else { - dev_warn(adev->dev, "This GPU is in BAD status."); - dev_warn(adev->dev, "Please retire it or set a larger " - "threshold value when reloading driver.\n"); + dev_warn(adev->dev, + "Please consider adjusting the customized threshold.\n"); return true; } } @@ -758,7 +759,8 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control) control->tbl_rai.health_percent = 0; } - if (amdgpu_bad_page_threshold != -1) + if ((amdgpu_bad_page_threshold != -1) && + (amdgpu_bad_page_threshold != -2)) ras->is_rma = true; /* ignore the -ENOTSUPP return value */ @@ -1428,8 +1430,9 @@ int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control) res = __verify_ras_table_checksum(control); if (res) - DRM_ERROR("RAS table incorrect checksum or error:%d\n", - res); + dev_err(adev->dev, + "RAS table incorrect checksum or error:%d\n", + res); /* Warn if we are at 90% of the threshold or above */ @@ -1447,8 +1450,9 @@ int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control) res = __verify_ras_table_checksum(control); if (res) { - dev_err(adev->dev, "RAS Table incorrect checksum or error:%d\n", - res); + dev_err(adev->dev, + "RAS Table incorrect checksum or error:%d\n", + res); return -EINVAL; } if (ras->bad_page_cnt_threshold > control->ras_num_bad_pages) { @@ -1466,17 +1470,18 @@ int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control) res = amdgpu_ras_eeprom_correct_header_tag(control, RAS_TABLE_HDR_VAL); } else { - dev_err(adev->dev, "RAS records:%d exceed threshold:%d", + dev_warn(adev->dev, + "RAS records:%d exceed threshold:%d\n", control->ras_num_bad_pages, ras->bad_page_cnt_threshold); - if (amdgpu_bad_page_threshold == -1) { - dev_warn(adev->dev, "GPU will be initialized due to bad_page_threshold = -1."); + if ((amdgpu_bad_page_threshold == -1) || + (amdgpu_bad_page_threshold == -2)) { res = 0; + dev_warn(adev->dev, + "Please consult AMD Service Action Guide (SAG) for appropriate service procedures\n"); } else { ras->is_rma = true; - dev_err(adev->dev, - "RAS records:%d exceed threshold:%d, " - "GPU will not be initialized. Replace this GPU or increase the threshold", - control->ras_num_bad_pages, ras->bad_page_cnt_threshold); + dev_warn(adev->dev, + "User defined threshold is set, runtime service will be halt when threshold is reached\n"); } } } else { |