summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
diff options
context:
space:
mode:
authorArnaldo Carvalho de Melo <acme@redhat.com>2021-11-06 21:49:33 +0300
committerArnaldo Carvalho de Melo <acme@redhat.com>2021-11-06 21:49:33 +0300
commit7f9f879243d6cf5d2d60d12065e93189cc343387 (patch)
treee74619a16f8291f7225ee9877cf93e530243e0d3 /drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
parent32f7aa2731b24ad8393f26d63df959d74844345f (diff)
parentfe91c4725aeed35023ba4f7a1e1adfebb6878c23 (diff)
downloadlinux-7f9f879243d6cf5d2d60d12065e93189cc343387.tar.xz
Merge remote-tracking branch 'torvalds/master' into perf/core
To pick up some tools/perf/ patches that went via tip/perf/core, such as: tools/perf: Add mem_hops field in perf_mem_data_src structure Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c22
1 files changed, 18 insertions, 4 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index 98732518543e..05117eda105b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -1077,6 +1077,13 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
if (res)
DRM_ERROR("RAS table incorrect checksum or error:%d\n",
res);
+
+ /* Warn if we are at 90% of the threshold or above
+ */
+ if (10 * control->ras_num_recs >= 9 * ras->bad_page_cnt_threshold)
+ dev_warn(adev->dev, "RAS records:%u exceeds 90%% of threshold:%d",
+ control->ras_num_recs,
+ ras->bad_page_cnt_threshold);
} else if (hdr->header == RAS_TABLE_HDR_BAD &&
amdgpu_bad_page_threshold != 0) {
res = __verify_ras_table_checksum(control);
@@ -1098,11 +1105,18 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
res = amdgpu_ras_eeprom_correct_header_tag(control,
RAS_TABLE_HDR_VAL);
} else {
- *exceed_err_limit = true;
- dev_err(adev->dev,
- "RAS records:%d exceed threshold:%d, "
- "maybe retire this GPU?",
+ dev_err(adev->dev, "RAS records:%d exceed threshold:%d",
control->ras_num_recs, ras->bad_page_cnt_threshold);
+ if (amdgpu_bad_page_threshold == -2) {
+ dev_warn(adev->dev, "GPU will be initialized due to bad_page_threshold = -2.");
+ res = 0;
+ } else {
+ *exceed_err_limit = true;
+ dev_err(adev->dev,
+ "RAS records:%d exceed threshold:%d, "
+ "GPU will not be initialized. Replace this GPU or increase the threshold",
+ control->ras_num_recs, ras->bad_page_cnt_threshold);
+ }
}
} else {
DRM_INFO("Creating a new EEPROM table");