1 files changed, 61 insertions, 16 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index f28f6b4ba765..52c16bfeccaa 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -470,9 +470,10 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)
 		res = __write_table_ras_info(control);
 
 	control->ras_num_recs = 0;
+	control->ras_num_bad_pages = 0;
 	control->ras_fri = 0;
 
-	amdgpu_dpm_send_hbm_bad_pages_num(adev, control->ras_num_recs);
+	amdgpu_dpm_send_hbm_bad_pages_num(adev, control->ras_num_bad_pages);
 
 	control->bad_channel_bitmap = 0;
 	amdgpu_dpm_send_hbm_bad_channel_flag(adev, control->bad_channel_bitmap);
@@ -559,7 +560,7 @@ bool amdgpu_ras_eeprom_check_err_threshold(struct amdgpu_device *adev)
 	if (con->eeprom_control.tbl_hdr.header == RAS_TABLE_HDR_BAD) {
 		if (amdgpu_bad_page_threshold == -1) {
 			dev_warn(adev->dev, "RAS records:%d exceed threshold:%d",
-				con->eeprom_control.ras_num_recs, con->bad_page_cnt_threshold);
+				con->eeprom_control.ras_num_bad_pages, con->bad_page_cnt_threshold);
 			dev_warn(adev->dev,
 				"But GPU can be operated due to bad_page_threshold = -1.\n");
 			return false;
@@ -621,6 +622,7 @@ amdgpu_ras_eeprom_append_table(struct amdgpu_ras_eeprom_control *control,
 			       const u32 num)
 {
 	struct amdgpu_ras *con = amdgpu_ras_get_context(to_amdgpu_device(control));
+	struct amdgpu_device *adev = to_amdgpu_device(control);
 	u32 a, b, i;
 	u8 *buf, *pp;
 	int res;
@@ -723,6 +725,12 @@ amdgpu_ras_eeprom_append_table(struct amdgpu_ras_eeprom_control *control,
 	control->ras_num_recs = 1 + (control->ras_max_record_count + b
 				     - control->ras_fri)
 		% control->ras_max_record_count;
+
+	if (control->rec_type == AMDGPU_RAS_EEPROM_REC_PA)
+		control->ras_num_bad_pages = control->ras_num_recs;
+	else
+		control->ras_num_bad_pages =
+			control->ras_num_recs * adev->umc.retire_unit;
 Out:
 	kfree(buf);
 	return res;
@@ -740,10 +748,10 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
 	/* Modify the header if it exceeds.
 	 */
 	if (amdgpu_bad_page_threshold != 0 &&
-	    control->ras_num_recs >= ras->bad_page_cnt_threshold) {
+	    control->ras_num_bad_pages >= ras->bad_page_cnt_threshold) {
 		dev_warn(adev->dev,
 			"Saved bad pages %d reaches threshold value %d\n",
-			control->ras_num_recs, ras->bad_page_cnt_threshold);
+			control->ras_num_bad_pages, ras->bad_page_cnt_threshold);
 		control->tbl_hdr.header = RAS_TABLE_HDR_BAD;
 		if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1) {
 			control->tbl_rai.rma_status = GPU_RETIRED__ECC_REACH_THRESHOLD;
@@ -798,9 +806,9 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
 	 */
 	if (amdgpu_bad_page_threshold != 0 &&
 	    control->tbl_hdr.version == RAS_TABLE_VER_V2_1 &&
-	    control->ras_num_recs < ras->bad_page_cnt_threshold)
+	    control->ras_num_bad_pages < ras->bad_page_cnt_threshold)
 		control->tbl_rai.health_percent = ((ras->bad_page_cnt_threshold -
-						   control->ras_num_recs) * 100) /
+						   control->ras_num_bad_pages) * 100) /
 						   ras->bad_page_cnt_threshold;
 
 	/* Recalc the checksum.
@@ -841,7 +849,7 @@ int amdgpu_ras_eeprom_append(struct amdgpu_ras_eeprom_control *control,
 			     const u32 num)
 {
 	struct amdgpu_device *adev = to_amdgpu_device(control);
-	int res;
+	int res, i;
 
 	if (!__is_ras_eeprom_supported(adev))
 		return 0;
@@ -855,6 +863,10 @@ int amdgpu_ras_eeprom_append(struct amdgpu_ras_eeprom_control *control,
 		return -EINVAL;
 	}
 
+	/* set the new channel index flag */
+	for (i = 0; i < num; i++)
+		record[i].retired_page |= UMC_CHANNEL_IDX_V2;
+
 	mutex_lock(&control->ras_tbl_mutex);
 
 	res = amdgpu_ras_eeprom_append_table(control, record, num);
@@ -864,6 +876,11 @@ int amdgpu_ras_eeprom_append(struct amdgpu_ras_eeprom_control *control,
 		amdgpu_ras_debugfs_set_ret_size(control);
 
 	mutex_unlock(&control->ras_tbl_mutex);
+
+	/* clear channel index flag, the flag is only saved on eeprom */
+	for (i = 0; i < num; i++)
+		record[i].retired_page &= ~UMC_CHANNEL_IDX_V2;
+
 	return res;
 }
 
@@ -1373,9 +1390,35 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control)
 	}
 	control->ras_fri = RAS_OFFSET_TO_INDEX(control, hdr->first_rec_offset);
 
+	return 0;
+}
+
+int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control)
+{
+	struct amdgpu_device *adev = to_amdgpu_device(control);
+	struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
+	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+	int res;
+
+	if (!__is_ras_eeprom_supported(adev))
+		return 0;
+
+	/* Verify i2c adapter is initialized */
+	if (!adev->pm.ras_eeprom_i2c_bus || !adev->pm.ras_eeprom_i2c_bus->algo)
+		return -ENOENT;
+
+	if (!__get_eeprom_i2c_addr(adev, control))
+		return -EINVAL;
+
+	if (control->rec_type == AMDGPU_RAS_EEPROM_REC_PA)
+		control->ras_num_bad_pages = control->ras_num_recs;
+	else
+		control->ras_num_bad_pages =
+			control->ras_num_recs * adev->umc.retire_unit;
+
 	if (hdr->header == RAS_TABLE_HDR_VAL) {
 		DRM_DEBUG_DRIVER("Found existing EEPROM table with %d records",
-				 control->ras_num_recs);
+				 control->ras_num_bad_pages);
 
 		if (hdr->version == RAS_TABLE_VER_V2_1) {
 			res = __read_table_ras_info(control);
@@ -1390,9 +1433,9 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control)
 
 		/* Warn if we are at 90% of the threshold or above
 		 */
-		if (10 * control->ras_num_recs >= 9 * ras->bad_page_cnt_threshold)
+		if (10 * control->ras_num_bad_pages >= 9 * ras->bad_page_cnt_threshold)
 			dev_warn(adev->dev, "RAS records:%u exceeds 90%% of threshold:%d",
-					control->ras_num_recs,
+					control->ras_num_bad_pages,
 					ras->bad_page_cnt_threshold);
 	} else if (hdr->header == RAS_TABLE_HDR_BAD &&
 		   amdgpu_bad_page_threshold != 0) {
@@ -1403,10 +1446,12 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control)
 		}
 
 		res = __verify_ras_table_checksum(control);
-		if (res)
-			DRM_ERROR("RAS Table incorrect checksum or error:%d\n",
+		if (res) {
+			dev_err(adev->dev, "RAS Table incorrect checksum or error:%d\n",
 				  res);
-		if (ras->bad_page_cnt_threshold > control->ras_num_recs) {
+			return -EINVAL;
+		}
+		if (ras->bad_page_cnt_threshold > control->ras_num_bad_pages) {
 			/* This means that, the threshold was increased since
 			 * the last time the system was booted, and now,
 			 * ras->bad_page_cnt_threshold - control->num_recs > 0,
@@ -1416,13 +1461,13 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control)
 			dev_info(adev->dev,
 				 "records:%d threshold:%d, resetting "
 				 "RAS table header signature",
-				 control->ras_num_recs,
+				 control->ras_num_bad_pages,
 				 ras->bad_page_cnt_threshold);
 			res = amdgpu_ras_eeprom_correct_header_tag(control,
 								   RAS_TABLE_HDR_VAL);
 		} else {
 			dev_err(adev->dev, "RAS records:%d exceed threshold:%d",
-				control->ras_num_recs, ras->bad_page_cnt_threshold);
+				control->ras_num_bad_pages, ras->bad_page_cnt_threshold);
 			if (amdgpu_bad_page_threshold == -1) {
 				dev_warn(adev->dev, "GPU will be initialized due to bad_page_threshold = -1.");
 				res = 0;
@@ -1431,7 +1476,7 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control)
 				dev_err(adev->dev,
 					"RAS records:%d exceed threshold:%d, "
 					"GPU will not be initialized. Replace this GPU or increase the threshold",
-					control->ras_num_recs, ras->bad_page_cnt_threshold);
+					control->ras_num_bad_pages, ras->bad_page_cnt_threshold);
 			}
 		}
 	} else {