summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/umc_v12_0.c')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/umc_v12_0.c97
1 files changed, 22 insertions, 75 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index bfe61d86ee6c..9dbb13adb661 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -29,6 +29,7 @@
#include "mp/mp_13_0_6_sh_mask.h"
#define MAX_ECC_NUM_PER_RETIREMENT 32
+#define DELAYED_TIME_FOR_GPU_RESET 1000 //ms
static inline uint64_t get_umc_v12_0_reg_offset(struct amdgpu_device *adev,
uint32_t node_inst,
@@ -71,7 +72,7 @@ static void umc_v12_0_reset_error_count(struct amdgpu_device *adev)
bool umc_v12_0_is_deferred_error(struct amdgpu_device *adev, uint64_t mc_umc_status)
{
- dev_info(adev->dev,
+ dev_dbg(adev->dev,
"MCA_UMC_STATUS(0x%llx): Val:%llu, Poison:%llu, Deferred:%llu, PCC:%llu, UC:%llu, TCC:%llu\n",
mc_umc_status,
REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val),
@@ -376,77 +377,6 @@ static int umc_v12_0_err_cnt_init_per_channel(struct amdgpu_device *adev,
return 0;
}
-#ifdef TO_BE_REMOVED
-static void umc_v12_0_ecc_info_query_ras_error_count(struct amdgpu_device *adev,
- void *ras_error_status)
-{
- struct ras_query_context qctx;
-
- memset(&qctx, 0, sizeof(qctx));
- qctx.event_id = amdgpu_ras_acquire_event_id(adev, amdgpu_ras_intr_triggered() ?
- RAS_EVENT_TYPE_ISR : RAS_EVENT_TYPE_INVALID);
-
- amdgpu_mca_smu_log_ras_error(adev,
- AMDGPU_RAS_BLOCK__UMC, AMDGPU_MCA_ERROR_TYPE_CE, ras_error_status, &qctx);
- amdgpu_mca_smu_log_ras_error(adev,
- AMDGPU_RAS_BLOCK__UMC, AMDGPU_MCA_ERROR_TYPE_UE, ras_error_status, &qctx);
-}
-
-static void umc_v12_0_ecc_info_query_ras_error_address(struct amdgpu_device *adev,
- void *ras_error_status)
-{
- struct ras_err_node *err_node;
- uint64_t mc_umc_status;
- struct ras_err_info *err_info;
- struct ras_err_addr *mca_err_addr, *tmp;
- struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
- struct ta_ras_query_address_input addr_in;
-
- for_each_ras_error(err_node, err_data) {
- err_info = &err_node->err_info;
- if (list_empty(&err_info->err_addr_list))
- continue;
-
- addr_in.ma.node_inst = err_info->mcm_info.die_id;
- addr_in.ma.socket_id = err_info->mcm_info.socket_id;
-
- list_for_each_entry_safe(mca_err_addr, tmp, &err_info->err_addr_list, node) {
- mc_umc_status = mca_err_addr->err_status;
- if (mc_umc_status &&
- (umc_v12_0_is_uncorrectable_error(adev, mc_umc_status) ||
- umc_v12_0_is_deferred_error(adev, mc_umc_status))) {
- uint64_t mca_addr, err_addr, mca_ipid;
- uint32_t InstanceIdLo;
-
- mca_addr = mca_err_addr->err_addr;
- mca_ipid = mca_err_addr->err_ipid;
-
- err_addr = REG_GET_FIELD(mca_addr,
- MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
- InstanceIdLo = REG_GET_FIELD(mca_ipid, MCMP1_IPIDT0, InstanceIdLo);
-
- addr_in.ma.err_addr = err_addr;
- addr_in.ma.ch_inst = MCA_IPID_LO_2_UMC_CH(InstanceIdLo);
- addr_in.ma.umc_inst = MCA_IPID_LO_2_UMC_INST(InstanceIdLo);
-
- dev_info(adev->dev, "UMC:IPID:0x%llx, aid:%d, inst:%d, ch:%d, err_addr:0x%llx\n",
- mca_ipid,
- err_info->mcm_info.die_id,
- MCA_IPID_LO_2_UMC_INST(InstanceIdLo),
- MCA_IPID_LO_2_UMC_CH(InstanceIdLo),
- err_addr);
-
- umc_v12_0_convert_error_address(adev,
- err_data, &addr_in);
- }
-
- /* Delete error address node from list and free memory */
- amdgpu_ras_del_mca_err_addr(err_info, mca_err_addr);
- }
- }
-}
-#endif
-
static bool umc_v12_0_check_ecc_err_status(struct amdgpu_device *adev,
enum amdgpu_mca_error_type type, void *ras_error_status)
{
@@ -575,7 +505,7 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,
err_addr = REG_GET_FIELD(addr,
MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
- dev_info(adev->dev,
+ dev_dbg(adev->dev,
"UMC:IPID:0x%llx, socket:%llu, aid:%llu, inst:%llu, ch:%llu, err_addr:0x%llx\n",
ipid,
MCA_IPID_2_SOCKET_ID(ipid),
@@ -628,7 +558,7 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,
ret = amdgpu_umc_logs_ecc_err(adev, &con->umc_ecc_log.de_page_tree, ecc_err);
if (ret) {
if (ret == -EEXIST)
- con->umc_ecc_log.de_updated = true;
+ con->umc_ecc_log.de_queried_count++;
else
dev_err(adev->dev, "Fail to log ecc error! ret:%d\n", ret);
@@ -637,7 +567,24 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,
return ret;
}
- con->umc_ecc_log.de_updated = true;
+ con->umc_ecc_log.de_queried_count++;
+
+ /* The problem case is as follows:
+ * 1. GPU A triggers a gpu ras reset, and GPU A drives
+ * GPU B to also perform a gpu ras reset.
+ * 2. After gpu B ras reset started, gpu B queried a DE
+ * data. Since the DE data was queried in the ras reset
+ * thread instead of the page retirement thread, bad
+ * page retirement work would not be triggered. Then
+ * even if all gpu resets are completed, the bad pages
+ * will be cached in RAM until GPU B's bad page retirement
+ * work is triggered again and then saved to eeprom.
+ * Trigger delayed work to save the bad pages to eeprom in time
+ * after gpu ras reset is completed.
+ */
+ if (amdgpu_ras_in_recovery(adev))
+ schedule_delayed_work(&con->page_retirement_dwork,
+ msecs_to_jiffies(DELAYED_TIME_FOR_GPU_RESET));
return 0;
}