diff options
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/umc_v12_0.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 97 |
1 file changed, 22 insertions, 75 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c index bfe61d86ee6c..9dbb13adb661 100644 --- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c @@ -29,6 +29,7 @@ #include "mp/mp_13_0_6_sh_mask.h" #define MAX_ECC_NUM_PER_RETIREMENT 32 +#define DELAYED_TIME_FOR_GPU_RESET 1000 //ms static inline uint64_t get_umc_v12_0_reg_offset(struct amdgpu_device *adev, uint32_t node_inst, @@ -71,7 +72,7 @@ static void umc_v12_0_reset_error_count(struct amdgpu_device *adev) bool umc_v12_0_is_deferred_error(struct amdgpu_device *adev, uint64_t mc_umc_status) { - dev_info(adev->dev, + dev_dbg(adev->dev, "MCA_UMC_STATUS(0x%llx): Val:%llu, Poison:%llu, Deferred:%llu, PCC:%llu, UC:%llu, TCC:%llu\n", mc_umc_status, REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val), @@ -376,77 +377,6 @@ static int umc_v12_0_err_cnt_init_per_channel(struct amdgpu_device *adev, return 0; } -#ifdef TO_BE_REMOVED -static void umc_v12_0_ecc_info_query_ras_error_count(struct amdgpu_device *adev, - void *ras_error_status) -{ - struct ras_query_context qctx; - - memset(&qctx, 0, sizeof(qctx)); - qctx.event_id = amdgpu_ras_acquire_event_id(adev, amdgpu_ras_intr_triggered() ? 
- RAS_EVENT_TYPE_ISR : RAS_EVENT_TYPE_INVALID); - - amdgpu_mca_smu_log_ras_error(adev, - AMDGPU_RAS_BLOCK__UMC, AMDGPU_MCA_ERROR_TYPE_CE, ras_error_status, &qctx); - amdgpu_mca_smu_log_ras_error(adev, - AMDGPU_RAS_BLOCK__UMC, AMDGPU_MCA_ERROR_TYPE_UE, ras_error_status, &qctx); -} - -static void umc_v12_0_ecc_info_query_ras_error_address(struct amdgpu_device *adev, - void *ras_error_status) -{ - struct ras_err_node *err_node; - uint64_t mc_umc_status; - struct ras_err_info *err_info; - struct ras_err_addr *mca_err_addr, *tmp; - struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; - struct ta_ras_query_address_input addr_in; - - for_each_ras_error(err_node, err_data) { - err_info = &err_node->err_info; - if (list_empty(&err_info->err_addr_list)) - continue; - - addr_in.ma.node_inst = err_info->mcm_info.die_id; - addr_in.ma.socket_id = err_info->mcm_info.socket_id; - - list_for_each_entry_safe(mca_err_addr, tmp, &err_info->err_addr_list, node) { - mc_umc_status = mca_err_addr->err_status; - if (mc_umc_status && - (umc_v12_0_is_uncorrectable_error(adev, mc_umc_status) || - umc_v12_0_is_deferred_error(adev, mc_umc_status))) { - uint64_t mca_addr, err_addr, mca_ipid; - uint32_t InstanceIdLo; - - mca_addr = mca_err_addr->err_addr; - mca_ipid = mca_err_addr->err_ipid; - - err_addr = REG_GET_FIELD(mca_addr, - MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr); - InstanceIdLo = REG_GET_FIELD(mca_ipid, MCMP1_IPIDT0, InstanceIdLo); - - addr_in.ma.err_addr = err_addr; - addr_in.ma.ch_inst = MCA_IPID_LO_2_UMC_CH(InstanceIdLo); - addr_in.ma.umc_inst = MCA_IPID_LO_2_UMC_INST(InstanceIdLo); - - dev_info(adev->dev, "UMC:IPID:0x%llx, aid:%d, inst:%d, ch:%d, err_addr:0x%llx\n", - mca_ipid, - err_info->mcm_info.die_id, - MCA_IPID_LO_2_UMC_INST(InstanceIdLo), - MCA_IPID_LO_2_UMC_CH(InstanceIdLo), - err_addr); - - umc_v12_0_convert_error_address(adev, - err_data, &addr_in); - } - - /* Delete error address node from list and free memory */ - amdgpu_ras_del_mca_err_addr(err_info, 
mca_err_addr); - } - } -} -#endif - static bool umc_v12_0_check_ecc_err_status(struct amdgpu_device *adev, enum amdgpu_mca_error_type type, void *ras_error_status) { @@ -575,7 +505,7 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev, err_addr = REG_GET_FIELD(addr, MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr); - dev_info(adev->dev, + dev_dbg(adev->dev, "UMC:IPID:0x%llx, socket:%llu, aid:%llu, inst:%llu, ch:%llu, err_addr:0x%llx\n", ipid, MCA_IPID_2_SOCKET_ID(ipid), @@ -628,7 +558,7 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev, ret = amdgpu_umc_logs_ecc_err(adev, &con->umc_ecc_log.de_page_tree, ecc_err); if (ret) { if (ret == -EEXIST) - con->umc_ecc_log.de_updated = true; + con->umc_ecc_log.de_queried_count++; else dev_err(adev->dev, "Fail to log ecc error! ret:%d\n", ret); @@ -637,7 +567,24 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev, return ret; } - con->umc_ecc_log.de_updated = true; + con->umc_ecc_log.de_queried_count++; + + /* The problem case is as follows: + * 1. GPU A triggers a gpu ras reset, and GPU A drives + * GPU B to also perform a gpu ras reset. + * 2. After gpu B ras reset started, gpu B queried a DE + * data. Since the DE data was queried in the ras reset + * thread instead of the page retirement thread, bad + * page retirement work would not be triggered. Then + * even if all gpu resets are completed, the bad pages + * will be cached in RAM until GPU B's bad page retirement + * work is triggered again and then saved to eeprom. + * Trigger delayed work to save the bad pages to eeprom in time + * after gpu ras reset is completed. + */ + if (amdgpu_ras_in_recovery(adev)) + schedule_delayed_work(&con->page_retirement_dwork, + msecs_to_jiffies(DELAYED_TIME_FOR_GPU_RESET)); return 0; } |