diff options
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 85 |
1 files changed, 64 insertions, 21 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index dc07936d2fcb..540817e296da 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1107,6 +1107,9 @@ static void amdgpu_ras_error_print_error_data(struct amdgpu_device *adev, err_info->de_count, blk_name); } } else { + if (adev->debug_disable_ce_logs) + return; + for_each_ras_error(err_node, err_data) { err_info = &err_node->err_info; mcm_info = &err_info->mcm_info; @@ -2124,7 +2127,7 @@ static int amdgpu_ras_fs_init(struct amdgpu_device *adev) con->badpages_attr = bin_attr_gpu_vram_bad_pages; sysfs_bin_attr_init(&con->badpages_attr); bin_attrs[0] = &con->badpages_attr; - group.bin_attrs_new = bin_attrs; + group.bin_attrs = bin_attrs; } r = sysfs_create_group(&adev->dev->kobj, &group); @@ -2854,11 +2857,27 @@ static int __amdgpu_ras_convert_rec_array_from_rom(struct amdgpu_device *adev, if (amdgpu_umc_pages_in_a_row(adev, err_data, bps[0].retired_page << AMDGPU_GPU_PAGE_SHIFT)) return -EINVAL; + for (i = 0; i < adev->umc.retire_unit; i++) { + err_data->err_addr[i].address = bps[0].address; + err_data->err_addr[i].mem_channel = bps[0].mem_channel; + err_data->err_addr[i].bank = bps[0].bank; + err_data->err_addr[i].err_type = bps[0].err_type; + err_data->err_addr[i].mcumc_id = bps[0].mcumc_id; + } } else { if (amdgpu_ras_mca2pa_by_idx(adev, &bps[0], err_data)) return -EINVAL; } } else { + if (bps[0].address == 0) { + /* for specific old eeprom data, mca address is not stored, + * calc it from pa + */ + if (amdgpu_umc_pa2mca(adev, bps[0].retired_page << AMDGPU_GPU_PAGE_SHIFT, + &(bps[0].address), AMDGPU_NPS1_PARTITION_MODE)) + return -EINVAL; + } + if (amdgpu_ras_mca2pa(adev, &bps[0], err_data)) { if (nps == AMDGPU_NPS1_PARTITION_MODE) memcpy(err_data->err_addr, bps, @@ -2876,6 +2895,7 @@ static int __amdgpu_ras_convert_rec_from_rom(struct amdgpu_device *adev, struct eeprom_table_record *bps, struct ras_err_data *err_data, enum amdgpu_memory_partition nps) { + int i = 0; enum amdgpu_memory_partition save_nps; save_nps = (bps->retired_page >> UMC_NPS_SHIFT) & UMC_NPS_MASK; @@ -2885,9 +2905,28 @@ static int __amdgpu_ras_convert_rec_from_rom(struct amdgpu_device *adev, if (amdgpu_umc_pages_in_a_row(adev, err_data, bps->retired_page << AMDGPU_GPU_PAGE_SHIFT)) return -EINVAL; + for (i = 0; i < adev->umc.retire_unit; i++) { + err_data->err_addr[i].address = bps->address; + err_data->err_addr[i].mem_channel = bps->mem_channel; + err_data->err_addr[i].bank = bps->bank; + err_data->err_addr[i].err_type = bps->err_type; + err_data->err_addr[i].mcumc_id = bps->mcumc_id; + } } else { - if (amdgpu_ras_mca2pa_by_idx(adev, bps, err_data)) - return -EINVAL; + if (bps->address) { + if (amdgpu_ras_mca2pa_by_idx(adev, bps, err_data)) + return -EINVAL; + } else { + /* for specific old eeprom data, mca address is not stored, + * calc it from pa + */ + if (amdgpu_umc_pa2mca(adev, bps->retired_page << AMDGPU_GPU_PAGE_SHIFT, + &(bps->address), AMDGPU_NPS1_PARTITION_MODE)) + return -EINVAL; + + if (amdgpu_ras_mca2pa(adev, bps, err_data)) + return -EOPNOTSUPP; + } } return __amdgpu_ras_restore_bad_pages(adev, err_data->err_addr, @@ -2982,6 +3021,15 @@ int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev, return 0; } + if (!con->eeprom_control.is_eeprom_valid) { + dev_warn(adev->dev, + "Failed to save EEPROM table data because of EEPROM data corruption!"); + if (new_cnt) + *new_cnt = 0; + + return 0; + } + mutex_lock(&con->recovery_lock); control = &con->eeprom_control; data = con->eh_data; @@ -3273,7 +3321,6 @@ static int amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev, uint64_t de_queried_count; uint32_t new_detect_count, total_detect_count; uint32_t need_query_count = poison_creation_count; - bool query_data_timeout = false; enum ras_event_type type = RAS_EVENT_TYPE_POISON_CREATION; memset(&info, 0, sizeof(info)); @@ -3302,21 +3349,13 @@ static int amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev, timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC; if (timeout) { - if (!--timeout) { - query_data_timeout = true; + if (!--timeout) break; - } msleep(1); } } } while (total_detect_count < need_query_count); - if (query_data_timeout) { - dev_warn(adev->dev, "Can't find deferred error! count: %u\n", - (need_query_count - total_detect_count)); - return -ENOENT; - } - if (total_detect_count) schedule_delayed_work(&ras->page_retirement_dwork, 0); @@ -3467,8 +3506,7 @@ int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev) control = &con->eeprom_control; ret = amdgpu_ras_eeprom_init(control); - if (ret) - return ret; + control->is_eeprom_valid = !ret; if (!adev->umc.ras || !adev->umc.ras->convert_ras_err_addr) control->ras_num_pa_recs = control->ras_num_recs; @@ -3477,10 +3515,12 @@ int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev) adev->umc.ras->get_retire_flip_bits) adev->umc.ras->get_retire_flip_bits(adev); - if (control->ras_num_recs) { + if (control->ras_num_recs && control->is_eeprom_valid) { ret = amdgpu_ras_load_bad_pages(adev); - if (ret) - return ret; + if (ret) { + control->is_eeprom_valid = false; + return 0; + } amdgpu_dpm_send_hbm_bad_pages_num( adev, control->ras_num_bad_pages); @@ -3499,7 +3539,7 @@ int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev) dev_warn(adev->dev, "Failed to format RAS EEPROM data in V3 version!\n"); } - return ret; + return 0; } int amdgpu_ras_recovery_init(struct amdgpu_device *adev, bool init_bp_info) @@ -3708,7 +3748,8 @@ static void amdgpu_ras_query_ras_capablity_from_vbios(struct amdgpu_device *adev */ if (amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(2, 6, 0) || amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0, 0) || - amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0, 3)) + amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0, 3) || + amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(5, 0, 1)) adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__VCN | 1 << AMDGPU_RAS_BLOCK__JPEG); else @@ -4392,8 +4433,10 @@ void amdgpu_ras_clear_err_state(struct amdgpu_device *adev) struct amdgpu_ras *ras; ras = amdgpu_ras_get_context(adev); - if (ras) + if (ras) { ras->ras_err_state = 0; + ras->gpu_reset_flags = 0; + } } void amdgpu_ras_set_err_poison(struct amdgpu_device *adev, |