diff options
| author | Jani Nikula <jani.nikula@intel.com> | 2025-06-09 12:40:46 +0300 | 
|---|---|---|
| committer | Jani Nikula <jani.nikula@intel.com> | 2025-06-09 12:40:46 +0300 | 
| commit | 34c55367af96f62e89221444f04487440ebc6487 (patch) | |
| tree | fdb36ba67d7dea09455b55037e26043b7e051ef9 /drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | |
| parent | 7247efca0dcbc8ac6147db9200ed1549c0662465 (diff) | |
| parent | 19272b37aa4f83ca52bdf9c16d5d81bdd1354494 (diff) | |
| download | linux-34c55367af96f62e89221444f04487440ebc6487.tar.xz | |
Merge drm/drm-next into drm-intel-next
Sync to v6.16-rc1, among other things to get the fixed size GENMASK_U*()
and BIT_U*() macros.
Signed-off-by: Jani Nikula <jani.nikula@intel.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
| -rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 99 | 
1 files changed, 69 insertions, 30 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index f40b35f7f679..de0944947eaf 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2859,6 +2859,15 @@ static int __amdgpu_ras_convert_rec_array_from_rom(struct amdgpu_device *adev,  				return -EINVAL;  		}  	} else { +		if (bps[0].address == 0) { +			/* for specific old eeprom data, mca address is not stored, +			 * calc it from pa +			 */ +			if (amdgpu_umc_pa2mca(adev, bps[0].retired_page << AMDGPU_GPU_PAGE_SHIFT, +				&(bps[0].address), AMDGPU_NPS1_PARTITION_MODE)) +				return -EINVAL; +		} +  		if (amdgpu_ras_mca2pa(adev, &bps[0], err_data)) {  			if (nps == AMDGPU_NPS1_PARTITION_MODE)  				memcpy(err_data->err_addr, bps, @@ -2886,9 +2895,22 @@ static int __amdgpu_ras_convert_rec_from_rom(struct amdgpu_device *adev,  				bps->retired_page << AMDGPU_GPU_PAGE_SHIFT))  			return -EINVAL;  	} else { -		if (amdgpu_ras_mca2pa_by_idx(adev, bps, err_data)) -			return -EINVAL; +		if (bps->address) { +			if (amdgpu_ras_mca2pa_by_idx(adev, bps, err_data)) +				return -EINVAL; +		} else { +			/* for specific old eeprom data, mca address is not stored, +			 * calc it from pa +			 */ +			if (amdgpu_umc_pa2mca(adev, bps->retired_page << AMDGPU_GPU_PAGE_SHIFT, +				&(bps->address), AMDGPU_NPS1_PARTITION_MODE)) +				return -EINVAL; + +			if (amdgpu_ras_mca2pa(adev, bps, err_data)) +				return -EOPNOTSUPP; +		}  	} +  	return __amdgpu_ras_restore_bad_pages(adev, err_data->err_addr,  									adev->umc.retire_unit);  } @@ -2903,7 +2925,7 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,  			&adev->psp.ras_context.ras->eeprom_control;  	enum amdgpu_memory_partition nps = AMDGPU_NPS1_PARTITION_MODE;  	int ret = 0; -	uint32_t i; +	uint32_t i = 0;  	if (!con || !con->eh_data || !bps || pages <= 0)  		return 0; @@ -2924,34 +2946,36 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,  	mutex_lock(&con->recovery_lock);  	if (from_rom) { -		for (i = 0; i < pages; i++) { -			if (control->ras_num_recs - i >= adev->umc.retire_unit) { -				if ((bps[i].address == bps[i + 1].address) && -				    (bps[i].mem_channel == bps[i + 1].mem_channel)) { -					//deal with retire_unit records a time -					ret = __amdgpu_ras_convert_rec_array_from_rom(adev, -									&bps[i], &err_data, nps); -					if (ret) -						goto free; -					i += (adev->umc.retire_unit - 1); +		/* there is no pa recs in V3, so skip pa recs processing */ +		if (control->tbl_hdr.version < RAS_TABLE_VER_V3) { +			for (i = 0; i < pages; i++) { +				if (control->ras_num_recs - i >= adev->umc.retire_unit) { +					if ((bps[i].address == bps[i + 1].address) && +						(bps[i].mem_channel == bps[i + 1].mem_channel)) { +						/* deal with retire_unit records a time */ +						ret = __amdgpu_ras_convert_rec_array_from_rom(adev, +										&bps[i], &err_data, nps); +						if (ret) +							control->ras_num_bad_pages -= adev->umc.retire_unit; +						i += (adev->umc.retire_unit - 1); +					} else { +						break; +					}  				} else {  					break;  				} -			} else { -				break;  			}  		}  		for (; i < pages; i++) {  			ret = __amdgpu_ras_convert_rec_from_rom(adev,  				&bps[i], &err_data, nps);  			if (ret) -				goto free; +				control->ras_num_bad_pages -= adev->umc.retire_unit;  		}  	} else {  		ret = __amdgpu_ras_restore_bad_pages(adev, bps, pages);  	} -free:  	if (from_rom)  		kfree(err_data.err_addr);  	mutex_unlock(&con->recovery_lock); @@ -3040,21 +3064,28 @@ static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)  		dev_err(adev->dev, "Failed to load EEPROM table records!");  	} else {  		if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr) { -			for (i = 0; i < control->ras_num_recs; i++) { -				if ((control->ras_num_recs - i) >= adev->umc.retire_unit) { -					if ((bps[i].address == bps[i + 1].address) && -						(bps[i].mem_channel == bps[i + 1].mem_channel)) { -						control->ras_num_pa_recs += adev->umc.retire_unit; -						i += (adev->umc.retire_unit - 1); +			/*In V3, there is no pa recs, and some cases(when address==0) may be parsed +			as pa recs, so add verion check to avoid it. +			*/ +			if (control->tbl_hdr.version < RAS_TABLE_VER_V3) { +				for (i = 0; i < control->ras_num_recs; i++) { +					if ((control->ras_num_recs - i) >= adev->umc.retire_unit) { +						if ((bps[i].address == bps[i + 1].address) && +							(bps[i].mem_channel == bps[i + 1].mem_channel)) { +							control->ras_num_pa_recs += adev->umc.retire_unit; +							i += (adev->umc.retire_unit - 1); +						} else { +							control->ras_num_mca_recs += +										(control->ras_num_recs - i); +							break; +						}  					} else { -						control->ras_num_mca_recs += -									(control->ras_num_recs - i); +						control->ras_num_mca_recs += (control->ras_num_recs - i);  						break;  					} -				} else { -					control->ras_num_mca_recs += (control->ras_num_recs - i); -					break;  				} +			} else { +				control->ras_num_mca_recs = control->ras_num_recs;  			}  		} @@ -3463,6 +3494,10 @@ int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev)  	if (!adev->umc.ras || !adev->umc.ras->convert_ras_err_addr)  		control->ras_num_pa_recs = control->ras_num_recs; +	if (adev->umc.ras && +	    adev->umc.ras->get_retire_flip_bits) +		adev->umc.ras->get_retire_flip_bits(adev); +  	if (control->ras_num_recs) {  		ret = amdgpu_ras_load_bad_pages(adev);  		if (ret) @@ -3694,7 +3729,8 @@ static void amdgpu_ras_query_ras_capablity_from_vbios(struct amdgpu_device *adev  		 */  		if (amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(2, 6, 0) ||  		    amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0, 0) || -		    amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0, 3)) +		    amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0, 3) || +		    amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(5, 0, 1))  			adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__VCN |  						 1 << AMDGPU_RAS_BLOCK__JPEG);  		else @@ -4484,8 +4520,11 @@ void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)  		enum ras_event_type type = RAS_EVENT_TYPE_FATAL;  		u64 event_id; -		if (amdgpu_ras_mark_ras_event(adev, type)) +		if (amdgpu_ras_mark_ras_event(adev, type)) { +			dev_err(adev->dev, +				"uncorrectable hardware error (ERREVENT_ATHUB_INTERRUPT) detected!\n");  			return; +		}  		event_id = amdgpu_ras_acquire_event_id(adev, type);  | 
