| author | YiPeng Chai <YiPeng.Chai@amd.com> | 2025-07-24 10:34:29 +0300 |
|---|---|---|
| committer | Alex Deucher <alexander.deucher@amd.com> | 2025-07-28 23:40:06 +0300 |
| commit | f3486918979030f8982e1af901561dbd6e2cd1bc | |
| tree | b21eabca436b6d0c3a5a1e3a813bf9164b0ebcc8 /drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | |
| parent | d45c5e6845a76169ef3d6076f0f04487e5776905 | |
| download | linux-f3486918979030f8982e1af901561dbd6e2cd1bc.tar.xz | |
drm/amdgpu: support ras critical address check
Support ras critical address check.
Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
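At a glance: the patch adds a mutex-protected list of critical VRAM regions to struct amdgpu_ras (critical_region_head / critical_region_lock, initialized in amdgpu_ras_init() and torn down in amdgpu_ras_fini()), records the firmware-reserved buffer (adev->mman.fw_reserved_memory) into that list via amdgpu_ras_add_critical_region() when RAS recovery is finalized, and exposes amdgpu_ras_check_critical_address() to test whether a given address falls inside any recorded region. A standalone model of the check follows the diff below.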
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
| -rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 89 | 
1 file changed, 89 insertions, 0 deletions
```diff
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index d49671a2ecd5..b75a0dc22ace 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -143,6 +143,10 @@ static int amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
 				uint64_t addr);
 static int amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
 				uint64_t addr);
+
+static void amdgpu_ras_critical_region_init(struct amdgpu_device *adev);
+static void amdgpu_ras_critical_region_fini(struct amdgpu_device *adev);
+
 #ifdef CONFIG_X86_MCE_AMD
 static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device *adev);
 struct mce_notifier_adev_list {
@@ -3728,6 +3732,8 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
 	kfree(data);
 	mutex_unlock(&con->recovery_lock);
 
+	amdgpu_ras_critical_region_init(adev);
+
 	return 0;
 }
 /* recovery end */
@@ -4157,6 +4163,9 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
 	con->init_task_pid = task_pid_nr(current);
 	get_task_comm(con->init_task_comm, current);
 
+	mutex_init(&con->critical_region_lock);
+	INIT_LIST_HEAD(&con->critical_region_head);
+
 	dev_info(adev->dev, "RAS INFO: ras initialized successfully, "
 		 "hardware ability[%x] ras_mask[%x]\n",
 		 adev->ras_hw_enabled, adev->ras_enabled);
@@ -4436,6 +4445,9 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)
 	if (!adev->ras_enabled || !con)
 		return 0;
 
+	amdgpu_ras_critical_region_fini(adev);
+	mutex_destroy(&con->critical_region_lock);
+
 	list_for_each_entry_safe(ras_node, tmp, &adev->ras_list, node) {
 		if (ras_node->ras_obj) {
 			obj = ras_node->ras_obj;
@@ -5380,3 +5392,80 @@ bool amdgpu_ras_is_rma(struct amdgpu_device *adev)
 
 	return con->is_rma;
 }
+
+int amdgpu_ras_add_critical_region(struct amdgpu_device *adev,
+			struct amdgpu_bo *bo)
+{
+	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+	struct amdgpu_vram_mgr_resource *vres;
+	struct ras_critical_region *region;
+	struct drm_buddy_block *block;
+	int ret = 0;
+
+	if (!bo || !bo->tbo.resource)
+		return -EINVAL;
+
+	vres = to_amdgpu_vram_mgr_resource(bo->tbo.resource);
+
+	mutex_lock(&con->critical_region_lock);
+
+	/* Check if the bo had been recorded */
+	list_for_each_entry(region, &con->critical_region_head, node)
+		if (region->bo == bo)
+			goto out;
+
+	/* Record new critical amdgpu bo */
+	list_for_each_entry(block, &vres->blocks, link) {
+		region = kzalloc(sizeof(*region), GFP_KERNEL);
+		if (!region) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		region->bo = bo;
+		region->start = amdgpu_vram_mgr_block_start(block);
+		region->size = amdgpu_vram_mgr_block_size(block);
+		list_add_tail(&region->node, &con->critical_region_head);
+	}
+
+out:
+	mutex_unlock(&con->critical_region_lock);
+
+	return ret;
+}
+
+static void amdgpu_ras_critical_region_init(struct amdgpu_device *adev)
+{
+	amdgpu_ras_add_critical_region(adev, adev->mman.fw_reserved_memory);
+}
+
+static void amdgpu_ras_critical_region_fini(struct amdgpu_device *adev)
+{
+	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+	struct ras_critical_region *region, *tmp;
+
+	mutex_lock(&con->critical_region_lock);
+	list_for_each_entry_safe(region, tmp, &con->critical_region_head, node) {
+		list_del(&region->node);
+		kfree(region);
+	}
+	mutex_unlock(&con->critical_region_lock);
+}
+
+bool amdgpu_ras_check_critical_address(struct amdgpu_device *adev, uint64_t addr)
+{
+	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+	struct ras_critical_region *region;
+	bool ret = false;
+
+	mutex_lock(&con->critical_region_lock);
+	list_for_each_entry(region, &con->critical_region_head, node) {
+		if ((region->start <= addr) &&
+		    (addr < (region->start + region->size))) {
+			ret = true;
+			break;
+		}
+	}
+	mutex_unlock(&con->critical_region_lock);
+
+	return ret;
+}
```
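The additions boil down to a mutex-protected list of half-open [start, start + size) VRAM intervals, one entry per drm_buddy block backing the registered buffer object, plus a linear membership scan. The sketch below models that logic as a self-contained userspace C program so it can be compiled and run on its own; the names echo the patch, but the fixed array standing in for the kernel's list_head/mutex machinery, and the example addresses, are illustrative assumptions, not the kernel implementation.

```c
/* Userspace model of the patch's critical-region check.
 * Names mirror the kernel code; a fixed array replaces the
 * kernel's list_head and mutex for brevity. Illustrative only. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct ras_critical_region {
	uint64_t start;
	uint64_t size;
};

static struct ras_critical_region regions[16];
static int nr_regions;

/* Record one VRAM block; the kernel adds one entry per
 * drm_buddy block backing the buffer object. */
static int add_critical_region(uint64_t start, uint64_t size)
{
	if (nr_regions >= 16)
		return -1;
	regions[nr_regions].start = start;
	regions[nr_regions].size = size;
	nr_regions++;
	return 0;
}

/* Same half-open interval test as amdgpu_ras_check_critical_address():
 * start <= addr < start + size, linear scan over all regions. */
static bool check_critical_address(uint64_t addr)
{
	for (int i = 0; i < nr_regions; i++)
		if (regions[i].start <= addr &&
		    addr < regions[i].start + regions[i].size)
			return true;
	return false;
}

int main(void)
{
	/* Hypothetical example: two buddy blocks backing one reserved BO. */
	add_critical_region(0x100000, 0x10000);
	add_critical_region(0x200000, 0x8000);

	printf("%d\n", check_critical_address(0x100000)); /* 1: first byte */
	printf("%d\n", check_critical_address(0x110000)); /* 0: one past end */
	printf("%d\n", check_critical_address(0x204000)); /* 1: inside 2nd block */
	return 0;
}
```

Recording one region per buddy block rather than one per buffer object matters because a VRAM allocation may be physically non-contiguous; each block gets its own interval, and the half-open test keeps adjacent blocks from overlapping at their shared boundary.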
