From b0d4783a382297048cb00e3e94078aa301296841 Mon Sep 17 00:00:00 2001 From: Guchun Chen Date: Fri, 22 May 2020 15:50:15 +0800 Subject: drm/amdgpu: print warning when input address is invalid This will assist debug in error injection case. Signed-off-by: Guchun Chen Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 50fe08bf2f72..9475891ee989 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -318,6 +318,9 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user * case 2: if ((data.inject.address >= adev->gmc.mc_vram_size) || (data.inject.address >= RAS_UMC_INJECT_ADDR_LIMIT)) { + dev_warn(adev->dev, "RAS WARN: input address " + "0x%llx is invalid.", + data.inject.address); ret = -EINVAL; break; } -- cgit v1.2.3 From 5e91160ac0b5cfbbaeb62cbff8b069262095f744 Mon Sep 17 00:00:00 2001 From: Guchun Chen Date: Tue, 2 Jun 2020 13:46:22 +0800 Subject: drm/amdgpu: fix RAS memory leak in error case RAS context memory needs to freed in failure case. Signed-off-by: Guchun Chen Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 9475891ee989..c4ccc7f6637d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1917,9 +1917,8 @@ int amdgpu_ras_init(struct amdgpu_device *adev) amdgpu_ras_check_supported(adev, &con->hw_supported, &con->supported); if (!con->hw_supported) { - amdgpu_ras_set_context(adev, NULL); - kfree(con); - return 0; + r = 0; + goto err_out; } con->features = 0; @@ -1930,29 +1929,31 @@ int amdgpu_ras_init(struct amdgpu_device *adev) if (adev->nbio.funcs->init_ras_controller_interrupt) { r = adev->nbio.funcs->init_ras_controller_interrupt(adev); if (r) - return r; + goto err_out; } if (adev->nbio.funcs->init_ras_err_event_athub_interrupt) { r = adev->nbio.funcs->init_ras_err_event_athub_interrupt(adev); if (r) - return r; + goto err_out; } amdgpu_ras_mask &= AMDGPU_RAS_BLOCK_MASK; - if (amdgpu_ras_fs_init(adev)) - goto fs_out; + if (amdgpu_ras_fs_init(adev)) { + r = -EINVAL; + goto err_out; + } dev_info(adev->dev, "RAS INFO: ras initialized successfully, " "hardware ability[%x] ras_mask[%x]\n", con->hw_supported, con->supported); return 0; -fs_out: +err_out: amdgpu_ras_set_context(adev, NULL); kfree(con); - return -EINVAL; + return r; } /* helper function to handle common stuff in ip late init phase */ -- cgit v1.2.3 From 9e69b1ee1d9e1d58244279e39f032658df8cead6 Mon Sep 17 00:00:00 2001 From: Guchun Chen Date: Tue, 2 Jun 2020 13:53:09 +0800 Subject: drm/amdgpu: remove useless code in RAS Module parameter amdgpu_ras_mask has been involved in the calculation of ras support capability, so drop this redundant code. Signed-off-by: Guchun Chen Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index c4ccc7f6637d..9bfe59b70030 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1938,8 +1938,6 @@ int amdgpu_ras_init(struct amdgpu_device *adev) goto err_out; } - amdgpu_ras_mask &= AMDGPU_RAS_BLOCK_MASK; - if (amdgpu_ras_fs_init(adev)) { r = -EINVAL; goto err_out; -- cgit v1.2.3 From f3167919f689c9b547aca82ec1eeb455e4eb120b Mon Sep 17 00:00:00 2001 From: Nirmoy Das Date: Thu, 18 Jun 2020 16:09:12 +0200 Subject: drm/amdgpu: label internally used symbols as static Used sparse(make C=1) to find these loose ends. v2: removed unwanted extra line Signed-off-by: Nirmoy Das Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_fru_eeprom.c | 4 ++-- drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 6 +++--- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 4 ++-- drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c | 4 ++-- drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 2 +- drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 2 +- 9 files changed, 14 insertions(+), 14 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c index b5d6274952a5..4ef38c2411ae 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c @@ -32,7 +32,7 @@ #define mmMM_DATA 0x1 #define HW_ID_MAX 300 -const char *hw_id_names[HW_ID_MAX] = { +static const char *hw_id_names[HW_ID_MAX] = { [MP1_HWID] = "MP1", [MP2_HWID] = "MP2", [THM_HWID] = "THM", diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fru_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fru_eeprom.c index 6ae80b33182c..e811fecc540f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fru_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fru_eeprom.c @@ -32,7 +32,7 @@ #define I2C_PRODUCT_INFO_ADDR_SIZE 0x2 #define I2C_PRODUCT_INFO_OFFSET 0xC0 -bool is_fru_eeprom_supported(struct amdgpu_device *adev) +static bool is_fru_eeprom_supported(struct amdgpu_device *adev) { /* TODO: Gaming SKUs don't have the FRU EEPROM. * Use this hack to address hangs on modprobe on gaming SKUs @@ -48,7 +48,7 @@ bool is_fru_eeprom_supported(struct amdgpu_device *adev) return false; } -int amdgpu_fru_read_eeprom(struct amdgpu_device *adev, uint32_t addrptr, +static int amdgpu_fru_read_eeprom(struct amdgpu_device *adev, uint32_t addrptr, unsigned char *buff) { int ret, size; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c index 60558497f054..0723dee2958b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c @@ -583,7 +583,7 @@ static void psp_prep_ta_invoke_cmd_buf(struct psp_gfx_cmd_resp *cmd, cmd->cmd.cmd_invoke_cmd.ta_cmd_id = ta_cmd_id; } -int psp_ta_invoke(struct psp_context *psp, +static int psp_ta_invoke(struct psp_context *psp, uint32_t ta_cmd_id, uint32_t session_id) { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 9bfe59b70030..ab8e7c91c645 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -86,7 +86,7 @@ void amdgpu_ras_set_error_query_ready(struct amdgpu_device *adev, bool ready) amdgpu_ras_get_context(adev)->error_query_ready = ready; } -bool amdgpu_ras_get_error_query_ready(struct amdgpu_device *adev) +static bool amdgpu_ras_get_error_query_ready(struct amdgpu_device *adev) { if (adev && amdgpu_ras_get_context(adev)) return amdgpu_ras_get_context(adev)->error_query_ready; @@ -505,7 +505,7 @@ struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev, } /* obj end */ -void amdgpu_ras_parse_status_code(struct amdgpu_device* adev, +static void amdgpu_ras_parse_status_code(struct amdgpu_device *adev, const char* invoke_type, const char* block_name, enum ta_ras_status ret) @@ -815,7 +815,7 @@ int amdgpu_ras_error_query(struct amdgpu_device *adev, } /* Trigger XGMI/WAFL error */ -int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev, +static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev, struct ta_ras_trigger_error_input *block_info) { int ret; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index 48298b66a2a7..2801ea8954b4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -1112,7 +1112,7 @@ static void amdgpu_ttm_tt_unpin_userptr(struct ttm_tt *ttm) #endif } -int amdgpu_ttm_gart_bind(struct amdgpu_device *adev, +static int amdgpu_ttm_gart_bind(struct amdgpu_device *adev, struct ttm_buffer_object *tbo, uint64_t flags) { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c index 0891f27ba166..da233a9e429d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c @@ -515,12 +515,12 @@ void amdgpu_detect_virtualization(struct amdgpu_device *adev) } } -bool amdgpu_virt_access_debugfs_is_mmio(struct amdgpu_device *adev) +static bool amdgpu_virt_access_debugfs_is_mmio(struct amdgpu_device *adev) { return amdgpu_sriov_is_debug(adev) ? true : false; } -bool amdgpu_virt_access_debugfs_is_kiq(struct amdgpu_device *adev) +static bool amdgpu_virt_access_debugfs_is_kiq(struct amdgpu_device *adev) { return amdgpu_sriov_is_normal(adev) ? true : false; } diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c index 0cc011f9190d..4aec76049a60 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c @@ -3039,7 +3039,7 @@ static void gfx_v7_0_mqd_init(struct amdgpu_device *adev, mqd->cp_hqd_active = 1; } -int gfx_v7_0_mqd_commit(struct amdgpu_device *adev, struct cik_mqd *mqd) +static int gfx_v7_0_mqd_commit(struct amdgpu_device *adev, struct cik_mqd *mqd) { uint32_t tmp; uint32_t mqd_reg; @@ -5209,7 +5209,7 @@ static void gfx_v7_0_get_cu_info(struct amdgpu_device *adev) cu_info->lds_size = 64; } -const struct amdgpu_ip_block_version gfx_v7_0_ip_block = +static const struct amdgpu_ip_block_version gfx_v7_0_ip_block = { .type = AMD_IP_BLOCK_TYPE_GFX, .major = 7, diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c index 1d4128227ffd..efb759b62d21 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c @@ -4589,7 +4589,7 @@ static int gfx_v8_0_mqd_init(struct amdgpu_ring *ring) return 0; } -int gfx_v8_0_mqd_commit(struct amdgpu_device *adev, +static int gfx_v8_0_mqd_commit(struct amdgpu_device *adev, struct vi_mqd *mqd) { uint32_t mqd_reg; diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index c3e59b765268..99ffc3e1fddc 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -722,7 +722,7 @@ static const u32 GFX_RLC_SRM_INDEX_CNTL_DATA_OFFSETS[] = mmRLC_SRM_INDEX_CNTL_DATA_7 - mmRLC_SRM_INDEX_CNTL_DATA_0, }; -void gfx_v9_0_rlcg_wreg(struct amdgpu_device *adev, u32 offset, u32 v) +static void gfx_v9_0_rlcg_wreg(struct amdgpu_device *adev, u32 offset, u32 v) { static void *scratch_reg0; static void *scratch_reg1; -- cgit v1.2.3 From bb5c7235eaafb4e2f957e9f0f71a187db5cf525a Mon Sep 17 00:00:00 2001 From: Wenhui Sheng Date: Mon, 13 Jul 2020 15:14:30 +0800 Subject: drm/amdgpu: RAS emergency restart logic refine If we are in RAS triggered situation and BACO isn't support, emergency restart is needed, and this code is only needed for some specific cases(vega20 with given smu fw version). After we add smu mode1 reset for sienna cichlid, we need to share AMD_RESET_METHOD_MODE1 with psp mode1 reset, so in amdgpu_device_gpu_recover, we need differentiate which mode1 reset we are using, then decide if it's a full reset and then decide if emergency restart is needed, the logic will become much more complex. After discussion with Hawking, move emergency restart logic to an independent function. Signed-off-by: Likun Gao Signed-off-by: Wenhui Sheng Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 23 ++++++++++++----------- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 11 +++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 1 + 3 files changed, 24 insertions(+), 11 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 81ca92127c00..3bf4ca2c5b25 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4245,18 +4245,19 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, struct amdgpu_hive_info *hive = NULL; struct amdgpu_device *tmp_adev = NULL; int i, r = 0; - bool in_ras_intr = amdgpu_ras_intr_triggered(); - bool use_baco = - (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) ? - true : false; + bool need_emergency_restart = false; bool audio_suspended = false; + /** + * Special case: RAS triggered and full reset isn't supported + */ + need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); + /* * Flush RAM to disk so that after reboot * the user can read log and see why the system rebooted. */ - if (in_ras_intr && !use_baco && amdgpu_ras_get_context(adev)->reboot) { - + if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) { DRM_WARN("Emergency reboot."); ksys_sync_helper(); @@ -4264,7 +4265,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, } dev_info(adev->dev, "GPU %s begin!\n", - (in_ras_intr && !use_baco) ? "jobs stop":"reset"); + need_emergency_restart ? "jobs stop":"reset"); /* * Here we trylock to avoid chain of resets executing from @@ -4336,7 +4337,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, amdgpu_fbdev_set_suspend(tmp_adev, 1); /* disable ras on ALL IPs */ - if (!(in_ras_intr && !use_baco) && + if (!need_emergency_restart && amdgpu_device_ip_need_full_reset(tmp_adev)) amdgpu_ras_suspend(tmp_adev); @@ -4348,12 +4349,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, drm_sched_stop(&ring->sched, job ? &job->base : NULL); - if (in_ras_intr && !use_baco) + if (need_emergency_restart) amdgpu_job_stop_all_jobs_on_sched(&ring->sched); } } - if (in_ras_intr && !use_baco) + if (need_emergency_restart) goto skip_sched_resume; /* @@ -4430,7 +4431,7 @@ skip_hw_reset: skip_sched_resume: list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { /*unlock kfd: SRIOV would do it separately */ - if (!(in_ras_intr && !use_baco) && !amdgpu_sriov_vf(tmp_adev)) + if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) amdgpu_amdkfd_post_reset(tmp_adev); if (audio_suspended) amdgpu_device_resume_display_audio(tmp_adev); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index ab8e7c91c645..e10f02ed3f65 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2131,3 +2131,14 @@ void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev) amdgpu_ras_reset_gpu(adev); } } + +bool amdgpu_ras_need_emergency_restart(struct amdgpu_device *adev) +{ + if (adev->asic_type == CHIP_VEGA20 && + adev->pm.fw_version <= 0x283400) { + return !(amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) && + amdgpu_ras_intr_triggered(); + } + + return false; +} diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index e7df5d8429f8..b2667342cf67 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -633,4 +633,5 @@ void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev); void amdgpu_ras_set_error_query_ready(struct amdgpu_device *adev, bool ready); +bool amdgpu_ras_need_emergency_restart(struct amdgpu_device *adev); #endif -- cgit v1.2.3 From ee10e06eb00c3b371fa17a13ee2adaee4254dd54 Mon Sep 17 00:00:00 2001 From: Guchun Chen Date: Mon, 20 Jul 2020 11:11:13 +0800 Subject: drm/amdgpu: add printing after executing page reservation to eeprom This will tell users if the faulty page has been written to external eeprom device in dmesg log. Signed-off-by: Guchun Chen Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index e10f02ed3f65..bcce4c0be462 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1618,7 +1618,7 @@ static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev) data = con->eh_data; save_count = data->count - control->num_recs; /* only new entries are saved */ - if (save_count > 0) + if (save_count > 0) { if (amdgpu_ras_eeprom_process_recods(control, &data->bps[control->num_recs], true, @@ -1627,6 +1627,9 @@ static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev) return -EIO; } + dev_info(adev->dev, "Saved %d pages to EEPROM table.\n", save_count); + } + return 0; } -- cgit v1.2.3