From b293e891b05701fc89fa2b20bba377513ae92021 Mon Sep 17 00:00:00 2001 From: Hawking Zhang Date: Fri, 30 Aug 2019 13:29:18 +0800 Subject: drm/amdgpu: add helper function to do common ras_late_init/fini (v3) In late_init for ras, the helper function will be used to 1). disable ras feature if the IP block is masked as disabled 2). send enable feature command if the ip block was masked as enabled 3). create debugfs/sysfs node per IP block 4). register interrupt handler v2: check ih_info.cb to decide add interrupt handler or not v3: add ras_late_fini for cleanup all the ras fs node and remove interrupt handler Signed-off-by: Hawking Zhang Reviewed-by: Alex Deucher Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 6c76bb2a6843..66b71525446e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -566,6 +566,13 @@ amdgpu_ras_error_to_ta(enum amdgpu_ras_error_type error) { int amdgpu_ras_init(struct amdgpu_device *adev); int amdgpu_ras_fini(struct amdgpu_device *adev); int amdgpu_ras_pre_fini(struct amdgpu_device *adev); +int amdgpu_ras_late_init(struct amdgpu_device *adev, + struct ras_common_if *ras_block, + struct ras_fs_if *fs_info, + struct ras_ih_if *ih_info); +void amdgpu_ras_late_fini(struct amdgpu_device *adev, + struct ras_common_if *ras_block, + struct ras_ih_if *ih_info); int amdgpu_ras_feature_enable(struct amdgpu_device *adev, struct ras_common_if *head, bool enable); -- cgit v1.2.3 From 7c6e68c777f109484559a35b125a773439bbd319 Mon Sep 17 00:00:00 2001 From: Andrey Grodzovsky Date: Fri, 13 Sep 2019 17:40:32 -0500 Subject: drm/amdgpu: Avoid HW GPU reset for RAS. 
Problem: Under certain conditions, when some IP blocks take a RAS error, we can get into a situation where a GPU reset is not possible due to issues in RAS in SMU/PSP. Temporary fix until proper solution in PSP/SMU is ready: When an uncorrectable error happens the DF will unconditionally broadcast error event packets to all its clients/slaves upon receiving a fatal error event and freeze all its outbound queues, and the err_event_athub interrupt will be triggered. In such a case we use this interrupt to issue a GPU reset. The GPU reset code is modified for such a case to avoid HW reset: it only stops schedulers, detaches the fences of all in-progress and not-yet-scheduled jobs, sets an error code on them and signals them. Also reject any new incoming job submissions from user space. All this is done to notify the applications of the problem. v2: Extract amdgpu_amdkfd_pre/post_reset from amdgpu_device_lock/unlock_adev Move amdgpu_job_stop_all_jobs_on_sched to amdgpu_job.c Remove print param from amdgpu_ras_query_error_count v3: Update based on previous bug fixing patch to properly call amdgpu_amdkfd_pre_reset for other XGMI hive members. 
Signed-off-by: Andrey Grodzovsky Acked-by: Felix Kuehling Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 4 ++++ drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 38 ++++++++++++++++++++++-------- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 5 ++++ drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 38 ++++++++++++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_job.h | 3 +++ drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 6 +++++ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 22 +++++++++++++++-- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 10 ++++++++ drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 10 ++++---- drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 24 ++++++++++--------- drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c | 5 ++++ drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c | 32 +++++++++++++------------ 12 files changed, 155 insertions(+), 42 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c index 324919d57c89..f6537476b542 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c @@ -35,6 +35,7 @@ #include "amdgpu_trace.h" #include "amdgpu_gmc.h" #include "amdgpu_gem.h" +#include "amdgpu_ras.h" static int amdgpu_cs_user_fence_chunk(struct amdgpu_cs_parser *p, struct drm_amdgpu_cs_chunk_fence *data, @@ -1290,6 +1291,9 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) bool reserved_buffers = false; int i, r; + if (amdgpu_ras_intr_triggered()) + return -EHWPOISON; + if (!adev->accel_working) return -EBUSY; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index a4d38637cdc6..98ff987ae940 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3736,25 +3736,18 @@ static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, bool trylock) adev->mp1_state = PP_MP1_STATE_NONE; break; } 
- /* Block kfd: SRIOV would do it separately */ - if (!amdgpu_sriov_vf(adev)) - amdgpu_amdkfd_pre_reset(adev); return true; } static void amdgpu_device_unlock_adev(struct amdgpu_device *adev) { - /*unlock kfd: SRIOV would do it separately */ - if (!amdgpu_sriov_vf(adev)) - amdgpu_amdkfd_post_reset(adev); amdgpu_vf_error_trans_all(adev); adev->mp1_state = PP_MP1_STATE_NONE; adev->in_gpu_reset = 0; mutex_unlock(&adev->lock_reset); } - /** * amdgpu_device_gpu_recover - reset the asic and recover scheduler * @@ -3774,11 +3767,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, struct amdgpu_hive_info *hive = NULL; struct amdgpu_device *tmp_adev = NULL; int i, r = 0; + bool in_ras_intr = amdgpu_ras_intr_triggered(); need_full_reset = job_signaled = false; INIT_LIST_HEAD(&device_list); - dev_info(adev->dev, "GPU reset begin!\n"); + dev_info(adev->dev, "GPU %s begin!\n", in_ras_intr ? "jobs stop":"reset"); cancel_delayed_work_sync(&adev->delayed_init_work); @@ -3805,9 +3799,16 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, return 0; } + /* Block kfd: SRIOV would do it separately */ + if (!amdgpu_sriov_vf(adev)) + amdgpu_amdkfd_pre_reset(adev); + /* Build list of devices to reset */ if (adev->gmc.xgmi.num_physical_nodes > 1) { if (!hive) { + /*unlock kfd: SRIOV would do it separately */ + if (!amdgpu_sriov_vf(adev)) + amdgpu_amdkfd_post_reset(adev); amdgpu_device_unlock_adev(adev); return -ENODEV; } @@ -3825,8 +3826,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, /* block all schedulers and reset given job's ring */ list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { - if (tmp_adev != adev) + if (tmp_adev != adev) { amdgpu_device_lock_adev(tmp_adev, false); + if (!amdgpu_sriov_vf(tmp_adev)) + amdgpu_amdkfd_pre_reset(tmp_adev); + } + /* * Mark these ASICs to be reseted as untracked first * And add them back after reset completed @@ -3834,7 +3839,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 
amdgpu_unregister_gpu_instance(tmp_adev); /* disable ras on ALL IPs */ - if (amdgpu_device_ip_need_full_reset(tmp_adev)) + if (!in_ras_intr && amdgpu_device_ip_need_full_reset(tmp_adev)) amdgpu_ras_suspend(tmp_adev); for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { @@ -3844,10 +3849,16 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, continue; drm_sched_stop(&ring->sched, job ? &job->base : NULL); + + if (in_ras_intr) + amdgpu_job_stop_all_jobs_on_sched(&ring->sched); } } + if (in_ras_intr) + goto skip_sched_resume; + /* * Must check guilty signal here since after this point all old * HW fences are force signaled. @@ -3906,6 +3917,7 @@ skip_hw_reset: /* Post ASIC reset for all devs .*/ list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { + for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { struct amdgpu_ring *ring = tmp_adev->rings[i]; @@ -3932,7 +3944,13 @@ skip_hw_reset: } else { dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); } + } +skip_sched_resume: + list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { + /*unlock kfd: SRIOV would do it separately */ + if (!in_ras_intr && !amdgpu_sriov_vf(tmp_adev)) + amdgpu_amdkfd_post_reset(tmp_adev); amdgpu_device_unlock_adev(tmp_adev); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 48a2070e72f2..62fe102ed39e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -42,6 +42,8 @@ #include "amdgpu_amdkfd.h" +#include "amdgpu_ras.h" + /* * KMS wrapper. * - 3.0.0 - initial driver @@ -1098,6 +1100,9 @@ amdgpu_pci_shutdown(struct pci_dev *pdev) struct drm_device *dev = pci_get_drvdata(pdev); struct amdgpu_device *adev = dev->dev_private; + if (amdgpu_ras_intr_triggered()) + return; + /* if we are running in a VM, make sure the device * torn down properly on reboot/shutdown. 
* unfortunately we can't detect certain diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index 9d76e0923a5a..e1bad992e83b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -246,6 +246,44 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job) return fence; } +#define to_drm_sched_job(sched_job) \ + container_of((sched_job), struct drm_sched_job, queue_node) + +void amdgpu_job_stop_all_jobs_on_sched(struct drm_gpu_scheduler *sched) +{ + struct drm_sched_job *s_job; + struct drm_sched_entity *s_entity = NULL; + int i; + + /* Signal all jobs not yet scheduled */ + for (i = DRM_SCHED_PRIORITY_MAX - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) { + struct drm_sched_rq *rq = &sched->sched_rq[i]; + + if (!rq) + continue; + + spin_lock(&rq->lock); + list_for_each_entry(s_entity, &rq->entities, list) { + while ((s_job = to_drm_sched_job(spsc_queue_pop(&s_entity->job_queue)))) { + struct drm_sched_fence *s_fence = s_job->s_fence; + + dma_fence_signal(&s_fence->scheduled); + dma_fence_set_error(&s_fence->finished, -EHWPOISON); + dma_fence_signal(&s_fence->finished); + } + } + spin_unlock(&rq->lock); + } + + /* Signal all jobs already scheduled to HW */ + list_for_each_entry(s_job, &sched->ring_mirror_list, node) { + struct drm_sched_fence *s_fence = s_job->s_fence; + + dma_fence_set_error(&s_fence->finished, -EHWPOISON); + dma_fence_signal(&s_fence->finished); + } +} + const struct drm_sched_backend_ops amdgpu_sched_ops = { .dependency = amdgpu_job_dependency, .run_job = amdgpu_job_run, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h index 51e62504c279..dc7ee9358dcd 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h @@ -76,4 +76,7 @@ int amdgpu_job_submit(struct amdgpu_job *job, struct drm_sched_entity *entity, void *owner, struct dma_fence **f); int 
amdgpu_job_submit_direct(struct amdgpu_job *job, struct amdgpu_ring *ring, struct dma_fence **fence); + +void amdgpu_job_stop_all_jobs_on_sched(struct drm_gpu_scheduler *sched); + #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c index c28dc079a0a1..e42fe034aacd 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c @@ -1004,6 +1004,12 @@ int amdgpu_driver_open_kms(struct drm_device *dev, struct drm_file *file_priv) /* Ensure IB tests are run on ring */ flush_delayed_work(&adev->delayed_init_work); + + if (amdgpu_ras_intr_triggered()) { + DRM_ERROR("RAS Intr triggered, device disabled!!"); + return -EHWPOISON; + } + file_priv->driver_priv = NULL; r = pm_runtime_get_sync(dev->dev); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 0b466d101f53..d7bf8fc10869 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -25,6 +25,8 @@ #include #include #include +#include +#include #include "amdgpu.h" #include "amdgpu_ras.h" @@ -66,6 +68,9 @@ const char *ras_block_string[] = { /* inject address is 52 bits */ #define RAS_UMC_INJECT_ADDR_LIMIT (0x1ULL << 52) + +atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0); + static int amdgpu_ras_reserve_vram(struct amdgpu_device *adev, uint64_t offset, uint64_t size, struct amdgpu_bo **bo_ptr); @@ -190,6 +195,10 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, return 0; } + +static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev, + struct ras_common_if *head); + /** * DOC: AMDGPU RAS debugfs control interface * @@ -629,12 +638,14 @@ int amdgpu_ras_error_query(struct amdgpu_device *adev, info->ue_count = obj->err_data.ue_count; info->ce_count = obj->err_data.ce_count; - if (err_data.ce_count) + if (err_data.ce_count) { dev_info(adev->dev, "%ld correctable errors detected in %s block\n", obj->err_data.ce_count, 
ras_block_str(info->head.block)); - if (err_data.ue_count) + } + if (err_data.ue_count) { dev_info(adev->dev, "%ld uncorrectable errors detected in %s block\n", obj->err_data.ue_count, ras_block_str(info->head.block)); + } return 0; } @@ -1731,3 +1742,10 @@ int amdgpu_ras_fini(struct amdgpu_device *adev) return 0; } + +void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev) +{ + if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) { + DRM_WARN("RAS event of type ERREVENT_ATHUB_INTERRUPT detected! Stopping all GPU jobs.\n"); + } +} diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 66b71525446e..6fda96b29f1f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -606,4 +606,14 @@ int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev, int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev, struct ras_dispatch_if *info); + +extern atomic_t amdgpu_ras_in_intr; + +static inline bool amdgpu_ras_intr_triggered(void) +{ + return !!atomic_read(&amdgpu_ras_in_intr); +} + +void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev); + #endif diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index 6065f363fa85..196a14236445 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -5685,10 +5685,12 @@ static int gfx_v9_0_process_ras_data_cb(struct amdgpu_device *adev, struct amdgpu_iv_entry *entry) { /* TODO ue will trigger an interrupt. 
*/ - kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); - if (adev->gfx.funcs->query_ras_error_count) - adev->gfx.funcs->query_ras_error_count(adev, err_data); - amdgpu_ras_reset_gpu(adev, 0); + if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) { + kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); + if (adev->gfx.funcs->query_ras_error_count) + adev->gfx.funcs->query_ras_error_count(adev, err_data); + amdgpu_ras_reset_gpu(adev, 0); + } return AMDGPU_RAS_SUCCESS; } diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index 6a61e5c5b2ce..f1300d5f4f87 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -247,18 +247,20 @@ static int gmc_v9_0_process_ras_data_cb(struct amdgpu_device *adev, struct ras_err_data *err_data, struct amdgpu_iv_entry *entry) { - kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); - if (adev->umc.funcs->query_ras_error_count) - adev->umc.funcs->query_ras_error_count(adev, err_data); - /* umc query_ras_error_address is also responsible for clearing - * error status - */ - if (adev->umc.funcs->query_ras_error_address) - adev->umc.funcs->query_ras_error_address(adev, err_data); + if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) { + kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); + if (adev->umc.funcs->query_ras_error_count) + adev->umc.funcs->query_ras_error_count(adev, err_data); + /* umc query_ras_error_address is also responsible for clearing + * error status + */ + if (adev->umc.funcs->query_ras_error_address) + adev->umc.funcs->query_ras_error_address(adev, err_data); - /* only uncorrectable error needs gpu reset */ - if (err_data->ue_count) - amdgpu_ras_reset_gpu(adev, 0); + /* only uncorrectable error needs gpu reset */ + if (err_data->ue_count) + amdgpu_ras_reset_gpu(adev, 0); + } return AMDGPU_RAS_SUCCESS; } diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c index 5e784bbd2d7f..27eeab143ad7 100644 --- 
a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c @@ -30,6 +30,7 @@ #include "nbio/nbio_7_4_0_smn.h" #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h" #include +#include "amdgpu_ras.h" #define smnNBIF_MGCG_CTRL_LCLK 0x1013a21c @@ -329,6 +330,8 @@ static void nbio_v7_4_handle_ras_controller_intr_no_bifring(struct amdgpu_device BIF_DOORBELL_INT_CNTL, RAS_CNTLR_INTERRUPT_CLEAR, 1); WREG32_SOC15(NBIO, 0, mmBIF_DOORBELL_INT_CNTL, bif_doorbell_intr_cntl); + + amdgpu_ras_global_ras_isr(adev); } } @@ -344,6 +347,8 @@ static void nbio_v7_4_handle_ras_err_event_athub_intr_no_bifring(struct amdgpu_d BIF_DOORBELL_INT_CNTL, RAS_ATHUB_ERR_EVENT_INTERRUPT_CLEAR, 1); WREG32_SOC15(NBIO, 0, mmBIF_DOORBELL_INT_CNTL, bif_doorbell_intr_cntl); + + amdgpu_ras_global_ras_isr(adev); } } diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c index 4830382ab8f2..4aabb0d9bae5 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c @@ -1979,24 +1979,26 @@ static int sdma_v4_0_process_ras_data_cb(struct amdgpu_device *adev, uint32_t err_source; int instance; - instance = sdma_v4_0_irq_id_to_seq(entry->client_id); - if (instance < 0) - return 0; + if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) { + instance = sdma_v4_0_irq_id_to_seq(entry->client_id); + if (instance < 0) + return 0; - switch (entry->src_id) { - case SDMA0_4_0__SRCID__SDMA_SRAM_ECC: - err_source = 0; - break; - case SDMA0_4_0__SRCID__SDMA_ECC: - err_source = 1; - break; - default: - return 0; - } + switch (entry->src_id) { + case SDMA0_4_0__SRCID__SDMA_SRAM_ECC: + err_source = 0; + break; + case SDMA0_4_0__SRCID__SDMA_ECC: + err_source = 1; + break; + default: + return 0; + } - kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); + kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); - amdgpu_ras_reset_gpu(adev, 0); + amdgpu_ras_reset_gpu(adev, 0); + } return AMDGPU_RAS_SUCCESS; } -- cgit v1.2.3 From 
d5ea093eebf022ec69970107db45dc06318d7e5a Mon Sep 17 00:00:00 2001 From: Andrey Grodzovsky Date: Thu, 22 Aug 2019 15:01:37 -0400 Subject: dmr/amdgpu: Add system auto reboot to RAS. In case of a RAS error, allow the user to configure automatic system reboot through ras_ctrl. This is also part of the temporary workaround for the RAS hang problem. v4: Use latest kernel API for disk sync. Signed-off-by: Andrey Grodzovsky Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 14 ++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 9 ++++++++- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 2 +- 3 files changed, 23 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 98ff987ae940..e89aa2dc5c11 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -65,6 +65,8 @@ #include "amdgpu_ras.h" #include "amdgpu_pmu.h" +#include + MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); @@ -3769,6 +3771,18 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, int i, r = 0; bool in_ras_intr = amdgpu_ras_intr_triggered(); + /* + * Flush RAM to disk so that after reboot + * the user can read log and see why the system rebooted. 
+ */ + if (in_ras_intr && amdgpu_ras_get_context(adev)->reboot) { + + DRM_WARN("Emergency reboot."); + + ksys_sync_helper(); + emergency_restart(); + } + need_full_reset = job_signaled = false; INIT_LIST_HEAD(&device_list); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index d7bf8fc10869..270110db128f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -156,6 +156,8 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, op = 1; else if (sscanf(str, "inject %32s %8s", block_name, err) == 2) op = 2; + else if (sscanf(str, "reboot %32s", block_name) == 1) + op = 3; else if (str[0] && str[1] && str[2] && str[3]) /* ascii string, but commands are not matched. */ return -EINVAL; @@ -289,6 +291,9 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user * /* data.inject.address is offset instead of absolute gpu address */ ret = amdgpu_ras_error_inject(adev, &data.inject); break; + case 3: + amdgpu_ras_get_context(adev)->reboot = true; + break; default: ret = -EINVAL; break; @@ -1746,6 +1751,8 @@ int amdgpu_ras_fini(struct amdgpu_device *adev) void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev) { if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) { - DRM_WARN("RAS event of type ERREVENT_ATHUB_INTERRUPT detected! 
Stopping all GPU jobs.\n"); + DRM_WARN("RAS event of type ERREVENT_ATHUB_INTERRUPT detected!\n"); + + amdgpu_ras_reset_gpu(adev, false); } } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 6fda96b29f1f..f487038ba331 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -334,7 +334,7 @@ struct amdgpu_ras { struct mutex recovery_lock; uint32_t flags; - + bool reboot; struct amdgpu_ras_eeprom_control eeprom_control; }; -- cgit v1.2.3 From 9dc23a6325fc3e99467dff46d53c78264d424bd3 Mon Sep 17 00:00:00 2001 From: Tao Zhou Date: Tue, 13 Aug 2019 10:39:05 +0800 Subject: drm/amdgpu: change ras bps type to eeprom table record structure change bps type from retired page to eeprom table record, prepare for saving umc error records to eeprom Signed-off-by: Tao Zhou Reviewed-by: Guchun Chen Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 59 +++++++++++++++++++++------------ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 11 +++--- 2 files changed, 43 insertions(+), 27 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 270110db128f..d524dec73a0f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1205,14 +1205,14 @@ static int amdgpu_ras_badpages_read(struct amdgpu_device *adev, for (; i < data->count; i++) { (*bps)[i] = (struct ras_badpage){ - .bp = data->bps[i].bp, + .bp = data->bps[i].retired_page, .size = AMDGPU_GPU_PAGE_SIZE, .flags = 0, }; if (data->last_reserved <= i) (*bps)[i].flags = 1; - else if (data->bps[i].bo == NULL) + else if (data->bps_bo[i] == NULL) (*bps)[i].flags = 2; } @@ -1306,30 +1306,40 @@ static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev, { unsigned int old_space = data->count + data->space_left; unsigned int new_space = old_space + pages; - unsigned 
int align_space = ALIGN(new_space, 1024); - void *tmp = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL); - - if (!tmp) + unsigned int align_space = ALIGN(new_space, 512); + void *bps = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL); + struct amdgpu_bo **bps_bo = + kmalloc(align_space * sizeof(*data->bps_bo), GFP_KERNEL); + + if (!bps || !bps_bo) { + kfree(bps); + kfree(bps_bo); return -ENOMEM; + } if (data->bps) { - memcpy(tmp, data->bps, + memcpy(bps, data->bps, data->count * sizeof(*data->bps)); kfree(data->bps); } + if (data->bps_bo) { + memcpy(bps_bo, data->bps_bo, + data->count * sizeof(*data->bps_bo)); + kfree(data->bps_bo); + } - data->bps = tmp; + data->bps = bps; + data->bps_bo = bps_bo; data->space_left += align_space - old_space; return 0; } /* it deal with vram only. */ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev, - unsigned long *bps, int pages) + struct eeprom_table_record *bps, int pages) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_err_handler_data *data; - int i = pages; int ret = 0; if (!con || !con->eh_data || !bps || pages <= 0) @@ -1346,10 +1356,10 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev, goto out; } - while (i--) - data->bps[data->count++].bp = bps[i]; - + memcpy(&data->bps[data->count], bps, pages * sizeof(*data->bps)); + data->count += pages; data->space_left -= pages; + out: mutex_unlock(&con->recovery_lock); @@ -1374,13 +1384,13 @@ int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev) goto out; /* reserve vram at driver post stage. 
*/ for (i = data->last_reserved; i < data->count; i++) { - bp = data->bps[i].bp; + bp = data->bps[i].retired_page; if (amdgpu_ras_reserve_vram(adev, bp << PAGE_SHIFT, PAGE_SIZE, &bo)) DRM_ERROR("RAS ERROR: reserve vram %llx fail\n", bp); - data->bps[i].bo = bo; + data->bps_bo[i] = bo; data->last_reserved = i + 1; } out: @@ -1405,11 +1415,11 @@ static int amdgpu_ras_release_bad_pages(struct amdgpu_device *adev) goto out; for (i = data->last_reserved - 1; i >= 0; i--) { - bo = data->bps[i].bo; + bo = data->bps_bo[i]; amdgpu_ras_release_vram(adev, &bo); - data->bps[i].bo = bo; + data->bps_bo[i] = bo; data->last_reserved = i; } out: @@ -1425,12 +1435,19 @@ static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev) return 0; } +/* + * read error record array in eeprom and reserve enough space for + * storing new bad pages + */ static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev) { - /* TODO - * read the array to eeprom when SMU disabled. - */ - return 0; + struct eeprom_table_record *bps = NULL; + int ret; + + ret = amdgpu_ras_add_bad_pages(adev, bps, + adev->umc.max_ras_err_cnt_per_query); + + return ret; } static int amdgpu_ras_recovery_init(struct amdgpu_device *adev) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index f487038ba331..bc1d45971607 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -351,11 +351,10 @@ struct ras_err_data { }; struct ras_err_handler_data { - /* point to bad pages array */ - struct { - unsigned long bp; - struct amdgpu_bo *bo; - } *bps; + /* point to bad page records array */ + struct eeprom_table_record *bps; + /* point to reserved bo array */ + struct amdgpu_bo **bps_bo; /* the count of entries */ int count; /* the space can place new entries */ @@ -492,7 +491,7 @@ unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev, /* error handling functions */ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev, 
- unsigned long *bps, int pages); + struct eeprom_table_record *bps, int pages); int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev); -- cgit v1.2.3 From 87d2b92f1e9df64a74f7fda0691d4041ba2727f9 Mon Sep 17 00:00:00 2001 From: Tao Zhou Date: Thu, 15 Aug 2019 16:15:08 +0800 Subject: drm/amdgpu: save umc error records save umc error records to ras bad page array v2: add bad pages before gpu reset v3: add NULL check for adev->umc.funcs Signed-off-by: Tao Zhou Signed-off-by: Andrey Grodzovsky Reviewed-by: Guchun Chen Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 2 +- drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 40 +++++++++++++++++++++++++-------- drivers/gpu/drm/amd/amdgpu/umc_v6_1.c | 39 ++++++++++++++++++++++++++------ 3 files changed, 64 insertions(+), 17 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index bc1d45971607..96210e18191e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -347,7 +347,7 @@ struct ras_err_data { unsigned long ue_count; unsigned long ce_count; unsigned long err_addr_cnt; - uint64_t *err_addr; + struct eeprom_table_record *err_addr; }; struct ras_err_handler_data { diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index 2b4eb0b9d9ce..9b06d775d137 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -247,21 +247,43 @@ static int gmc_v9_0_process_ras_data_cb(struct amdgpu_device *adev, struct ras_err_data *err_data, struct amdgpu_iv_entry *entry) { - if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) { - kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); - if (adev->umc.funcs->query_ras_error_count) - adev->umc.funcs->query_ras_error_count(adev, err_data); + if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) + return AMDGPU_RAS_SUCCESS; + + 
kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); + if (adev->umc.funcs && + adev->umc.funcs->query_ras_error_count) + adev->umc.funcs->query_ras_error_count(adev, err_data); + + if (adev->umc.funcs && + adev->umc.funcs->query_ras_error_address && + adev->umc.max_ras_err_cnt_per_query) { + err_data->err_addr = + kcalloc(adev->umc.max_ras_err_cnt_per_query, + sizeof(struct eeprom_table_record), GFP_KERNEL); + /* still call query_ras_error_address to clear error status + * even NOMEM error is encountered + */ + if(!err_data->err_addr) + DRM_WARN("Failed to alloc memory for umc error address record!\n"); + /* umc query_ras_error_address is also responsible for clearing * error status */ - if (adev->umc.funcs->query_ras_error_address) - adev->umc.funcs->query_ras_error_address(adev, err_data); + adev->umc.funcs->query_ras_error_address(adev, err_data); + } + + /* only uncorrectable error needs gpu reset */ + if (err_data->ue_count) { + if (err_data->err_addr_cnt && + amdgpu_ras_add_bad_pages(adev, err_data->err_addr, + err_data->err_addr_cnt)) + DRM_WARN("Failed to add ras bad page!\n"); - /* only uncorrectable error needs gpu reset */ - if (err_data->ue_count) - amdgpu_ras_reset_gpu(adev, 0); + amdgpu_ras_reset_gpu(adev, 0); } + kfree(err_data->err_addr); return AMDGPU_RAS_SUCCESS; } diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c b/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c index 8502e736f721..09e316a22f1a 100644 --- a/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c +++ b/drivers/gpu/drm/amd/amdgpu/umc_v6_1.c @@ -75,6 +75,17 @@ static void umc_v6_1_disable_umc_index_mode(struct amdgpu_device *adev) RSMU_UMC_INDEX_MODE_EN, 0); } +static uint32_t umc_v6_1_get_umc_inst(struct amdgpu_device *adev) +{ + uint32_t rsmu_umc_index; + + rsmu_umc_index = RREG32_SOC15(RSMU, 0, + mmRSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU); + return REG_GET_FIELD(rsmu_umc_index, + RSMU_UMC_INDEX_REGISTER_NBIF_VG20_GPU, + RSMU_UMC_INDEX_INSTANCE); +} + static void umc_v6_1_query_correctable_error_count(struct 
amdgpu_device *adev, uint32_t umc_reg_offset, unsigned long *error_count) @@ -165,7 +176,8 @@ static void umc_v6_1_query_error_address(struct amdgpu_device *adev, uint32_t umc_reg_offset, uint32_t channel_index) { uint32_t lsb, mc_umc_status_addr; - uint64_t mc_umc_status, err_addr; + uint64_t mc_umc_status, err_addr, retired_page; + struct eeprom_table_record *err_rec; mc_umc_status_addr = SOC15_REG_OFFSET(UMC, 0, mmMCA_UMC_UMC0_MCUMC_STATUST0); @@ -177,6 +189,7 @@ static void umc_v6_1_query_error_address(struct amdgpu_device *adev, return; } + err_rec = &err_data->err_addr[err_data->err_addr_cnt]; mc_umc_status = RREG64_UMC(mc_umc_status_addr + umc_reg_offset); /* calculate error address if ue/ce error is detected */ @@ -191,12 +204,24 @@ static void umc_v6_1_query_error_address(struct amdgpu_device *adev, err_addr &= ~((0x1ULL << lsb) - 1); /* translate umc channel address to soc pa, 3 parts are included */ - err_data->err_addr[err_data->err_addr_cnt] = - ADDR_OF_8KB_BLOCK(err_addr) | - ADDR_OF_256B_BLOCK(channel_index) | - OFFSET_IN_256B_BLOCK(err_addr); - - err_data->err_addr_cnt++; + retired_page = ADDR_OF_8KB_BLOCK(err_addr) | + ADDR_OF_256B_BLOCK(channel_index) | + OFFSET_IN_256B_BLOCK(err_addr); + + /* we only save ue error information currently, ce is skipped */ + if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) + == 1) { + err_rec->address = err_addr; + /* page frame address is saved */ + err_rec->retired_page = retired_page >> PAGE_SHIFT; + err_rec->ts = (uint64_t)ktime_get_real_seconds(); + err_rec->err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE; + err_rec->cu = 0; + err_rec->mem_channel = channel_index; + err_rec->mcumc_id = umc_v6_1_get_umc_inst(adev); + + err_data->err_addr_cnt++; + } } /* clear umc status */ -- cgit v1.2.3 From 1a6fc071e1991321d3b6a00e0e7c733a462a4418 Mon Sep 17 00:00:00 2001 From: Tao Zhou Date: Fri, 30 Aug 2019 19:50:39 +0800 Subject: drm/amdgpu: move the call of ras recovery_init and bad page reserve to 
proper place ras recovery_init should be called after ttm init, bad page reserve should be put in front of gpu reset since i2c may be unstable during gpu reset. add cleanup for recovery_init and recovery_fini v2: add more comment and print. remove cancel_work_sync in recovery_init. Signed-off-by: Tao Zhou Reviewed-by: Guchun Chen Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 5 ---- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 39 ++++++++++++++++++++---------- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 5 ++++ drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 12 +++++++++ 4 files changed, 43 insertions(+), 18 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 3ad034aa0e3c..3268291babf8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3630,11 +3630,6 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, break; } } - - list_for_each_entry(tmp_adev, device_list_handle, - gmc.xgmi.head) { - amdgpu_ras_reserve_bad_pages(tmp_adev); - } } } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 53540e067d15..e9bd40ea7ce0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1493,16 +1493,17 @@ out: return 0; } -static int amdgpu_ras_recovery_init(struct amdgpu_device *adev) +int amdgpu_ras_recovery_init(struct amdgpu_device *adev) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_err_handler_data **data = &con->eh_data; int ret; - *data = kmalloc(sizeof(**data), - GFP_KERNEL|__GFP_ZERO); - if (!*data) - return -ENOMEM; + *data = kmalloc(sizeof(**data), GFP_KERNEL | __GFP_ZERO); + if (!*data) { + ret = -ENOMEM; + goto out; + } mutex_init(&con->recovery_lock); INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery); @@ -1511,18 +1512,30 @@ static int 
amdgpu_ras_recovery_init(struct amdgpu_device *adev) ret = amdgpu_ras_eeprom_init(&adev->psp.ras.ras->eeprom_control); if (ret) - return ret; + goto free; if (adev->psp.ras.ras->eeprom_control.num_recs) { ret = amdgpu_ras_load_bad_pages(adev); if (ret) - return ret; + goto free; ret = amdgpu_ras_reserve_bad_pages(adev); if (ret) - return ret; + goto release; } return 0; + +release: + amdgpu_ras_release_bad_pages(adev); +free: + con->eh_data = NULL; + kfree((*data)->bps); + kfree((*data)->bps_bo); + kfree(*data); +out: + DRM_WARN("Failed to initialize ras recovery!\n"); + + return ret; } static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev) @@ -1530,12 +1543,17 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev) struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_err_handler_data *data = con->eh_data; + /* recovery_init failed to init it, fini is useless */ + if (!data) + return 0; + cancel_work_sync(&con->recovery_work); amdgpu_ras_release_bad_pages(adev); mutex_lock(&con->recovery_lock); con->eh_data = NULL; kfree(data->bps); + kfree(data->bps_bo); kfree(data); mutex_unlock(&con->recovery_lock); @@ -1627,9 +1645,6 @@ int amdgpu_ras_init(struct amdgpu_device *adev) return r; } - if (amdgpu_ras_recovery_init(adev)) - goto recovery_out; - amdgpu_ras_mask &= AMDGPU_RAS_BLOCK_MASK; if (amdgpu_ras_fs_init(adev)) @@ -1644,8 +1659,6 @@ int amdgpu_ras_init(struct amdgpu_device *adev) con->hw_supported, con->supported); return 0; fs_out: - amdgpu_ras_recovery_fini(adev); -recovery_out: amdgpu_ras_set_context(adev, NULL); kfree(con); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 96210e18191e..012034d2ae06 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -480,6 +480,7 @@ static inline int amdgpu_ras_is_supported(struct amdgpu_device *adev, return ras && (ras->supported & (1 << block)); } +int amdgpu_ras_recovery_init(struct 
amdgpu_device *adev); int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev, unsigned int block); @@ -500,6 +501,10 @@ static inline int amdgpu_ras_reset_gpu(struct amdgpu_device *adev, { struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); + /* save bad page to eeprom before gpu reset, + * i2c may be unstable in gpu reset + */ + amdgpu_ras_reserve_bad_pages(adev); if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0) schedule_work(&ras->recovery_work); return 0; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index 13b144c8f67d..54e6dacc34a4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -54,6 +54,7 @@ #include "amdgpu_trace.h" #include "amdgpu_amdkfd.h" #include "amdgpu_sdma.h" +#include "amdgpu_ras.h" #include "bif/bif_4_1_d.h" static int amdgpu_map_buffer(struct ttm_buffer_object *bo, @@ -1777,6 +1778,17 @@ int amdgpu_ttm_init(struct amdgpu_device *adev) adev->gmc.visible_vram_size); #endif + /* + * retired pages will be loaded from eeprom and reserved here, + * it should be called after ttm init since new bo may be created, + * recovery_init may fail, but it can free all resources allocated by + * itself and its failure should not stop amdgpu init process. + * + * Note: theoretically, this should be called before all vram allocations + * to protect retired page from abusing + */ + amdgpu_ras_recovery_init(adev); + /* *The reserved vram for firmware must be pinned to the specified *place on the VRAM, so reserve it early. -- cgit v1.2.3 From 708901a6664fdc8a39d7946a3995d46eca4fb3e9 Mon Sep 17 00:00:00 2001 From: Andrey Grodzovsky Date: Tue, 10 Sep 2019 15:34:16 -0400 Subject: drm/amdgpu: Fix mutex lock from atomic context. Problem: amdgpu_ras_reserve_bad_pages was moved to amdgpu_ras_reset_gpu because writing to EEPROM during ASIC reset was unstable. 
But for ERREVENT_ATHUB_INTERRUPT amdgpu_ras_reset_gpu is called directly from ISR context and so locking is not allowed. Also it's irrelevant for this particular interrupt as this is a generic RAS interrupt and not memory errors specific. Fix: Avoid calling amdgpu_ras_reserve_bad_pages if not in task context. Signed-off-by: Andrey Grodzovsky Reviewed-by: Tao Zhou Reviewed-by: Guchun Chen Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 012034d2ae06..dd5da3c6327e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -504,7 +504,9 @@ static inline int amdgpu_ras_reset_gpu(struct amdgpu_device *adev, /* save bad page to eeprom before gpu reset, * i2c may be unstable in gpu reset */ - amdgpu_ras_reserve_bad_pages(adev); + if (in_task()) + amdgpu_ras_reserve_bad_pages(adev); + if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0) schedule_work(&ras->recovery_work); return 0; -- cgit v1.2.3 From 012dd14d1de63b7443a8cec4ad9046c85a7184a5 Mon Sep 17 00:00:00 2001 From: Guchun Chen Date: Mon, 16 Sep 2019 13:42:46 +0800 Subject: drm/amdgpu: fix ras ctrl debugfs node leak MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use debugfs_remove_recursive to remove the whole debugfs directory instead of removing the node one by one.
Signed-off-by: Guchun Chen Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 12 +++++------- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 2 -- 2 files changed, 5 insertions(+), 9 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index faf6863ca785..daf9ac0b711f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -980,10 +980,10 @@ static void amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev) struct drm_minor *minor = adev->ddev->primary; con->dir = debugfs_create_dir("ras", minor->debugfs_root); - con->ent = debugfs_create_file("ras_ctrl", S_IWUGO | S_IRUGO, con->dir, - adev, &amdgpu_ras_debugfs_ctrl_ops); - con->ent = debugfs_create_file("ras_eeprom_reset", S_IWUGO | S_IRUGO, con->dir, - adev, &amdgpu_ras_debugfs_eeprom_ops); + debugfs_create_file("ras_ctrl", S_IWUGO | S_IRUGO, con->dir, + adev, &amdgpu_ras_debugfs_ctrl_ops); + debugfs_create_file("ras_eeprom_reset", S_IWUGO | S_IRUGO, con->dir, + adev, &amdgpu_ras_debugfs_eeprom_ops); } void amdgpu_ras_debugfs_create(struct amdgpu_device *adev, @@ -1028,10 +1028,8 @@ static void amdgpu_ras_debugfs_remove_all(struct amdgpu_device *adev) amdgpu_ras_debugfs_remove(adev, &obj->head); } - debugfs_remove(con->ent); - debugfs_remove(con->dir); + debugfs_remove_recursive(con->dir); con->dir = NULL; - con->ent = NULL; } /* debugfs end */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index dd5da3c6327e..ae386c466c0e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -317,8 +317,6 @@ struct amdgpu_ras { struct list_head head; /* debugfs */ struct dentry *dir; - /* debugfs ctrl */ - struct dentry *ent; /* sysfs */ struct device_attribute features_attr; struct bin_attribute badpages_attr; -- cgit v1.2.3 
From f5f06e21e9707552962dadc55fe4412b913223a9 Mon Sep 17 00:00:00 2001 From: Tao Zhou Date: Thu, 12 Sep 2019 13:38:44 +0800 Subject: drm/amdgpu: update parameter of ras_ih_cb change struct ras_err_data *err_data to void *err_data, align with umc code and the callback's declaration in each ras block could pay no attention to the structure type Signed-off-by: Tao Zhou Reviewed-by: Guchun Chen Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 2 +- drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 4 ++-- drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 6 ++++-- drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c | 4 ++-- 4 files changed, 9 insertions(+), 7 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index ae386c466c0e..f80fd3428c98 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -362,7 +362,7 @@ struct ras_err_handler_data { }; typedef int (*ras_ih_cb)(struct amdgpu_device *adev, - struct ras_err_data *err_data, + void *err_data, struct amdgpu_iv_entry *entry); struct ras_ih_data { diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index 5365b30dfaea..bced3130c240 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -4204,7 +4204,7 @@ static int gfx_v9_0_early_init(void *handle) } static int gfx_v9_0_process_ras_data_cb(struct amdgpu_device *adev, - struct ras_err_data *err_data, + void *err_data, struct amdgpu_iv_entry *entry); static int gfx_v9_0_ecc_late_init(void *handle) @@ -5457,7 +5457,7 @@ static int gfx_v9_0_priv_inst_irq(struct amdgpu_device *adev, } static int gfx_v9_0_process_ras_data_cb(struct amdgpu_device *adev, - struct ras_err_data *err_data, + void *err_data, struct amdgpu_iv_entry *entry) { /* TODO ue will trigger an interrupt. 
*/ diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index 7f6536c5e549..8910a18547b0 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -246,16 +246,18 @@ static int gmc_v9_0_ecc_interrupt_state(struct amdgpu_device *adev, } static int gmc_v9_0_process_ras_data_cb(struct amdgpu_device *adev, - struct ras_err_data *err_data, + void *ras_error_status, struct amdgpu_iv_entry *entry) { + struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; + if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) return AMDGPU_RAS_SUCCESS; kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); if (adev->umc.funcs && adev->umc.funcs->query_ras_error_count) - adev->umc.funcs->query_ras_error_count(adev, err_data); + adev->umc.funcs->query_ras_error_count(adev, ras_error_status); if (adev->umc.funcs && adev->umc.funcs->query_ras_error_address && diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c index 265b9c01a763..ae098e2d5dcb 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c @@ -1690,7 +1690,7 @@ static int sdma_v4_0_early_init(void *handle) } static int sdma_v4_0_process_ras_data_cb(struct amdgpu_device *adev, - struct ras_err_data *err_data, + void *err_data, struct amdgpu_iv_entry *entry); static int sdma_v4_0_late_init(void *handle) @@ -1939,7 +1939,7 @@ static int sdma_v4_0_process_trap_irq(struct amdgpu_device *adev, } static int sdma_v4_0_process_ras_data_cb(struct amdgpu_device *adev, - struct ras_err_data *err_data, + void *err_data, struct amdgpu_iv_entry *entry) { uint32_t err_source; -- cgit v1.2.3