From 4e644fffb5809d260bb7adcc49cb87d79b88ba3d Mon Sep 17 00:00:00 2001 From: Hawking Zhang Date: Wed, 5 Jun 2019 14:57:00 +0800 Subject: drm/amdgpu: add ras_controller and err_event_athub interrupt support Ras controller interrupt and Ras err event athub interrupt are two dedicated interrupts for RAS support. Signed-off-by: Hawking Zhang Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 016ea274b955..f7180109bef4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -29,6 +29,7 @@ #include "amdgpu.h" #include "amdgpu_ras.h" #include "amdgpu_atomfirmware.h" +#include "ivsrcid/nbio/irqsrcs_nbif_7_4.h" const char *ras_error_string[] = { "none", @@ -1500,6 +1501,7 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev, int amdgpu_ras_init(struct amdgpu_device *adev) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + int r; if (con) return 0; @@ -1527,6 +1529,18 @@ int amdgpu_ras_init(struct amdgpu_device *adev) /* Might need get this flag from vbios. */ con->flags = RAS_DEFAULT_FLAGS; + if (adev->nbio.funcs->init_ras_controller_interrupt) { + r = adev->nbio.funcs->init_ras_controller_interrupt(adev); + if (r) + return r; + } + + if (adev->nbio.funcs->init_ras_err_event_athub_interrupt) { + r = adev->nbio.funcs->init_ras_err_event_athub_interrupt(adev); + if (r) + return r; + } + if (amdgpu_ras_recovery_init(adev)) goto recovery_out; -- cgit v1.2.3 From b293e891b05701fc89fa2b20bba377513ae92021 Mon Sep 17 00:00:00 2001 From: Hawking Zhang Date: Fri, 30 Aug 2019 13:29:18 +0800 Subject: drm/amdgpu: add helper function to do common ras_late_init/fini (v3) In late_init for ras, the helper function will be used to 1). 
disable ras feature if the IP block is masked as disabled 2). send enable feature command if the ip block was masked as enabled 3). create debugfs/sysfs node per IP block 4). register interrupt handler v2: check ih_info.cb to decide add interrupt handler or not v3: add ras_late_fini for cleanup all the ras fs node and remove interrupt handler Signed-off-by: Hawking Zhang Reviewed-by: Alex Deucher Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 72 +++++++++++++++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 7 ++++ 2 files changed, 79 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index f7180109bef4..0b466d101f53 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1566,6 +1566,78 @@ recovery_out: return -EINVAL; } +/* helper function to handle common stuff in ip late init phase */ +int amdgpu_ras_late_init(struct amdgpu_device *adev, + struct ras_common_if *ras_block, + struct ras_fs_if *fs_info, + struct ras_ih_if *ih_info) +{ + int r; + + /* disable RAS feature per IP block if it is not supported */ + if (!amdgpu_ras_is_supported(adev, ras_block->block)) { + amdgpu_ras_feature_enable_on_boot(adev, ras_block, 0); + return 0; + } + + r = amdgpu_ras_feature_enable_on_boot(adev, ras_block, 1); + if (r) { + if (r == -EAGAIN) { + /* request gpu reset. 
will run again */ + amdgpu_ras_request_reset_on_boot(adev, + ras_block->block); + return 0; + } else if (adev->in_suspend || adev->in_gpu_reset) { + /* in resume phase, if fail to enable ras, + * clean up all ras fs nodes, and disable ras */ + goto cleanup; + } else + return r; + } + + /* in resume phase, no need to create ras fs node */ + if (adev->in_suspend || adev->in_gpu_reset) + return 0; + + if (ih_info->cb) { + r = amdgpu_ras_interrupt_add_handler(adev, ih_info); + if (r) + goto interrupt; + } + + amdgpu_ras_debugfs_create(adev, fs_info); + + r = amdgpu_ras_sysfs_create(adev, fs_info); + if (r) + goto sysfs; + + return 0; +cleanup: + amdgpu_ras_sysfs_remove(adev, ras_block); +sysfs: + amdgpu_ras_debugfs_remove(adev, ras_block); + if (ih_info->cb) + amdgpu_ras_interrupt_remove_handler(adev, ih_info); +interrupt: + amdgpu_ras_feature_enable(adev, ras_block, 0); + return r; +} + +/* helper function to remove ras fs node and interrupt handler */ +void amdgpu_ras_late_fini(struct amdgpu_device *adev, + struct ras_common_if *ras_block, + struct ras_ih_if *ih_info) +{ + if (!ras_block || !ih_info) + return; + + amdgpu_ras_sysfs_remove(adev, ras_block); + amdgpu_ras_debugfs_remove(adev, ras_block); + if (ih_info->cb) + amdgpu_ras_interrupt_remove_handler(adev, ih_info); + amdgpu_ras_feature_enable(adev, ras_block, 0); +} + /* do some init work after IP late init as dependence. * and it runs in resume/gpu reset/booting up cases. 
*/ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 6c76bb2a6843..66b71525446e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -566,6 +566,13 @@ amdgpu_ras_error_to_ta(enum amdgpu_ras_error_type error) { int amdgpu_ras_init(struct amdgpu_device *adev); int amdgpu_ras_fini(struct amdgpu_device *adev); int amdgpu_ras_pre_fini(struct amdgpu_device *adev); +int amdgpu_ras_late_init(struct amdgpu_device *adev, + struct ras_common_if *ras_block, + struct ras_fs_if *fs_info, + struct ras_ih_if *ih_info); +void amdgpu_ras_late_fini(struct amdgpu_device *adev, + struct ras_common_if *ras_block, + struct ras_ih_if *ih_info); int amdgpu_ras_feature_enable(struct amdgpu_device *adev, struct ras_common_if *head, bool enable); -- cgit v1.2.3 From 7c6e68c777f109484559a35b125a773439bbd319 Mon Sep 17 00:00:00 2001 From: Andrey Grodzovsky Date: Fri, 13 Sep 2019 17:40:32 -0500 Subject: drm/amdgpu: Avoid HW GPU reset for RAS. Problem: Under certain conditions, when some IP blocks take a RAS error, we can get into a situation where a GPU reset is not possible due to issues in RAS in SMU/PSP. Temporary fix until proper solution in PSP/SMU is ready: When uncorrectable error happens the DF will unconditionally broadcast error event packets to all its clients/slave upon receiving fatal error event and freeze all its outbound queues, err_event_athub interrupt will be triggered. In such a case we use this interrupt to issue a GPU reset. The GPU reset code is modified for such case to avoid HW reset, only stops schedulers, detaches all in progress and not yet scheduled job's fences, set error code on them and signals. Also reject any new incoming job submissions from user space. All this is done to notify the applications of the problem.
v2: Extract amdgpu_amdkfd_pre/post_reset from amdgpu_device_lock/unlock_adev Move amdgpu_job_stop_all_jobs_on_sched to amdgpu_job.c Remove print param from amdgpu_ras_query_error_count v3: Update based on previous bug fixing patch to properly call amdgpu_amdkfd_pre_reset for other XGMI hive members. Signed-off-by: Andrey Grodzovsky Acked-by: Felix Kuehling Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 4 ++++ drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 38 ++++++++++++++++++++++-------- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 5 ++++ drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 38 ++++++++++++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_job.h | 3 +++ drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 6 +++++ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 22 +++++++++++++++-- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 10 ++++++++ drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 10 ++++---- drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 24 ++++++++++--------- drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c | 5 ++++ drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c | 32 +++++++++++++------------ 12 files changed, 155 insertions(+), 42 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c index 324919d57c89..f6537476b542 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c @@ -35,6 +35,7 @@ #include "amdgpu_trace.h" #include "amdgpu_gmc.h" #include "amdgpu_gem.h" +#include "amdgpu_ras.h" static int amdgpu_cs_user_fence_chunk(struct amdgpu_cs_parser *p, struct drm_amdgpu_cs_chunk_fence *data, @@ -1290,6 +1291,9 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) bool reserved_buffers = false; int i, r; + if (amdgpu_ras_intr_triggered()) + return -EHWPOISON; + if (!adev->accel_working) return -EBUSY; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index a4d38637cdc6..98ff987ae940 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3736,25 +3736,18 @@ static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, bool trylock) adev->mp1_state = PP_MP1_STATE_NONE; break; } - /* Block kfd: SRIOV would do it separately */ - if (!amdgpu_sriov_vf(adev)) - amdgpu_amdkfd_pre_reset(adev); return true; } static void amdgpu_device_unlock_adev(struct amdgpu_device *adev) { - /*unlock kfd: SRIOV would do it separately */ - if (!amdgpu_sriov_vf(adev)) - amdgpu_amdkfd_post_reset(adev); amdgpu_vf_error_trans_all(adev); adev->mp1_state = PP_MP1_STATE_NONE; adev->in_gpu_reset = 0; mutex_unlock(&adev->lock_reset); } - /** * amdgpu_device_gpu_recover - reset the asic and recover scheduler * @@ -3774,11 +3767,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, struct amdgpu_hive_info *hive = NULL; struct amdgpu_device *tmp_adev = NULL; int i, r = 0; + bool in_ras_intr = amdgpu_ras_intr_triggered(); need_full_reset = job_signaled = false; INIT_LIST_HEAD(&device_list); - dev_info(adev->dev, "GPU reset begin!\n"); + dev_info(adev->dev, "GPU %s begin!\n", in_ras_intr ? 
"jobs stop":"reset"); cancel_delayed_work_sync(&adev->delayed_init_work); @@ -3805,9 +3799,16 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, return 0; } + /* Block kfd: SRIOV would do it separately */ + if (!amdgpu_sriov_vf(adev)) + amdgpu_amdkfd_pre_reset(adev); + /* Build list of devices to reset */ if (adev->gmc.xgmi.num_physical_nodes > 1) { if (!hive) { + /*unlock kfd: SRIOV would do it separately */ + if (!amdgpu_sriov_vf(adev)) + amdgpu_amdkfd_post_reset(adev); amdgpu_device_unlock_adev(adev); return -ENODEV; } @@ -3825,8 +3826,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, /* block all schedulers and reset given job's ring */ list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { - if (tmp_adev != adev) + if (tmp_adev != adev) { amdgpu_device_lock_adev(tmp_adev, false); + if (!amdgpu_sriov_vf(tmp_adev)) + amdgpu_amdkfd_pre_reset(tmp_adev); + } + /* * Mark these ASICs to be reseted as untracked first * And add them back after reset completed @@ -3834,7 +3839,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, amdgpu_unregister_gpu_instance(tmp_adev); /* disable ras on ALL IPs */ - if (amdgpu_device_ip_need_full_reset(tmp_adev)) + if (!in_ras_intr && amdgpu_device_ip_need_full_reset(tmp_adev)) amdgpu_ras_suspend(tmp_adev); for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { @@ -3844,10 +3849,16 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, continue; drm_sched_stop(&ring->sched, job ? &job->base : NULL); + + if (in_ras_intr) + amdgpu_job_stop_all_jobs_on_sched(&ring->sched); } } + if (in_ras_intr) + goto skip_sched_resume; + /* * Must check guilty signal here since after this point all old * HW fences are force signaled. 
@@ -3906,6 +3917,7 @@ skip_hw_reset: /* Post ASIC reset for all devs .*/ list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { + for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { struct amdgpu_ring *ring = tmp_adev->rings[i]; @@ -3932,7 +3944,13 @@ skip_hw_reset: } else { dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); } + } +skip_sched_resume: + list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { + /*unlock kfd: SRIOV would do it separately */ + if (!in_ras_intr && !amdgpu_sriov_vf(tmp_adev)) + amdgpu_amdkfd_post_reset(tmp_adev); amdgpu_device_unlock_adev(tmp_adev); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 48a2070e72f2..62fe102ed39e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -42,6 +42,8 @@ #include "amdgpu_amdkfd.h" +#include "amdgpu_ras.h" + /* * KMS wrapper. * - 3.0.0 - initial driver @@ -1098,6 +1100,9 @@ amdgpu_pci_shutdown(struct pci_dev *pdev) struct drm_device *dev = pci_get_drvdata(pdev); struct amdgpu_device *adev = dev->dev_private; + if (amdgpu_ras_intr_triggered()) + return; + /* if we are running in a VM, make sure the device * torn down properly on reboot/shutdown. 
* unfortunately we can't detect certain diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index 9d76e0923a5a..e1bad992e83b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -246,6 +246,44 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job) return fence; } +#define to_drm_sched_job(sched_job) \ + container_of((sched_job), struct drm_sched_job, queue_node) + +void amdgpu_job_stop_all_jobs_on_sched(struct drm_gpu_scheduler *sched) +{ + struct drm_sched_job *s_job; + struct drm_sched_entity *s_entity = NULL; + int i; + + /* Signal all jobs not yet scheduled */ + for (i = DRM_SCHED_PRIORITY_MAX - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) { + struct drm_sched_rq *rq = &sched->sched_rq[i]; + + if (!rq) + continue; + + spin_lock(&rq->lock); + list_for_each_entry(s_entity, &rq->entities, list) { + while ((s_job = to_drm_sched_job(spsc_queue_pop(&s_entity->job_queue)))) { + struct drm_sched_fence *s_fence = s_job->s_fence; + + dma_fence_signal(&s_fence->scheduled); + dma_fence_set_error(&s_fence->finished, -EHWPOISON); + dma_fence_signal(&s_fence->finished); + } + } + spin_unlock(&rq->lock); + } + + /* Signal all jobs already scheduled to HW */ + list_for_each_entry(s_job, &sched->ring_mirror_list, node) { + struct drm_sched_fence *s_fence = s_job->s_fence; + + dma_fence_set_error(&s_fence->finished, -EHWPOISON); + dma_fence_signal(&s_fence->finished); + } +} + const struct drm_sched_backend_ops amdgpu_sched_ops = { .dependency = amdgpu_job_dependency, .run_job = amdgpu_job_run, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h index 51e62504c279..dc7ee9358dcd 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h @@ -76,4 +76,7 @@ int amdgpu_job_submit(struct amdgpu_job *job, struct drm_sched_entity *entity, void *owner, struct dma_fence **f); int 
amdgpu_job_submit_direct(struct amdgpu_job *job, struct amdgpu_ring *ring, struct dma_fence **fence); + +void amdgpu_job_stop_all_jobs_on_sched(struct drm_gpu_scheduler *sched); + #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c index c28dc079a0a1..e42fe034aacd 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c @@ -1004,6 +1004,12 @@ int amdgpu_driver_open_kms(struct drm_device *dev, struct drm_file *file_priv) /* Ensure IB tests are run on ring */ flush_delayed_work(&adev->delayed_init_work); + + if (amdgpu_ras_intr_triggered()) { + DRM_ERROR("RAS Intr triggered, device disabled!!"); + return -EHWPOISON; + } + file_priv->driver_priv = NULL; r = pm_runtime_get_sync(dev->dev); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 0b466d101f53..d7bf8fc10869 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -25,6 +25,8 @@ #include #include #include +#include +#include #include "amdgpu.h" #include "amdgpu_ras.h" @@ -66,6 +68,9 @@ const char *ras_block_string[] = { /* inject address is 52 bits */ #define RAS_UMC_INJECT_ADDR_LIMIT (0x1ULL << 52) + +atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0); + static int amdgpu_ras_reserve_vram(struct amdgpu_device *adev, uint64_t offset, uint64_t size, struct amdgpu_bo **bo_ptr); @@ -190,6 +195,10 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, return 0; } + +static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev, + struct ras_common_if *head); + /** * DOC: AMDGPU RAS debugfs control interface * @@ -629,12 +638,14 @@ int amdgpu_ras_error_query(struct amdgpu_device *adev, info->ue_count = obj->err_data.ue_count; info->ce_count = obj->err_data.ce_count; - if (err_data.ce_count) + if (err_data.ce_count) { dev_info(adev->dev, "%ld correctable errors detected in %s block\n", obj->err_data.ce_count, 
ras_block_str(info->head.block)); - if (err_data.ue_count) + } + if (err_data.ue_count) { dev_info(adev->dev, "%ld uncorrectable errors detected in %s block\n", obj->err_data.ue_count, ras_block_str(info->head.block)); + } return 0; } @@ -1731,3 +1742,10 @@ int amdgpu_ras_fini(struct amdgpu_device *adev) return 0; } + +void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev) +{ + if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) { + DRM_WARN("RAS event of type ERREVENT_ATHUB_INTERRUPT detected! Stopping all GPU jobs.\n"); + } +} diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 66b71525446e..6fda96b29f1f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -606,4 +606,14 @@ int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev, int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev, struct ras_dispatch_if *info); + +extern atomic_t amdgpu_ras_in_intr; + +static inline bool amdgpu_ras_intr_triggered(void) +{ + return !!atomic_read(&amdgpu_ras_in_intr); +} + +void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev); + #endif diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index 6065f363fa85..196a14236445 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -5685,10 +5685,12 @@ static int gfx_v9_0_process_ras_data_cb(struct amdgpu_device *adev, struct amdgpu_iv_entry *entry) { /* TODO ue will trigger an interrupt. 
*/ - kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); - if (adev->gfx.funcs->query_ras_error_count) - adev->gfx.funcs->query_ras_error_count(adev, err_data); - amdgpu_ras_reset_gpu(adev, 0); + if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) { + kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); + if (adev->gfx.funcs->query_ras_error_count) + adev->gfx.funcs->query_ras_error_count(adev, err_data); + amdgpu_ras_reset_gpu(adev, 0); + } return AMDGPU_RAS_SUCCESS; } diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index 6a61e5c5b2ce..f1300d5f4f87 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -247,18 +247,20 @@ static int gmc_v9_0_process_ras_data_cb(struct amdgpu_device *adev, struct ras_err_data *err_data, struct amdgpu_iv_entry *entry) { - kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); - if (adev->umc.funcs->query_ras_error_count) - adev->umc.funcs->query_ras_error_count(adev, err_data); - /* umc query_ras_error_address is also responsible for clearing - * error status - */ - if (adev->umc.funcs->query_ras_error_address) - adev->umc.funcs->query_ras_error_address(adev, err_data); + if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) { + kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); + if (adev->umc.funcs->query_ras_error_count) + adev->umc.funcs->query_ras_error_count(adev, err_data); + /* umc query_ras_error_address is also responsible for clearing + * error status + */ + if (adev->umc.funcs->query_ras_error_address) + adev->umc.funcs->query_ras_error_address(adev, err_data); - /* only uncorrectable error needs gpu reset */ - if (err_data->ue_count) - amdgpu_ras_reset_gpu(adev, 0); + /* only uncorrectable error needs gpu reset */ + if (err_data->ue_count) + amdgpu_ras_reset_gpu(adev, 0); + } return AMDGPU_RAS_SUCCESS; } diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c index 5e784bbd2d7f..27eeab143ad7 100644 --- 
a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c @@ -30,6 +30,7 @@ #include "nbio/nbio_7_4_0_smn.h" #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h" #include +#include "amdgpu_ras.h" #define smnNBIF_MGCG_CTRL_LCLK 0x1013a21c @@ -329,6 +330,8 @@ static void nbio_v7_4_handle_ras_controller_intr_no_bifring(struct amdgpu_device BIF_DOORBELL_INT_CNTL, RAS_CNTLR_INTERRUPT_CLEAR, 1); WREG32_SOC15(NBIO, 0, mmBIF_DOORBELL_INT_CNTL, bif_doorbell_intr_cntl); + + amdgpu_ras_global_ras_isr(adev); } } @@ -344,6 +347,8 @@ static void nbio_v7_4_handle_ras_err_event_athub_intr_no_bifring(struct amdgpu_d BIF_DOORBELL_INT_CNTL, RAS_ATHUB_ERR_EVENT_INTERRUPT_CLEAR, 1); WREG32_SOC15(NBIO, 0, mmBIF_DOORBELL_INT_CNTL, bif_doorbell_intr_cntl); + + amdgpu_ras_global_ras_isr(adev); } } diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c index 4830382ab8f2..4aabb0d9bae5 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c @@ -1979,24 +1979,26 @@ static int sdma_v4_0_process_ras_data_cb(struct amdgpu_device *adev, uint32_t err_source; int instance; - instance = sdma_v4_0_irq_id_to_seq(entry->client_id); - if (instance < 0) - return 0; + if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) { + instance = sdma_v4_0_irq_id_to_seq(entry->client_id); + if (instance < 0) + return 0; - switch (entry->src_id) { - case SDMA0_4_0__SRCID__SDMA_SRAM_ECC: - err_source = 0; - break; - case SDMA0_4_0__SRCID__SDMA_ECC: - err_source = 1; - break; - default: - return 0; - } + switch (entry->src_id) { + case SDMA0_4_0__SRCID__SDMA_SRAM_ECC: + err_source = 0; + break; + case SDMA0_4_0__SRCID__SDMA_ECC: + err_source = 1; + break; + default: + return 0; + } - kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); + kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); - amdgpu_ras_reset_gpu(adev, 0); + amdgpu_ras_reset_gpu(adev, 0); + } return AMDGPU_RAS_SUCCESS; } -- cgit v1.2.3 From 
d5ea093eebf022ec69970107db45dc06318d7e5a Mon Sep 17 00:00:00 2001 From: Andrey Grodzovsky Date: Thu, 22 Aug 2019 15:01:37 -0400 Subject: dmr/amdgpu: Add system auto reboot to RAS. In case of a RAS error, allow the user to configure automatic system reboot through ras_ctrl. This is also part of the temporary workaround for the RAS hang problem. v4: Use latest kernel API for disk sync. Signed-off-by: Andrey Grodzovsky Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 14 ++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 9 ++++++++- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 2 +- 3 files changed, 23 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 98ff987ae940..e89aa2dc5c11 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -65,6 +65,8 @@ #include "amdgpu_ras.h" #include "amdgpu_pmu.h" +#include + MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); @@ -3769,6 +3771,18 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, int i, r = 0; bool in_ras_intr = amdgpu_ras_intr_triggered(); + /* + * Flush RAM to disk so that after reboot + * the user can read log and see why the system rebooted.
+ */ + if (in_ras_intr && amdgpu_ras_get_context(adev)->reboot) { + + DRM_WARN("Emergency reboot."); + + ksys_sync_helper(); + emergency_restart(); + } + need_full_reset = job_signaled = false; INIT_LIST_HEAD(&device_list); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index d7bf8fc10869..270110db128f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -156,6 +156,8 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, op = 1; else if (sscanf(str, "inject %32s %8s", block_name, err) == 2) op = 2; + else if (sscanf(str, "reboot %32s", block_name) == 1) + op = 3; else if (str[0] && str[1] && str[2] && str[3]) /* ascii string, but commands are not matched. */ return -EINVAL; @@ -289,6 +291,9 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user * /* data.inject.address is offset instead of absolute gpu address */ ret = amdgpu_ras_error_inject(adev, &data.inject); break; + case 3: + amdgpu_ras_get_context(adev)->reboot = true; + break; default: ret = -EINVAL; break; @@ -1746,6 +1751,8 @@ int amdgpu_ras_fini(struct amdgpu_device *adev) void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev) { if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) { - DRM_WARN("RAS event of type ERREVENT_ATHUB_INTERRUPT detected! 
Stopping all GPU jobs.\n"); + DRM_WARN("RAS event of type ERREVENT_ATHUB_INTERRUPT detected!\n"); + + amdgpu_ras_reset_gpu(adev, false); } } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 6fda96b29f1f..f487038ba331 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -334,7 +334,7 @@ struct amdgpu_ras { struct mutex recovery_lock; uint32_t flags; - + bool reboot; struct amdgpu_ras_eeprom_control eeprom_control; }; -- cgit v1.2.3 From 9dc23a6325fc3e99467dff46d53c78264d424bd3 Mon Sep 17 00:00:00 2001 From: Tao Zhou Date: Tue, 13 Aug 2019 10:39:05 +0800 Subject: drm/amdgpu: change ras bps type to eeprom table record structure change bps type from retired page to eeprom table record, prepare for saving umc error records to eeprom Signed-off-by: Tao Zhou Reviewed-by: Guchun Chen Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 59 +++++++++++++++++++++------------ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 11 +++--- 2 files changed, 43 insertions(+), 27 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 270110db128f..d524dec73a0f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1205,14 +1205,14 @@ static int amdgpu_ras_badpages_read(struct amdgpu_device *adev, for (; i < data->count; i++) { (*bps)[i] = (struct ras_badpage){ - .bp = data->bps[i].bp, + .bp = data->bps[i].retired_page, .size = AMDGPU_GPU_PAGE_SIZE, .flags = 0, }; if (data->last_reserved <= i) (*bps)[i].flags = 1; - else if (data->bps[i].bo == NULL) + else if (data->bps_bo[i] == NULL) (*bps)[i].flags = 2; } @@ -1306,30 +1306,40 @@ static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev, { unsigned int old_space = data->count + data->space_left; unsigned int new_space = old_space + pages; - unsigned 
int align_space = ALIGN(new_space, 1024); - void *tmp = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL); - - if (!tmp) + unsigned int align_space = ALIGN(new_space, 512); + void *bps = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL); + struct amdgpu_bo **bps_bo = + kmalloc(align_space * sizeof(*data->bps_bo), GFP_KERNEL); + + if (!bps || !bps_bo) { + kfree(bps); + kfree(bps_bo); return -ENOMEM; + } if (data->bps) { - memcpy(tmp, data->bps, + memcpy(bps, data->bps, data->count * sizeof(*data->bps)); kfree(data->bps); } + if (data->bps_bo) { + memcpy(bps_bo, data->bps_bo, + data->count * sizeof(*data->bps_bo)); + kfree(data->bps_bo); + } - data->bps = tmp; + data->bps = bps; + data->bps_bo = bps_bo; data->space_left += align_space - old_space; return 0; } /* it deal with vram only. */ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev, - unsigned long *bps, int pages) + struct eeprom_table_record *bps, int pages) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_err_handler_data *data; - int i = pages; int ret = 0; if (!con || !con->eh_data || !bps || pages <= 0) @@ -1346,10 +1356,10 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev, goto out; } - while (i--) - data->bps[data->count++].bp = bps[i]; - + memcpy(&data->bps[data->count], bps, pages * sizeof(*data->bps)); + data->count += pages; data->space_left -= pages; + out: mutex_unlock(&con->recovery_lock); @@ -1374,13 +1384,13 @@ int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev) goto out; /* reserve vram at driver post stage. 
*/ for (i = data->last_reserved; i < data->count; i++) { - bp = data->bps[i].bp; + bp = data->bps[i].retired_page; if (amdgpu_ras_reserve_vram(adev, bp << PAGE_SHIFT, PAGE_SIZE, &bo)) DRM_ERROR("RAS ERROR: reserve vram %llx fail\n", bp); - data->bps[i].bo = bo; + data->bps_bo[i] = bo; data->last_reserved = i + 1; } out: @@ -1405,11 +1415,11 @@ static int amdgpu_ras_release_bad_pages(struct amdgpu_device *adev) goto out; for (i = data->last_reserved - 1; i >= 0; i--) { - bo = data->bps[i].bo; + bo = data->bps_bo[i]; amdgpu_ras_release_vram(adev, &bo); - data->bps[i].bo = bo; + data->bps_bo[i] = bo; data->last_reserved = i; } out: @@ -1425,12 +1435,19 @@ static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev) return 0; } +/* + * read error record array in eeprom and reserve enough space for + * storing new bad pages + */ static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev) { - /* TODO - * read the array to eeprom when SMU disabled. - */ - return 0; + struct eeprom_table_record *bps = NULL; + int ret; + + ret = amdgpu_ras_add_bad_pages(adev, bps, + adev->umc.max_ras_err_cnt_per_query); + + return ret; } static int amdgpu_ras_recovery_init(struct amdgpu_device *adev) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index f487038ba331..bc1d45971607 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -351,11 +351,10 @@ struct ras_err_data { }; struct ras_err_handler_data { - /* point to bad pages array */ - struct { - unsigned long bp; - struct amdgpu_bo *bo; - } *bps; + /* point to bad page records array */ + struct eeprom_table_record *bps; + /* point to reserved bo array */ + struct amdgpu_bo **bps_bo; /* the count of entries */ int count; /* the space can place new entries */ @@ -492,7 +491,7 @@ unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev, /* error handling functions */ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev, 
- unsigned long *bps, int pages); + struct eeprom_table_record *bps, int pages); int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev); -- cgit v1.2.3 From 78ad00c9030cf872621ef4c14c20fbe7d34d2c06 Mon Sep 17 00:00:00 2001 From: Tao Zhou Date: Thu, 15 Aug 2019 14:55:55 +0800 Subject: drm/amdgpu: Hook EEPROM table to RAS support eeprom records load and save for ras, move EEPROM records storing to bad page reserving v2: remove redundant check for con->eh_data Signed-off-by: Tao Zhou Signed-off-by: Andrey Grodzovsky Reviewed-by: Guchun Chen Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 109 ++++++++++++++++++++++++-------- 1 file changed, 81 insertions(+), 28 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index d524dec73a0f..53540e067d15 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1366,6 +1366,69 @@ out: return ret; } +/* + * write error record array to eeprom, the function should be + * protected by recovery_lock + */ +static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev) +{ + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + struct ras_err_handler_data *data; + struct amdgpu_ras_eeprom_control *control = + &adev->psp.ras.ras->eeprom_control; + int save_count; + + if (!con || !con->eh_data) + return 0; + + data = con->eh_data; + save_count = data->count - control->num_recs; + /* only new entries are saved */ + if (save_count > 0) + if (amdgpu_ras_eeprom_process_recods(&con->eeprom_control, + &data->bps[control->num_recs], + true, + save_count)) { + DRM_ERROR("Failed to save EEPROM table data!"); + return -EIO; + } + + return 0; +} + +/* + * read error record array in eeprom and reserve enough space for + * storing new bad pages + */ +static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev) +{ + struct amdgpu_ras_eeprom_control *control = 
+ &adev->psp.ras.ras->eeprom_control; + struct eeprom_table_record *bps = NULL; + int ret = 0; + + /* no bad page record, skip eeprom access */ + if (!control->num_recs) + return ret; + + bps = kcalloc(control->num_recs, sizeof(*bps), GFP_KERNEL); + if (!bps) + return -ENOMEM; + + if (amdgpu_ras_eeprom_process_recods(control, bps, false, + control->num_recs)) { + DRM_ERROR("Failed to load EEPROM table records!"); + ret = -EIO; + goto out; + } + + ret = amdgpu_ras_add_bad_pages(adev, bps, control->num_recs); + +out: + kfree(bps); + return ret; +} + /* called in gpu recovery/init */ int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev) { @@ -1373,7 +1436,7 @@ int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev) struct ras_err_handler_data *data; uint64_t bp; struct amdgpu_bo *bo; - int i; + int i, ret = 0; if (!con || !con->eh_data) return 0; @@ -1393,9 +1456,12 @@ int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev) data->bps_bo[i] = bo; data->last_reserved = i + 1; } + + /* continue to save bad pages to eeprom even reesrve_vram fails */ + ret = amdgpu_ras_save_bad_pages(adev); out: mutex_unlock(&con->recovery_lock); - return 0; + return ret; } /* called when driver unload */ @@ -1427,33 +1493,11 @@ out: return 0; } -static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev) -{ - /* TODO - * write the array to eeprom when SMU disabled. 
- */ - return 0; -} - -/* - * read error record array in eeprom and reserve enough space for - * storing new bad pages - */ -static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev) -{ - struct eeprom_table_record *bps = NULL; - int ret; - - ret = amdgpu_ras_add_bad_pages(adev, bps, - adev->umc.max_ras_err_cnt_per_query); - - return ret; -} - static int amdgpu_ras_recovery_init(struct amdgpu_device *adev) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_err_handler_data **data = &con->eh_data; + int ret; *data = kmalloc(sizeof(**data), GFP_KERNEL|__GFP_ZERO); @@ -1465,8 +1509,18 @@ static int amdgpu_ras_recovery_init(struct amdgpu_device *adev) atomic_set(&con->in_recovery, 0); con->adev = adev; - amdgpu_ras_load_bad_pages(adev); - amdgpu_ras_reserve_bad_pages(adev); + ret = amdgpu_ras_eeprom_init(&adev->psp.ras.ras->eeprom_control); + if (ret) + return ret; + + if (adev->psp.ras.ras->eeprom_control.num_recs) { + ret = amdgpu_ras_load_bad_pages(adev); + if (ret) + return ret; + ret = amdgpu_ras_reserve_bad_pages(adev); + if (ret) + return ret; + } return 0; } @@ -1477,7 +1531,6 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev) struct ras_err_handler_data *data = con->eh_data; cancel_work_sync(&con->recovery_work); - amdgpu_ras_save_bad_pages(adev); amdgpu_ras_release_bad_pages(adev); mutex_lock(&con->recovery_lock); -- cgit v1.2.3 From 1a6fc071e1991321d3b6a00e0e7c733a462a4418 Mon Sep 17 00:00:00 2001 From: Tao Zhou Date: Fri, 30 Aug 2019 19:50:39 +0800 Subject: drm/amdgpu: move the call of ras recovery_init and bad page reserve to proper place ras recovery_init should be called after ttm init, bad page reserve should be put in front of gpu reset since i2c may be unstable during gpu reset. add cleanup for recovery_init and recovery_fini v2: add more comment and print. remove cancel_work_sync in recovery_init. 
Signed-off-by: Tao Zhou Reviewed-by: Guchun Chen Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 5 ---- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 39 ++++++++++++++++++++---------- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 5 ++++ drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 12 +++++++++ 4 files changed, 43 insertions(+), 18 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 3ad034aa0e3c..3268291babf8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3630,11 +3630,6 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, break; } } - - list_for_each_entry(tmp_adev, device_list_handle, - gmc.xgmi.head) { - amdgpu_ras_reserve_bad_pages(tmp_adev); - } } } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 53540e067d15..e9bd40ea7ce0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1493,16 +1493,17 @@ out: return 0; } -static int amdgpu_ras_recovery_init(struct amdgpu_device *adev) +int amdgpu_ras_recovery_init(struct amdgpu_device *adev) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_err_handler_data **data = &con->eh_data; int ret; - *data = kmalloc(sizeof(**data), - GFP_KERNEL|__GFP_ZERO); - if (!*data) - return -ENOMEM; + *data = kmalloc(sizeof(**data), GFP_KERNEL | __GFP_ZERO); + if (!*data) { + ret = -ENOMEM; + goto out; + } mutex_init(&con->recovery_lock); INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery); @@ -1511,18 +1512,30 @@ static int amdgpu_ras_recovery_init(struct amdgpu_device *adev) ret = amdgpu_ras_eeprom_init(&adev->psp.ras.ras->eeprom_control); if (ret) - return ret; + goto free; if (adev->psp.ras.ras->eeprom_control.num_recs) { ret = amdgpu_ras_load_bad_pages(adev); if (ret) - return ret; + goto free; ret = 
amdgpu_ras_reserve_bad_pages(adev); if (ret) - return ret; + goto release; } return 0; + +release: + amdgpu_ras_release_bad_pages(adev); +free: + con->eh_data = NULL; + kfree((*data)->bps); + kfree((*data)->bps_bo); + kfree(*data); +out: + DRM_WARN("Failed to initialize ras recovery!\n"); + + return ret; } static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev) @@ -1530,12 +1543,17 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev) struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_err_handler_data *data = con->eh_data; + /* recovery_init failed to init it, fini is useless */ + if (!data) + return 0; + cancel_work_sync(&con->recovery_work); amdgpu_ras_release_bad_pages(adev); mutex_lock(&con->recovery_lock); con->eh_data = NULL; kfree(data->bps); + kfree(data->bps_bo); kfree(data); mutex_unlock(&con->recovery_lock); @@ -1627,9 +1645,6 @@ int amdgpu_ras_init(struct amdgpu_device *adev) return r; } - if (amdgpu_ras_recovery_init(adev)) - goto recovery_out; - amdgpu_ras_mask &= AMDGPU_RAS_BLOCK_MASK; if (amdgpu_ras_fs_init(adev)) @@ -1644,8 +1659,6 @@ int amdgpu_ras_init(struct amdgpu_device *adev) con->hw_supported, con->supported); return 0; fs_out: - amdgpu_ras_recovery_fini(adev); -recovery_out: amdgpu_ras_set_context(adev, NULL); kfree(con); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 96210e18191e..012034d2ae06 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -480,6 +480,7 @@ static inline int amdgpu_ras_is_supported(struct amdgpu_device *adev, return ras && (ras->supported & (1 << block)); } +int amdgpu_ras_recovery_init(struct amdgpu_device *adev); int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev, unsigned int block); @@ -500,6 +501,10 @@ static inline int amdgpu_ras_reset_gpu(struct amdgpu_device *adev, { struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); + /* save bad page to eeprom before gpu 
reset, + * i2c may be unstable in gpu reset + */ + amdgpu_ras_reserve_bad_pages(adev); if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0) schedule_work(&ras->recovery_work); return 0; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index 13b144c8f67d..54e6dacc34a4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -54,6 +54,7 @@ #include "amdgpu_trace.h" #include "amdgpu_amdkfd.h" #include "amdgpu_sdma.h" +#include "amdgpu_ras.h" #include "bif/bif_4_1_d.h" static int amdgpu_map_buffer(struct ttm_buffer_object *bo, @@ -1777,6 +1778,17 @@ int amdgpu_ttm_init(struct amdgpu_device *adev) adev->gmc.visible_vram_size); #endif + /* + * retired pages will be loaded from eeprom and reserved here, + * it should be called after ttm init since new bo may be created, + * recovery_init may fail, but it can free all resources allocated by + * itself and its failure should not stop amdgpu init process. + * + * Note: theoretically, this should be called before all vram allocations + * to protect retired page from abusing + */ + amdgpu_ras_recovery_init(adev); + /* *The reserved vram for firmware must be pinned to the specified *place on the VRAM, so reserve it early. -- cgit v1.2.3 From 4d1337d2e96715b7882fd75658e54ab59b3f58cd Mon Sep 17 00:00:00 2001 From: Andrey Grodzovsky Date: Fri, 6 Sep 2019 17:23:44 -0400 Subject: drm/amdgpu: Avoid RAS recovery init when no RAS support. Fixes driver load regression on APUs. 
Signed-off-by: Andrey Grodzovsky Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index e9bd40ea7ce0..e461386d697b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1496,9 +1496,14 @@ out: int amdgpu_ras_recovery_init(struct amdgpu_device *adev) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); - struct ras_err_handler_data **data = &con->eh_data; + struct ras_err_handler_data **data; int ret; + if (con) + data = &con->eh_data; + else + return 0; + *data = kmalloc(sizeof(**data), GFP_KERNEL | __GFP_ZERO); if (!*data) { ret = -ENOMEM; -- cgit v1.2.3 From 4930aabe7c4e72cc62eca2a800e72eee17f40430 Mon Sep 17 00:00:00 2001 From: Tao Zhou Date: Thu, 5 Sep 2019 19:25:18 +0800 Subject: drm/amdgpu: move umc ras init to umc block move umc ras init from ras module to umc block, generic ras module should pay less attention to specific ras block. 
Signed-off-by: Tao Zhou Reviewed-by: Guchun Chen Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 4 ---- drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 4 ++++ 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index e461386d697b..3268f5453eb7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1655,10 +1655,6 @@ int amdgpu_ras_init(struct amdgpu_device *adev) if (amdgpu_ras_fs_init(adev)) goto fs_out; - /* ras init for each ras block */ - if (adev->umc.funcs->ras_init) - adev->umc.funcs->ras_init(adev); - DRM_INFO("RAS INFO: ras initialized successfully, " "hardware ability[%x] ras_mask[%x]\n", con->hw_supported, con->supported); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c index c8de127097ab..5683c51710aa 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c @@ -62,6 +62,10 @@ int amdgpu_umc_ras_late_init(struct amdgpu_device *adev, void *ras_ih_info) goto free; } + /* ras init of specific umc version */ + if (adev->umc.funcs && adev->umc.funcs->ras_init) + adev->umc.funcs->ras_init(adev); + return 0; late_fini: -- cgit v1.2.3 From 084fe13b2c232c8f3e6e926597dd6daf1fda4d1b Mon Sep 17 00:00:00 2001 From: Andrey Grodzovsky Date: Mon, 9 Sep 2019 16:00:56 -0400 Subject: drm/amdgpu: Allow to reset to EERPOM table. The table grows quickly during debug/development effort when multiple RAS errors are injected. Allow to avoid this by setting table header back to empty if needed. v2: Switch to debugfs entry instead of load time parameter. 
Signed-off-by: Andrey Grodzovsky Reviewed-by: Tao Zhou Reviewed-by: Guchun Chen Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 3268f5453eb7..f3cbe4dc4748 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -305,6 +305,22 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user * return size; } +/** + * DOC: AMDGPU RAS debugfs EEPROM table reset interface + * + * Usage: echo 1 > ../ras/ras_eeprom_reset will reset EEPROM table to 0 entries. + */ +static ssize_t amdgpu_ras_debugfs_eeprom_write(struct file *f, const char __user *buf, + size_t size, loff_t *pos) +{ + struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private; + int ret; + + ret = amdgpu_ras_eeprom_reset_table(&adev->psp.ras.ras->eeprom_control); + + return ret == 1 ? 
size : -EIO; +} + static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = { .owner = THIS_MODULE, .read = NULL, @@ -312,6 +328,13 @@ static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = { .llseek = default_llseek }; +static const struct file_operations amdgpu_ras_debugfs_eeprom_ops = { + .owner = THIS_MODULE, + .read = NULL, + .write = amdgpu_ras_debugfs_eeprom_write, + .llseek = default_llseek +}; + static ssize_t amdgpu_ras_sysfs_read(struct device *dev, struct device_attribute *attr, char *buf) { @@ -953,6 +976,8 @@ static void amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev) con->dir = debugfs_create_dir("ras", minor->debugfs_root); con->ent = debugfs_create_file("ras_ctrl", S_IWUGO | S_IRUGO, con->dir, adev, &amdgpu_ras_debugfs_ctrl_ops); + con->ent = debugfs_create_file("ras_eeprom_reset", S_IWUGO | S_IRUGO, con->dir, + adev, &amdgpu_ras_debugfs_eeprom_ops); } void amdgpu_ras_debugfs_create(struct amdgpu_device *adev, -- cgit v1.2.3 From f31703528863d6748631c953426a75450f250850 Mon Sep 17 00:00:00 2001 From: Hawking Zhang Date: Sun, 8 Sep 2019 09:09:15 +0800 Subject: drm/amdgpu: enable error injection to XGMI block via debugfs allow inject error to XGMI block via debugfs node ras_ctrl Signed-off-by: Hawking Zhang Reviewed-by: Guchun Chen Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index f3cbe4dc4748..994d2b34094c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -704,6 +704,7 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev, break; case AMDGPU_RAS_BLOCK__UMC: case AMDGPU_RAS_BLOCK__MMHUB: + case AMDGPU_RAS_BLOCK__XGMI_WAFL: ret = psp_ras_trigger_error(&adev->psp, &block_info); break; default: -- cgit v1.2.3 From 
d7bd680d4047921b18fd90480c577a38178ed09d Mon Sep 17 00:00:00 2001 From: Guchun Chen Date: Wed, 11 Sep 2019 11:07:15 +0800 Subject: drm/amdgpu: support pcie bif ras query and inject Call pcie bif ras query/inject in amdgpu ras. Signed-off-by: Tao Zhou Signed-off-by: Guchun Chen Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 994d2b34094c..faf6863ca785 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -656,6 +656,10 @@ int amdgpu_ras_error_query(struct amdgpu_device *adev, if (adev->mmhub_funcs->query_ras_error_count) adev->mmhub_funcs->query_ras_error_count(adev, &err_data); break; + case AMDGPU_RAS_BLOCK__PCIE_BIF: + if (adev->nbio.funcs->query_ras_error_count) + adev->nbio.funcs->query_ras_error_count(adev, &err_data); + break; default: break; } @@ -705,6 +709,7 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev, case AMDGPU_RAS_BLOCK__UMC: case AMDGPU_RAS_BLOCK__MMHUB: case AMDGPU_RAS_BLOCK__XGMI_WAFL: + case AMDGPU_RAS_BLOCK__PCIE_BIF: ret = psp_ras_trigger_error(&adev->psp, &block_info); break; default: -- cgit v1.2.3 From 012dd14d1de63b7443a8cec4ad9046c85a7184a5 Mon Sep 17 00:00:00 2001 From: Guchun Chen Date: Mon, 16 Sep 2019 13:42:46 +0800 Subject: drm/amdgpu: fix ras ctrl debugfs node leak MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use debugfs_remove_recursive to remove the whole debugfs directory instead of removing the node one by one. 
Signed-off-by: Guchun Chen Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 12 +++++------- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 2 -- 2 files changed, 5 insertions(+), 9 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index faf6863ca785..daf9ac0b711f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -980,10 +980,10 @@ static void amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev) struct drm_minor *minor = adev->ddev->primary; con->dir = debugfs_create_dir("ras", minor->debugfs_root); - con->ent = debugfs_create_file("ras_ctrl", S_IWUGO | S_IRUGO, con->dir, - adev, &amdgpu_ras_debugfs_ctrl_ops); - con->ent = debugfs_create_file("ras_eeprom_reset", S_IWUGO | S_IRUGO, con->dir, - adev, &amdgpu_ras_debugfs_eeprom_ops); + debugfs_create_file("ras_ctrl", S_IWUGO | S_IRUGO, con->dir, + adev, &amdgpu_ras_debugfs_ctrl_ops); + debugfs_create_file("ras_eeprom_reset", S_IWUGO | S_IRUGO, con->dir, + adev, &amdgpu_ras_debugfs_eeprom_ops); } void amdgpu_ras_debugfs_create(struct amdgpu_device *adev, @@ -1028,10 +1028,8 @@ static void amdgpu_ras_debugfs_remove_all(struct amdgpu_device *adev) amdgpu_ras_debugfs_remove(adev, &obj->head); } - debugfs_remove(con->ent); - debugfs_remove(con->dir); + debugfs_remove_recursive(con->dir); con->dir = NULL; - con->ent = NULL; } /* debugfs end */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index dd5da3c6327e..ae386c466c0e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -317,8 +317,6 @@ struct amdgpu_ras { struct list_head head; /* debugfs */ struct dentry *dir; - /* debugfs ctrl */ - struct dentry *ent; /* sysfs */ struct device_attribute features_attr; struct bin_attribute badpages_attr; -- cgit v1.2.3 
From de7b45babd9be25138ff5e4a0c34eefffbb226ff Mon Sep 17 00:00:00 2001 From: Christian König Date: Fri, 13 Sep 2019 13:43:15 +0200 Subject: drm/amdgpu: cleanup creating BOs at fixed location (v2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The placement is something TTM/BO internal and the RAS code should avoid touching that directly. Add a helper to create a BO at a fixed location and use that instead. v2: squash in fixes (Alex) Signed-off-by: Christian König Reviewed-by: Guchun Chen Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 61 +++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_object.h | 3 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 85 +++--------------------------- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 82 +++++----------------------- 4 files changed, 83 insertions(+), 148 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c index 162e3849ff88..12d2adcdf14e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c @@ -342,6 +342,67 @@ int amdgpu_bo_create_kernel(struct amdgpu_device *adev, return 0; } +/** + * amdgpu_bo_create_kernel_at - create BO for kernel use at specific location + * + * @adev: amdgpu device object + * @offset: offset of the BO + * @size: size of the BO + * @domain: where to place it + * @bo_ptr: used to initialize BOs in structures + * @cpu_addr: optional CPU address mapping + * + * Creates a kernel BO at a specific offset in the address space of the domain. + * + * Returns: + * 0 on success, negative error code otherwise. 
+ */ +int amdgpu_bo_create_kernel_at(struct amdgpu_device *adev, + uint64_t offset, uint64_t size, uint32_t domain, + struct amdgpu_bo **bo_ptr, void **cpu_addr) +{ + struct ttm_operation_ctx ctx = { false, false }; + unsigned int i; + int r; + + offset &= PAGE_MASK; + size = ALIGN(size, PAGE_SIZE); + + r = amdgpu_bo_create_reserved(adev, size, PAGE_SIZE, domain, bo_ptr, + NULL, NULL); + if (r) + return r; + + /* + * Remove the original mem node and create a new one at the request + * position. + */ + for (i = 0; i < (*bo_ptr)->placement.num_placement; ++i) { + (*bo_ptr)->placements[i].fpfn = offset >> PAGE_SHIFT; + (*bo_ptr)->placements[i].lpfn = (offset + size) >> PAGE_SHIFT; + } + + ttm_bo_mem_put(&(*bo_ptr)->tbo, &(*bo_ptr)->tbo.mem); + r = ttm_bo_mem_space(&(*bo_ptr)->tbo, &(*bo_ptr)->placement, + &(*bo_ptr)->tbo.mem, &ctx); + if (r) + goto error; + + if (cpu_addr) { + r = amdgpu_bo_kmap(*bo_ptr, cpu_addr); + if (r) + goto error; + } + + amdgpu_bo_unreserve(*bo_ptr); + return 0; + +error: + amdgpu_bo_unreserve(*bo_ptr); + amdgpu_bo_unref(bo_ptr); + return r; +} + /** * amdgpu_bo_free_kernel - free BO for kernel use * diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h index 314190c2c5c2..7e99f6c58c48 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h @@ -238,6 +238,9 @@ int amdgpu_bo_create_kernel(struct amdgpu_device *adev, unsigned long size, int align, u32 domain, struct amdgpu_bo **bo_ptr, u64 *gpu_addr, void **cpu_addr); +int amdgpu_bo_create_kernel_at(struct amdgpu_device *adev, + uint64_t offset, uint64_t size, uint32_t domain, + struct amdgpu_bo **bo_ptr, void **cpu_addr); void amdgpu_bo_free_kernel(struct amdgpu_bo **bo, u64 *gpu_addr, void **cpu_addr); int amdgpu_bo_kmap(struct amdgpu_bo *bo, void **ptr); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index daf9ac0b711f..62afdfce3751 100644 --- 
a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -71,12 +71,6 @@ const char *ras_block_string[] = { atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0); -static int amdgpu_ras_reserve_vram(struct amdgpu_device *adev, - uint64_t offset, uint64_t size, - struct amdgpu_bo **bo_ptr); -static int amdgpu_ras_release_vram(struct amdgpu_device *adev, - struct amdgpu_bo **bo_ptr); - static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf, size_t size, loff_t *pos) { @@ -1260,75 +1254,6 @@ static void amdgpu_ras_do_recovery(struct work_struct *work) atomic_set(&ras->in_recovery, 0); } -static int amdgpu_ras_release_vram(struct amdgpu_device *adev, - struct amdgpu_bo **bo_ptr) -{ - /* no need to free it actually. */ - amdgpu_bo_free_kernel(bo_ptr, NULL, NULL); - return 0; -} - -/* reserve vram with size@offset */ -static int amdgpu_ras_reserve_vram(struct amdgpu_device *adev, - uint64_t offset, uint64_t size, - struct amdgpu_bo **bo_ptr) -{ - struct ttm_operation_ctx ctx = { false, false }; - struct amdgpu_bo_param bp; - int r = 0; - int i; - struct amdgpu_bo *bo; - - if (bo_ptr) - *bo_ptr = NULL; - memset(&bp, 0, sizeof(bp)); - bp.size = size; - bp.byte_align = PAGE_SIZE; - bp.domain = AMDGPU_GEM_DOMAIN_VRAM; - bp.flags = AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS | - AMDGPU_GEM_CREATE_NO_CPU_ACCESS; - bp.type = ttm_bo_type_kernel; - bp.resv = NULL; - - r = amdgpu_bo_create(adev, &bp, &bo); - if (r) - return -EINVAL; - - r = amdgpu_bo_reserve(bo, false); - if (r) - goto error_reserve; - - offset = ALIGN(offset, PAGE_SIZE); - for (i = 0; i < bo->placement.num_placement; ++i) { - bo->placements[i].fpfn = offset >> PAGE_SHIFT; - bo->placements[i].lpfn = (offset + size) >> PAGE_SHIFT; - } - - ttm_bo_mem_put(&bo->tbo, &bo->tbo.mem); - r = ttm_bo_mem_space(&bo->tbo, &bo->placement, &bo->tbo.mem, &ctx); - if (r) - goto error_pin; - - r = amdgpu_bo_pin_restricted(bo, - AMDGPU_GEM_DOMAIN_VRAM, - offset, - offset + size); - if (r) - goto 
error_pin; - - if (bo_ptr) - *bo_ptr = bo; - - amdgpu_bo_unreserve(bo); - return r; - -error_pin: - amdgpu_bo_unreserve(bo); -error_reserve: - amdgpu_bo_unref(&bo); - return r; -} - /* alloc/realloc bps array */ static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev, struct ras_err_handler_data *data, int pages) @@ -1464,7 +1389,7 @@ int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev) struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_err_handler_data *data; uint64_t bp; - struct amdgpu_bo *bo; + struct amdgpu_bo *bo = NULL; int i, ret = 0; if (!con || !con->eh_data) @@ -1478,12 +1403,14 @@ int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev) for (i = data->last_reserved; i < data->count; i++) { bp = data->bps[i].retired_page; - if (amdgpu_ras_reserve_vram(adev, bp << PAGE_SHIFT, - PAGE_SIZE, &bo)) + if (amdgpu_bo_create_kernel_at(adev, bp << PAGE_SHIFT, PAGE_SIZE, + AMDGPU_GEM_DOMAIN_VRAM, + &bo, NULL)) DRM_ERROR("RAS ERROR: reserve vram %llx fail\n", bp); data->bps_bo[i] = bo; data->last_reserved = i + 1; + bo = NULL; } /* continue to save bad pages to eeprom even reesrve_vram fails */ @@ -1512,7 +1439,7 @@ static int amdgpu_ras_release_bad_pages(struct amdgpu_device *adev) for (i = data->last_reserved - 1; i >= 0; i--) { bo = data->bps_bo[i]; - amdgpu_ras_release_vram(adev, &bo); + amdgpu_bo_free_kernel(&bo, NULL, NULL); data->bps_bo[i] = bo; data->last_reserved = i; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index 54e6dacc34a4..68c541e11189 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -1650,81 +1650,25 @@ static void amdgpu_ttm_fw_reserve_vram_fini(struct amdgpu_device *adev) */ static int amdgpu_ttm_fw_reserve_vram_init(struct amdgpu_device *adev) { - struct ttm_operation_ctx ctx = { false, false }; - struct amdgpu_bo_param bp; - int r = 0; - int i; - u64 vram_size = adev->gmc.visible_vram_size; - u64 
offset = adev->fw_vram_usage.start_offset; - u64 size = adev->fw_vram_usage.size; - struct amdgpu_bo *bo; - - memset(&bp, 0, sizeof(bp)); - bp.size = adev->fw_vram_usage.size; - bp.byte_align = PAGE_SIZE; - bp.domain = AMDGPU_GEM_DOMAIN_VRAM; - bp.flags = AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED | - AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS; - bp.type = ttm_bo_type_kernel; - bp.resv = NULL; + uint64_t vram_size = adev->gmc.visible_vram_size; + int r; + adev->fw_vram_usage.va = NULL; adev->fw_vram_usage.reserved_bo = NULL; - if (adev->fw_vram_usage.size > 0 && - adev->fw_vram_usage.size <= vram_size) { - - r = amdgpu_bo_create(adev, &bp, - &adev->fw_vram_usage.reserved_bo); - if (r) - goto error_create; - - r = amdgpu_bo_reserve(adev->fw_vram_usage.reserved_bo, false); - if (r) - goto error_reserve; - - /* remove the original mem node and create a new one at the - * request position - */ - bo = adev->fw_vram_usage.reserved_bo; - offset = ALIGN(offset, PAGE_SIZE); - for (i = 0; i < bo->placement.num_placement; ++i) { - bo->placements[i].fpfn = offset >> PAGE_SHIFT; - bo->placements[i].lpfn = (offset + size) >> PAGE_SHIFT; - } - - ttm_bo_mem_put(&bo->tbo, &bo->tbo.mem); - r = ttm_bo_mem_space(&bo->tbo, &bo->placement, - &bo->tbo.mem, &ctx); - if (r) - goto error_pin; - - r = amdgpu_bo_pin_restricted(adev->fw_vram_usage.reserved_bo, - AMDGPU_GEM_DOMAIN_VRAM, - adev->fw_vram_usage.start_offset, - (adev->fw_vram_usage.start_offset + - adev->fw_vram_usage.size)); - if (r) - goto error_pin; - r = amdgpu_bo_kmap(adev->fw_vram_usage.reserved_bo, - &adev->fw_vram_usage.va); - if (r) - goto error_kmap; - - amdgpu_bo_unreserve(adev->fw_vram_usage.reserved_bo); - } - return r; + if (adev->fw_vram_usage.size == 0 || + adev->fw_vram_usage.size > vram_size) + return 0; -error_kmap: - amdgpu_bo_unpin(adev->fw_vram_usage.reserved_bo); -error_pin: - amdgpu_bo_unreserve(adev->fw_vram_usage.reserved_bo); -error_reserve: - amdgpu_bo_unref(&adev->fw_vram_usage.reserved_bo); -error_create: - 
adev->fw_vram_usage.va = NULL; - adev->fw_vram_usage.reserved_bo = NULL; + return amdgpu_bo_create_kernel_at(adev, + adev->fw_vram_usage.start_offset, + adev->fw_vram_usage.size, + AMDGPU_GEM_DOMAIN_VRAM, + &adev->fw_vram_usage.reserved_bo, + &adev->fw_vram_usage.va); return r; } + /** * amdgpu_ttm_init - Init the memory management (ttm) as well as various * gtt/vram related fields. -- cgit v1.2.3 From 879e723df3cd0bfda4a036477b1359563406dbca Mon Sep 17 00:00:00 2001 From: Adam Zerella Date: Sat, 14 Sep 2019 22:56:16 +1000 Subject: docs: drm/amdgpu: Resolve build warnings Some of the documentation formatting could be improved which will resolve some Sphinx amdgpu build warnings e.g WARNING: Unexpected indentation. WARNING: Block quote ends without a blank line; unexpected unindent. WARNING: Inline emphasis start-string without end-string. Signed-off-by: Adam Zerella Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 11 ++++++--- drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c | 4 +-- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 44 +++++++++++++++++++-------------- 3 files changed, 35 insertions(+), 24 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 0e86460daed2..86ea4c6f44b4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -245,10 +245,13 @@ module_param_named(msi, amdgpu_msi, int, 0444); * * The format can be [Non-Compute] or [GFX,Compute,SDMA,Video]. That is there can be one or * multiple values specified. 0 and negative values are invalidated. They will be adjusted - * to default timeout. - * - With one value specified, the setting will apply to all non-compute jobs. - * - With multiple values specified, the first one will be for GFX. The second one is for Compute. - * And the third and fourth ones are for SDMA and Video. + * to the default timeout. 
+ * + * - With one value specified, the setting will apply to all non-compute jobs. + * - With multiple values specified, the first one will be for GFX. + * The second one is for Compute. The third and fourth ones are + * for SDMA and Video. + * * By default(with no lockup_timeout settings), the timeout for all non-compute(GFX, SDMA and Video) * jobs is 10000. And there is no timeout enforced on compute jobs. */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c index 03930313c263..bdad0347fdd8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c @@ -2196,9 +2196,9 @@ static ssize_t amdgpu_hwmon_show_mclk_label(struct device *dev, * * - fan1_input: fan speed in RPM * - * - fan[1-*]_target: Desired fan speed Unit: revolution/min (RPM) + * - fan[1-\*]_target: Desired fan speed Unit: revolution/min (RPM) * - * - fan[1-*]_enable: Enable or disable the sensors.1: Enable 0: Disable + * - fan[1-\*]_enable: Enable or disable the sensors.1: Enable 0: Disable * * hwmon interfaces for GPU clocks: * diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 62afdfce3751..76a0c9917eeb 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -216,29 +216,36 @@ static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev, * * Second member: struct ras_debug_if::op. * It has three kinds of operations. - * 0: disable RAS on the block. Take ::head as its data. - * 1: enable RAS on the block. Take ::head as its data. - * 2: inject errors on the block. Take ::inject as its data. + * + * - 0: disable RAS on the block. Take ::head as its data. + * - 1: enable RAS on the block. Take ::head as its data. + * - 2: inject errors on the block. Take ::inject as its data. * * How to use the interface? * programs: * copy the struct ras_debug_if in your codes and initialize it. 
* write the struct to the control node. * - * bash: - * echo op block [error [sub_blcok address value]] > .../ras/ras_ctrl - * op: disable, enable, inject - * disable: only block is needed - * enable: block and error are needed - * inject: error, address, value are needed - * block: umc, smda, gfx, ......... - * see ras_block_string[] for details - * error: ue, ce - * ue: multi_uncorrectable - * ce: single_correctable - * sub_block: sub block index, pass 0 if there is no sub block + * .. code-block:: bash + * + * echo op block [error [sub_blcok address value]] > .../ras/ras_ctrl + * + * op: disable, enable, inject + * disable: only block is needed + * enable: block and error are needed + * inject: error, address, value are needed + * block: umc, smda, gfx, ......... + * see ras_block_string[] for details + * error: ue, ce + * ue: multi_uncorrectable + * ce: single_correctable + * sub_block: + * sub block index, pass 0 if there is no sub block + * + * here are some examples for bash commands: + * + * .. code-block:: bash * - * here are some examples for bash commands, * echo inject umc ue 0x0 0x0 0x0 > /sys/kernel/debug/dri/0/ras/ras_ctrl * echo inject umc ce 0 0 0 > /sys/kernel/debug/dri/0/ras/ras_ctrl * echo disable umc > /sys/kernel/debug/dri/0/ras/ras_ctrl @@ -251,8 +258,9 @@ static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev, * For inject, please check corresponding err count at * /sys/class/drm/card[0/1/2...]/device/ras/[gfx/sdma/...]_err_count * - * NOTE: operation is only allowed on blocks which are supported. - * Please check ras mask at /sys/module/amdgpu/parameters/ras_mask + * .. note:: + * Operation is only allowed on blocks which are supported. 
+ * Please check ras mask at /sys/module/amdgpu/parameters/ras_mask */ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *buf, size_t size, loff_t *pos) -- cgit v1.2.3 From ae115c81ecd379aed20715c3ff98ee9f651acfce Mon Sep 17 00:00:00 2001 From: Tao Zhou Date: Thu, 12 Sep 2019 18:57:23 +0800 Subject: drm/amdgpu: replace DRM_ERROR with DRM_WARN in ras_reserve_bad_pages There are two cases of reserve error should be ignored: 1) a ras bad page has been allocated (used by someone); 2) a ras bad page has been reserved (duplicate error injection for one page); DRM_ERROR is unnecessary for the failure of bad page reserve Signed-off-by: Tao Zhou Reviewed-by: Guchun Chen Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 76a0c9917eeb..a36df02b61ea 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1411,10 +1411,15 @@ int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev) for (i = data->last_reserved; i < data->count; i++) { bp = data->bps[i].retired_page; + /* There are two cases of reserve error should be ignored: + * 1) a ras bad page has been allocated (used by someone); + * 2) a ras bad page has been reserved (duplicate error injection + * for one page); + */ if (amdgpu_bo_create_kernel_at(adev, bp << PAGE_SHIFT, PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM, &bo, NULL)) - DRM_ERROR("RAS ERROR: reserve vram %llx fail\n", bp); + DRM_WARN("RAS WARN: reserve vram for retired page %llx fail\n", bp); data->bps_bo[i] = bo; data->last_reserved = i + 1; -- cgit v1.2.3 From a142ba8800dd12de1a9013477efa8ab692041ed3 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 17 Sep 2019 08:11:24 -0500 Subject: drm/amdgpu/ras: use GPU PAGE_SIZE/SHIFT for reserving pages 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We are reserving vram pages so they should be aligned to the GPU page size. Reviewed-by: Tao Zhou Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index a36df02b61ea..24fecaae415c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1416,7 +1416,8 @@ int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev) * 2) a ras bad page has been reserved (duplicate error injection * for one page); */ - if (amdgpu_bo_create_kernel_at(adev, bp << PAGE_SHIFT, PAGE_SIZE, + if (amdgpu_bo_create_kernel_at(adev, bp << AMDGPU_GPU_PAGE_SHIFT, + AMDGPU_GPU_PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM, &bo, NULL)) DRM_WARN("RAS WARN: reserve vram for retired page %llx fail\n", bp); -- cgit v1.2.3 From 8a3e801f1924c07781e93f043a74cfa244451787 Mon Sep 17 00:00:00 2001 From: Guchun Chen Date: Tue, 17 Sep 2019 17:49:29 +0800 Subject: drm/amdgpu: avoid null pointer dereference null ptr should be checked first to avoid null ptr access Signed-off-by: Guchun Chen Reviewed-by: Tao Zhou Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 24fecaae415c..83b681a16e56 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1336,13 +1336,13 @@ static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_err_handler_data *data; - struct amdgpu_ras_eeprom_control *control = - 
&adev->psp.ras.ras->eeprom_control; + struct amdgpu_ras_eeprom_control *control; int save_count; if (!con || !con->eh_data) return 0; + control = &con->eeprom_control; data = con->eh_data; save_count = data->count - control->num_recs; /* only new entries are saved */ -- cgit v1.2.3 From f77c7109c071edc148dc6c71dcf927059516235f Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 19 Sep 2019 15:09:56 -0500 Subject: drm/amdgpu/ras: fix and update the documentation for RAS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add new sections to amdgpu.rst, fix up formatting issues, add additional documentation to each section. Acked-by: Christian König Signed-off-by: Alex Deucher --- Documentation/gpu/amdgpu.rst | 24 +++++++++++++-- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 53 ++++++++++++++++++++++++++++----- 2 files changed, 68 insertions(+), 9 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/Documentation/gpu/amdgpu.rst b/Documentation/gpu/amdgpu.rst index 80db5d89cd49..5b9eaf23558e 100644 --- a/Documentation/gpu/amdgpu.rst +++ b/Documentation/gpu/amdgpu.rst @@ -79,12 +79,32 @@ AMDGPU XGMI Support .. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c :internal: -AMDGPU RAS debugfs control interface -==================================== +AMDGPU RAS Support +================== + +RAS debugfs/sysfs Control and Error Injection Interfaces +-------------------------------------------------------- .. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c :doc: AMDGPU RAS debugfs control interface +RAS Error Count sysfs Interface +------------------------------- + +.. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c + :doc: AMDGPU RAS sysfs Error Count Interface + +RAS EEPROM debugfs Interface +---------------------------- + +.. 
kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c + :doc: AMDGPU RAS debugfs EEPROM table reset interface + +RAS VRAM Bad Pages sysfs Interface +---------------------------------- + +.. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c + :doc: AMDGPU RAS sysfs gpu_vram_bad_pages Interface .. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c :internal: diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 83b681a16e56..14f3f8d831d8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -310,7 +310,18 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user * /** * DOC: AMDGPU RAS debugfs EEPROM table reset interface * - * Usage: echo 1 > ../ras/ras_eeprom_reset will reset EEPROM table to 0 entries. + * Some boards contain an EEPROM which is used to persistently store a list of + * bad pages containing ECC errors detected in vram. This interface provides + * a way to reset the EEPROM, e.g., after testing error injection. + * + * Usage: + * + * .. code-block:: bash + * + * echo 1 > ../ras/ras_eeprom_reset + * + * will reset EEPROM table to 0 entries. + * */ static ssize_t amdgpu_ras_debugfs_eeprom_write(struct file *f, const char __user *buf, size_t size, loff_t *pos) @@ -337,6 +348,27 @@ static const struct file_operations amdgpu_ras_debugfs_eeprom_ops = { .llseek = default_llseek }; +/** + * DOC: AMDGPU RAS sysfs Error Count Interface + * + * It allows user to read the error count for each IP block on the gpu through + * /sys/class/drm/card[0/1/2...]/device/ras/[gfx/sdma/...]_err_count + * + * It outputs the multiple lines which report the uncorrected (ue) and corrected + * (ce) error counts. + * + * The format of one line is below, + * + * [ce|ue]: count + * + * Example: + * + * .. 
code-block:: bash + * + * ue: 0 + * ce: 1 + * + */ static ssize_t amdgpu_ras_sysfs_read(struct device *dev, struct device_attribute *attr, char *buf) { @@ -781,8 +813,8 @@ static char *amdgpu_ras_badpage_flags_str(unsigned int flags) }; } -/* - * DOC: ras sysfs gpu_vram_bad_pages interface +/** + * DOC: AMDGPU RAS sysfs gpu_vram_bad_pages Interface * * It allows user to read the bad pages of vram on the gpu through * /sys/class/drm/card[0/1/2...]/device/ras/gpu_vram_bad_pages @@ -794,14 +826,21 @@ static char *amdgpu_ras_badpage_flags_str(unsigned int flags) * * gpu pfn and gpu page size are printed in hex format. * flags can be one of below character, + * * R: reserved, this gpu page is reserved and not able to use. + * * P: pending for reserve, this gpu page is marked as bad, will be reserved - * in next window of page_reserve. + * in next window of page_reserve. + * * F: unable to reserve. this gpu page can't be reserved due to some reasons. * - * examples: - * 0x00000001 : 0x00001000 : R - * 0x00000002 : 0x00001000 : P + * Examples: + * + * .. 
code-block:: bash + * + * 0x00000001 : 0x00001000 : R + * 0x00000002 : 0x00001000 : P + * */ static ssize_t amdgpu_ras_sysfs_badpages_read(struct file *f, -- cgit v1.2.3 From d65bf1f8a795e2748ab3ea2231ab896a9cac743c Mon Sep 17 00:00:00 2001 From: Tao Zhou Date: Thu, 12 Sep 2019 17:12:21 +0800 Subject: drm/amdgpu: replace mmhub_funcs with mmhub.funcs remove mmhub_funcs in adev Signed-off-by: Tao Zhou Reviewed-by: Guchun Chen Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 - drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 4 ++-- drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 6 +++--- 3 files changed, 5 insertions(+), 6 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index b29d4a34812e..578cd81814a0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -925,7 +925,6 @@ struct amdgpu_device { uint32_t *reg_offset[MAX_HWIP][HWIP_MAX_INSTANCE]; const struct amdgpu_df_funcs *df_funcs; - const struct amdgpu_mmhub_funcs *mmhub_funcs; /* delayed work_func for deferring clockgating during resume */ struct delayed_work delayed_init_work; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 14f3f8d831d8..00cd01c61668 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -687,8 +687,8 @@ int amdgpu_ras_error_query(struct amdgpu_device *adev, adev->gfx.funcs->query_ras_error_count(adev, &err_data); break; case AMDGPU_RAS_BLOCK__MMHUB: - if (adev->mmhub_funcs->query_ras_error_count) - adev->mmhub_funcs->query_ras_error_count(adev, &err_data); + if (adev->mmhub.funcs->query_ras_error_count) + adev->mmhub.funcs->query_ras_error_count(adev, &err_data); break; case AMDGPU_RAS_BLOCK__PCIE_BIF: if (adev->nbio.funcs->query_ras_error_count) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index 
40dad08ed1fa..b7cc512941ce 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -655,7 +655,7 @@ static void gmc_v9_0_set_mmhub_funcs(struct amdgpu_device *adev) { switch (adev->asic_type) { case CHIP_VEGA20: - adev->mmhub_funcs = &mmhub_v1_0_funcs; + adev->mmhub.funcs = &mmhub_v1_0_funcs; break; default: break; @@ -750,8 +750,8 @@ static int gmc_v9_0_ecc_late_init(void *handle) return r; } - if (adev->mmhub_funcs && adev->mmhub_funcs->ras_late_init) { - r = adev->mmhub_funcs->ras_late_init(adev); + if (adev->mmhub.funcs && adev->mmhub.funcs->ras_late_init) { + r = adev->mmhub.funcs->ras_late_init(adev); if (r) return r; } -- cgit v1.2.3 From 0771b0bf0790295b141cc30644a1b0b3e22a331e Mon Sep 17 00:00:00 2001 From: Tao Zhou Date: Wed, 18 Sep 2019 15:26:23 +0800 Subject: drm/amdgpu: simplify the access to eeprom_control struct simplify the code of accessing to eeprom_control struct Signed-off-by: Tao Zhou Reviewed-by: Guchun Chen Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 00cd01c61668..486568ded6d6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1386,7 +1386,7 @@ static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev) save_count = data->count - control->num_recs; /* only new entries are saved */ if (save_count > 0) - if (amdgpu_ras_eeprom_process_recods(&con->eeprom_control, + if (amdgpu_ras_eeprom_process_recods(control, &data->bps[control->num_recs], true, save_count)) { @@ -1524,11 +1524,11 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) atomic_set(&con->in_recovery, 0); con->adev = adev; - ret = amdgpu_ras_eeprom_init(&adev->psp.ras.ras->eeprom_control); + ret = 
amdgpu_ras_eeprom_init(&con->eeprom_control); if (ret) goto free; - if (adev->psp.ras.ras->eeprom_control.num_recs) { + if (con->eeprom_control.num_recs) { ret = amdgpu_ras_load_bad_pages(adev); if (ret) goto free; -- cgit v1.2.3 From 1995b3a35fecbf8a8078a5c3ff4b4a88eddaa5fe Mon Sep 17 00:00:00 2001 From: Felix Kuehling Date: Thu, 3 Oct 2019 17:54:57 -0400 Subject: drm/amdgpu: Fix error handling in amdgpu_ras_recovery_init Don't set a struct pointer to NULL before freeing its members. It's hard to see what's happening due to a local pointer-to-pointer data aliasing con->eh_data. Signed-off-by: Felix Kuehling Tested-by: Philip Cox Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 486568ded6d6..0e2ee5869b5f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1542,10 +1542,10 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) release: amdgpu_ras_release_bad_pages(adev); free: - con->eh_data = NULL; kfree((*data)->bps); kfree((*data)->bps_bo); kfree(*data); + con->eh_data = NULL; out: DRM_WARN("Failed to initialize ras recovery!\n"); -- cgit v1.2.3 From a20bfd0fd41fd0570d0b395d1d86e5578adac4d4 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 8 Oct 2019 13:04:33 -0500 Subject: drm/amdgpu/ras: fix typos in documentation Fix a couple of spelling typos. 
Reviewed-by: Andrey Grodzovsky Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 0e2ee5869b5f..c0d3edf77901 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -228,13 +228,13 @@ static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev, * * .. code-block:: bash * - * echo op block [error [sub_blcok address value]] > .../ras/ras_ctrl + * echo op block [error [sub_block address value]] > .../ras/ras_ctrl * * op: disable, enable, inject * disable: only block is needed * enable: block and error are needed * inject: error, address, value are needed - * block: umc, smda, gfx, ......... + * block: umc, sdma, gfx, ......... * see ras_block_string[] for details * error: ue, ce * ue: multi_uncorrectable -- cgit v1.2.3 From 54e9ab2edb2576b58d905d2385a0cbfc0750c7e4 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 8 Oct 2019 13:08:30 -0500 Subject: drm/amdgpu/ras: document the reboot ras option We recently added it, but never documented it. Reviewed-by: Andrey Grodzovsky Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index c0d3edf77901..84d8c3342a81 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -215,11 +215,12 @@ static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev, * value to the address. * * Second member: struct ras_debug_if::op. - * It has three kinds of operations. + * It has four kinds of operations. * * - 0: disable RAS on the block. Take ::head as its data. 
* - 1: enable RAS on the block. Take ::head as its data. * - 2: inject errors on the block. Take ::inject as its data. + * - 3: reboot on unrecoverable error * * How to use the interface? * programs: -- cgit v1.2.3 From 6e4be98767b249923e97b5a3c509e722d22e462d Mon Sep 17 00:00:00 2001 From: Tao Zhou Date: Mon, 30 Sep 2019 14:48:19 +0800 Subject: drm/amdgpu: avoid ras error injection for retired page check whether a page is bad page before umc error injection, bad page should not be accessed again Signed-off-by: Tao Zhou Reviewed-by: Guchun Chen Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 44 +++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 84d8c3342a81..49bdee9d9fad 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -71,6 +71,9 @@ const char *ras_block_string[] = { atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0); +static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev, + uint64_t addr); + static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf, size_t size, loff_t *pos) { @@ -291,6 +294,14 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user * break; } + /* umc ce/ue error injection for a bad page is not allowed */ + if ((data.head.block == AMDGPU_RAS_BLOCK__UMC) && + amdgpu_ras_check_bad_page(adev, data.inject.address)) { + DRM_WARN("RAS WARN: 0x%llx has been marked as bad before error injection!\n", + data.inject.address); + break; + } + /* data.inject.address is offset instead of absolute gpu address */ ret = amdgpu_ras_error_inject(adev, &data.inject); break; @@ -1431,6 +1442,39 @@ out: return ret; } +/* + * check if an address belongs to bad page + * + * Note: this check is only for umc block + */ +static bool amdgpu_ras_check_bad_page(struct amdgpu_device 
*adev, + uint64_t addr) +{ + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + struct ras_err_handler_data *data; + int i; + bool ret = false; + + if (!con || !con->eh_data) + return ret; + + mutex_lock(&con->recovery_lock); + data = con->eh_data; + if (!data) + goto out; + + addr >>= AMDGPU_GPU_PAGE_SHIFT; + for (i = 0; i < data->count; i++) + if (addr == data->bps[i].retired_page) { + ret = true; + goto out; + } + +out: + mutex_unlock(&con->recovery_lock); + return ret; +} + /* called in gpu recovery/init */ int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev) { -- cgit v1.2.3 From ed606f8a346b9040008794a261b43902818efd7b Mon Sep 17 00:00:00 2001 From: Andrey Grodzovsky Date: Fri, 11 Oct 2019 10:32:59 -0400 Subject: dmr/amdgpu: Fix crash on SRIOV for ERREVENT_ATHUB_INTERRUPT interrupt. Ignre the ERREVENT_ATHUB_INTERRUPT for systems without RAS. Signed-off-by: Andrey Grodzovsky Reviewed-and-tested-by: Jack Zhang Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 49bdee9d9fad..6220394521e4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1888,6 +1888,12 @@ int amdgpu_ras_fini(struct amdgpu_device *adev) void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev) { + uint32_t hw_supported, supported; + + amdgpu_ras_check_supported(adev, &hw_supported, &supported); + if (!hw_supported) + return; + if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) { DRM_WARN("RAS event of type ERREVENT_ATHUB_INTERRUPT detected!\n"); -- cgit v1.2.3 From c688a06bc661749a944a2980f0ff0cee8659ac81 Mon Sep 17 00:00:00 2001 From: Guchun Chen Date: Mon, 21 Oct 2019 16:56:00 +0800 Subject: drm/amdgpu: refine reboot debugfs operation in ras case (v3) MIME-Version: 1.0 Content-Type: 
text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ras reboot debugfs node allows user one easy control to avoid gpu recovery hang problem and directly reboot system per card basis, after ras uncorrectable error happens. However, it is one common entry, which should get rid of ras_ctrl node and remove ip dependence when inputting by user. So add one new auto_reboot node in ras debugfs dir to achieve this. v2: in commit mssage, add justification why ras reboot debugfs node is needed. v3: use debugfs_create_bool to create debugfs file for boolean value Signed-off-by: Guchun Chen Reviewed-by: Alex Deucher Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 6220394521e4..2d9e13d2a71a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -153,8 +153,6 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, op = 1; else if (sscanf(str, "inject %32s %8s", block_name, err) == 2) op = 2; - else if (sscanf(str, "reboot %32s", block_name) == 1) - op = 3; else if (str[0] && str[1] && str[2] && str[3]) /* ascii string, but commands are not matched. */ return -EINVAL; @@ -218,12 +216,11 @@ static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev, * value to the address. * * Second member: struct ras_debug_if::op. - * It has four kinds of operations. + * It has three kinds of operations. * * - 0: disable RAS on the block. Take ::head as its data. * - 1: enable RAS on the block. Take ::head as its data. * - 2: inject errors on the block. Take ::inject as its data. - * - 3: reboot on unrecoverable error * * How to use the interface? 
* programs: @@ -305,9 +302,6 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user * /* data.inject.address is offset instead of absolute gpu address */ ret = amdgpu_ras_error_inject(adev, &data.inject); break; - case 3: - amdgpu_ras_get_context(adev)->reboot = true; - break; default: ret = -EINVAL; break; @@ -1037,6 +1031,17 @@ static void amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev) adev, &amdgpu_ras_debugfs_ctrl_ops); debugfs_create_file("ras_eeprom_reset", S_IWUGO | S_IRUGO, con->dir, adev, &amdgpu_ras_debugfs_eeprom_ops); + + /* + * After one uncorrectable error happens, usually GPU recovery will + * be scheduled. But due to the known problem in GPU recovery failing + * to bring GPU back, below interface provides one direct way to + * user to reboot system automatically in such case within + * ERREVENT_ATHUB_INTERRUPT generated. Normal GPU recovery routine + * will never be called. + */ + debugfs_create_bool("auto_reboot", S_IWUGO | S_IRUGO, con->dir, + &con->reboot); } void amdgpu_ras_debugfs_create(struct amdgpu_device *adev, -- cgit v1.2.3 From 52dd95f2b642f58d22ff2564faabee56c8d943b8 Mon Sep 17 00:00:00 2001 From: Guchun Chen Date: Tue, 22 Oct 2019 11:39:25 +0800 Subject: drm/amdgpu: define macros for retire page reservation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Easy for maintainance. 
Signed-off-by: Guchun Chen Acked-by: Christian König Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 2d9e13d2a71a..796326b36e00 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -68,6 +68,11 @@ const char *ras_block_string[] = { /* inject address is 52 bits */ #define RAS_UMC_INJECT_ADDR_LIMIT (0x1ULL << 52) +enum amdgpu_ras_retire_page_reservation { + AMDGPU_RAS_RETIRE_PAGE_RESERVED, + AMDGPU_RAS_RETIRE_PAGE_PENDING, + AMDGPU_RAS_RETIRE_PAGE_FAULT, +}; atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0); @@ -809,11 +814,11 @@ static int amdgpu_ras_badpages_read(struct amdgpu_device *adev, static char *amdgpu_ras_badpage_flags_str(unsigned int flags) { switch (flags) { - case 0: + case AMDGPU_RAS_RETIRE_PAGE_RESERVED: return "R"; - case 1: + case AMDGPU_RAS_RETIRE_PAGE_PENDING: return "P"; - case 2: + case AMDGPU_RAS_RETIRE_PAGE_FAULT: default: return "F"; }; @@ -1294,13 +1299,13 @@ static int amdgpu_ras_badpages_read(struct amdgpu_device *adev, (*bps)[i] = (struct ras_badpage){ .bp = data->bps[i].retired_page, .size = AMDGPU_GPU_PAGE_SIZE, - .flags = 0, + .flags = AMDGPU_RAS_RETIRE_PAGE_RESERVED, }; if (data->last_reserved <= i) - (*bps)[i].flags = 1; + (*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_PENDING; else if (data->bps_bo[i] == NULL) - (*bps)[i].flags = 2; + (*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_FAULT; } *count = data->count; -- cgit v1.2.3 From bff77e86a3776fab6859bc168ecda6ccf56bfbd2 Mon Sep 17 00:00:00 2001 From: Le Ma Date: Fri, 25 Oct 2019 17:48:52 +0800 Subject: drm/amdgpu: bypass some cleanup work after err_event_athub (v2) PSP lost connection when err_event_athub occurs. These cleanup work can be skipped in BACO reset. 
v2: squash in missing include (Alex) Signed-off-by: Le Ma Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 6 ++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 9 +++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 20 +++++++++++--------- drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 6 ++++-- 4 files changed, 30 insertions(+), 11 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index c03089503b0f..d36d2b093539 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2271,6 +2271,12 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) /* displays are handled in phase1 */ if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) continue; + /* PSP lost connection when err_event_athub occurs */ + if (amdgpu_ras_intr_triggered() && + adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { + adev->ip_blocks[i].status.hw = false; + continue; + } /* XXX handle errors */ r = adev->ip_blocks[i].version->funcs->suspend(adev); /* XXX handle errors */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c index fd7a73f4fa70..bbe9ac7e843f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c @@ -34,6 +34,8 @@ #include "psp_v11_0.h" #include "psp_v12_0.h" +#include "amdgpu_ras.h" + static void psp_set_funcs(struct amdgpu_device *adev); static int psp_early_init(void *handle) @@ -167,6 +169,13 @@ psp_cmd_submit_buf(struct psp_context *psp, while (*((unsigned int *)psp->fence_buf) != index) { if (--timeout == 0) break; + /* + * Shouldn't wait for timeout when err_event_athub occurs, + * because gpu reset thread triggered and lock resource should + * be released for psp resume sequence. 
+ */ + if (amdgpu_ras_intr_triggered()) + break; msleep(1); amdgpu_asic_invalidate_hdp(psp->adev, NULL); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 796326b36e00..dab90c280476 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -558,15 +558,17 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev, if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head))) return 0; - ret = psp_ras_enable_features(&adev->psp, &info, enable); - if (ret) { - DRM_ERROR("RAS ERROR: %s %s feature failed ret %d\n", - enable ? "enable":"disable", - ras_block_str(head->block), - ret); - if (ret == TA_RAS_STATUS__RESET_NEEDED) - return -EAGAIN; - return -EINVAL; + if (!amdgpu_ras_intr_triggered()) { + ret = psp_ras_enable_features(&adev->psp, &info, enable); + if (ret) { + DRM_ERROR("RAS ERROR: %s %s feature failed ret %d\n", + enable ? "enable":"disable", + ras_block_str(head->block), + ret); + if (ret == TA_RAS_STATUS__RESET_NEEDED) + return -EAGAIN; + return -EINVAL; + } } /* setup the obj */ diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index 088c6a734a1a..d694be9a8c39 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -3736,8 +3736,10 @@ static int gfx_v9_0_hw_fini(void *handle) amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0); amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0); - /* disable KCQ to avoid CPC touch memory not valid anymore */ - gfx_v9_0_kcq_disable(adev); + /* DF freeze and kcq disable will fail */ + if (!amdgpu_ras_intr_triggered()) + /* disable KCQ to avoid CPC touch memory not valid anymore */ + gfx_v9_0_kcq_disable(adev); if (amdgpu_sriov_vf(adev)) { gfx_v9_0_cp_gfx_enable(adev, false); -- cgit v1.2.3 From ef177d11d696fe0eea36ef74edfeaeb62cd89c35 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 30 Oct 2019 14:40:09 -0400 Subject: drm/amdgpu: Improve RAS 
documentation (v2) Clarify some areas, clean up formatting, add section for unrecoverable error handling. v2: fix grammatical errors Reviewed-by: Yong Zhao Signed-off-by: Alex Deucher --- Documentation/gpu/amdgpu.rst | 35 +++++++++++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 40 +++++++++++++++++++++++++++------ 2 files changed, 68 insertions(+), 7 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/Documentation/gpu/amdgpu.rst b/Documentation/gpu/amdgpu.rst index 5b9eaf23558e..0efede580039 100644 --- a/Documentation/gpu/amdgpu.rst +++ b/Documentation/gpu/amdgpu.rst @@ -82,12 +82,21 @@ AMDGPU XGMI Support AMDGPU RAS Support ================== +The AMDGPU RAS interfaces are exposed via sysfs (for informational queries) and +debugfs (for error injection). + RAS debugfs/sysfs Control and Error Injection Interfaces -------------------------------------------------------- .. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c :doc: AMDGPU RAS debugfs control interface +RAS Reboot Behavior for Unrecoverable Errors +-------------------------------------------------------- + +.. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c + :doc: AMDGPU RAS Reboot Behavior for Unrecoverable Errors + RAS Error Count sysfs Interface ------------------------------- @@ -109,6 +118,32 @@ RAS VRAM Bad Pages sysfs Interface .. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c :internal: +Sample Code +----------- +Sample code for testing error injection can be found here: +https://cgit.freedesktop.org/mesa/drm/tree/tests/amdgpu/ras_tests.c + +This is part of the libdrm amdgpu unit tests which cover several areas of the GPU. +There are four sets of tests: + +RAS Basic Test + +The test verifies the RAS feature enabled status and makes sure the necessary sysfs and debugfs files +are present. + +RAS Query Test + +This test checks the RAS availability and enablement status for each supported IP block as well as +the error counts. 
+ +RAS Inject Test + +This test injects errors for each IP. + +RAS Disable Test + +This test tests disabling of RAS features for each IP block. + GPU Power/Thermal Controls and Monitoring ========================================= diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index dab90c280476..404483437bd3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -220,7 +220,7 @@ static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev, * As their names indicate, inject operation will write the * value to the address. * - * Second member: struct ras_debug_if::op. + * The second member: struct ras_debug_if::op. * It has three kinds of operations. * * - 0: disable RAS on the block. Take ::head as its data. @@ -228,14 +228,20 @@ static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev, * - 2: inject errors on the block. Take ::inject as its data. * * How to use the interface? - * programs: - * copy the struct ras_debug_if in your codes and initialize it. - * write the struct to the control node. + * + * Programs + * + * Copy the struct ras_debug_if in your codes and initialize it. + * Write the struct to the control node. + * + * Shells * * .. code-block:: bash * * echo op block [error [sub_block address value]] > .../ras/ras_ctrl * + * Parameters: + * * op: disable, enable, inject * disable: only block is needed * enable: block and error are needed @@ -265,8 +271,10 @@ static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev, * /sys/class/drm/card[0/1/2...]/device/ras/[gfx/sdma/...]_err_count * * .. note:: - * Operation is only allowed on blocks which are supported. + * Operations are only allowed on blocks which are supported. * Please check ras mask at /sys/module/amdgpu/parameters/ras_mask + * to see which blocks support RAS on a particular asic. 
+ * */ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *buf, size_t size, loff_t *pos) @@ -322,7 +330,7 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user * * DOC: AMDGPU RAS debugfs EEPROM table reset interface * * Some boards contain an EEPROM which is used to persistently store a list of - * bad pages containing ECC errors detected in vram. This interface provides + * bad pages which experienced ECC errors in vram. This interface provides * a way to reset the EEPROM, e.g., after testing error injection. * * Usage: @@ -362,7 +370,7 @@ static const struct file_operations amdgpu_ras_debugfs_eeprom_ops = { /** * DOC: AMDGPU RAS sysfs Error Count Interface * - * It allows user to read the error count for each IP block on the gpu through + * It allows the user to read the error count for each IP block on the gpu through * /sys/class/drm/card[0/1/2...]/device/ras/[gfx/sdma/...]_err_count * * It outputs the multiple lines which report the uncorrected (ue) and corrected @@ -1027,6 +1035,24 @@ static int amdgpu_ras_sysfs_remove_all(struct amdgpu_device *adev) } /* sysfs end */ +/** + * DOC: AMDGPU RAS Reboot Behavior for Unrecoverable Errors + * + * Normally when there is an uncorrectable error, the driver will reset + * the GPU to recover. However, in the event of an unrecoverable error, + * the driver provides an interface to reboot the system automatically + * in that event. + * + * The following file in debugfs provides that interface: + * /sys/kernel/debug/dri/[0/1/2...]/ras/auto_reboot + * + * Usage: + * + * .. code-block:: bash + * + * echo true > .../ras/auto_reboot + * + */ /* debugfs begin */ static void amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev) { -- cgit v1.2.3