From 12ffa55da60f8355a5e485bc6d612257a303147e Mon Sep 17 00:00:00 2001 From: Andrey Grodzovsky Date: Fri, 30 Aug 2019 10:31:18 -0400 Subject: drm/amdgpu: Fix bugs in amdgpu_device_gpu_recover in XGMI case. Issue 1: In the XGMI case, amdgpu_device_lock_adev for the other devices in the hive was called too late, after their respective schedulers had already been accessed. So relocate the lock to the beginning, before accessing the other devs. Issue 2: Using amdgpu_device_ip_need_full_reset to switch the device list from all devices in the hive to the single 'master' device that owns this reset call is wrong, because when stopping schedulers we iterate over all the devices in the hive, but when restarting we would only reactivate the 'master' device. Also, in case amdgpu_device_pre_asic_reset concludes that a full reset IS needed, we then have to stop schedulers for all devices in the hive and not only the 'master', but with amdgpu_device_ip_need_full_reset we have already missed the opportunity to do so. So just remove this logic and always stop and start all schedulers for all devices in the hive. Also minor cleanup and print fix. v4: Minor coding style fix. Signed-off-by: Andrey Grodzovsky Acked-by: Felix Kuehling Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 5a1939dbd4e3..a4d38637cdc6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3823,15 +3823,16 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, device_list_handle = &device_list; } - /* - * Mark these ASICs to be reseted as untracked first - * And add them back after reset completed - */ - list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) - amdgpu_unregister_gpu_instance(tmp_adev); - /* block all schedulers and reset given job's ring */ list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { + if (tmp_adev != adev) + amdgpu_device_lock_adev(tmp_adev, false); + /* + * Mark these ASICs to be reseted as untracked first + * And add them back after reset completed + */ + amdgpu_unregister_gpu_instance(tmp_adev); + /* disable ras on ALL IPs */ if (amdgpu_device_ip_need_full_reset(tmp_adev)) amdgpu_ras_suspend(tmp_adev); @@ -3857,9 +3858,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, dma_fence_is_signaled(job->base.s_fence->parent)) job_signaled = true; - if (!amdgpu_device_ip_need_full_reset(adev)) - device_list_handle = &device_list; - if (job_signaled) { dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); goto skip_hw_reset; @@ -3881,7 +3879,6 @@ retry: /* Rest of adevs pre asic reset from XGMI hive. */ if (tmp_adev == adev) continue; - amdgpu_device_lock_adev(tmp_adev, false); r = amdgpu_device_pre_asic_reset(tmp_adev, NULL, &need_full_reset); @@ -3930,10 +3927,10 @@ skip_hw_reset: if (r) { /* bad news, how to tell it to userspace ?
*/ - dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&adev->gpu_reset_counter)); + dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter)); amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); } else { - dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&adev->gpu_reset_counter)); + dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); } amdgpu_device_unlock_adev(tmp_adev); -- cgit v1.2.3 From 7c6e68c777f109484559a35b125a773439bbd319 Mon Sep 17 00:00:00 2001 From: Andrey Grodzovsky Date: Fri, 13 Sep 2019 17:40:32 -0500 Subject: drm/amdgpu: Avoid HW GPU reset for RAS. Problem: Under certain conditions, when some IP blocks take a RAS error, we can get into a situation where a GPU reset is not possible due to issues in RAS in SMU/PSP. Temporary fix until a proper solution in PSP/SMU is ready: When an uncorrectable error happens, the DF will unconditionally broadcast error event packets to all its clients/slaves upon receiving the fatal error event and freeze all its outbound queues, and the err_event_athub interrupt will be triggered. In such a case we use this interrupt to issue a GPU reset. The GPU reset code is modified for this case to avoid a HW reset: it only stops the schedulers, detaches all in-progress and not-yet-scheduled jobs' fences, sets an error code on them and signals them, and also rejects any new incoming job submissions from user space. All this is done to notify the applications of the problem. v2: Extract amdgpu_amdkfd_pre/post_reset from amdgpu_device_lock/unlock_adev Move amdgpu_job_stop_all_jobs_on_sched to amdgpu_job.c Remove print param from amdgpu_ras_query_error_count v3: Update based on previous bug fixing patch to properly call amdgpu_amdkfd_pre_reset for other XGMI hive members.
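The essence of the scheme is a single global flag that the interrupt path latches and every submission path tests. Below is a condensed sketch of that pattern with shortened, illustrative identifiers; the patch itself uses amdgpu_ras_in_intr, amdgpu_ras_intr_triggered() and amdgpu_ras_global_ras_isr(), as the hunks that follow show.

```c
/* Illustrative sketch only, not part of the patch. */
#include <linux/atomic.h>
#include <linux/errno.h>
#include <linux/printk.h>

static atomic_t ras_in_intr = ATOMIC_INIT(0);

/* Interrupt path: latch the "fatal RAS error seen" flag exactly once. */
static void ras_global_isr(void)
{
	if (atomic_cmpxchg(&ras_in_intr, 0, 1) == 0)
		pr_warn("RAS error event detected, stopping all GPU jobs\n");
}

/* Submission entry points (CS ioctl, KMS open) test the flag and bail out early. */
static int cs_submit_checked(void)
{
	if (atomic_read(&ras_in_intr))
		return -EHWPOISON;	/* device is poisoned, reject new work */

	/* ... normal submission path ... */
	return 0;
}
```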
Signed-off-by: Andrey Grodzovsky Acked-by: Felix Kuehling Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 4 ++++ drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 38 ++++++++++++++++++++++-------- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 5 ++++ drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 38 ++++++++++++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_job.h | 3 +++ drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 6 +++++ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 22 +++++++++++++++-- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 10 ++++++++ drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 10 ++++---- drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 24 ++++++++++--------- drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c | 5 ++++ drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c | 32 +++++++++++++------------ 12 files changed, 155 insertions(+), 42 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c index 324919d57c89..f6537476b542 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c @@ -35,6 +35,7 @@ #include "amdgpu_trace.h" #include "amdgpu_gmc.h" #include "amdgpu_gem.h" +#include "amdgpu_ras.h" static int amdgpu_cs_user_fence_chunk(struct amdgpu_cs_parser *p, struct drm_amdgpu_cs_chunk_fence *data, @@ -1290,6 +1291,9 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) bool reserved_buffers = false; int i, r; + if (amdgpu_ras_intr_triggered()) + return -EHWPOISON; + if (!adev->accel_working) return -EBUSY; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index a4d38637cdc6..98ff987ae940 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3736,25 +3736,18 @@ static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, bool trylock) adev->mp1_state = PP_MP1_STATE_NONE; break; } - /* Block kfd: SRIOV would do it separately */ - if (!amdgpu_sriov_vf(adev)) - amdgpu_amdkfd_pre_reset(adev); return true; } static void amdgpu_device_unlock_adev(struct amdgpu_device *adev) { - /*unlock kfd: SRIOV would do it separately */ - if (!amdgpu_sriov_vf(adev)) - amdgpu_amdkfd_post_reset(adev); amdgpu_vf_error_trans_all(adev); adev->mp1_state = PP_MP1_STATE_NONE; adev->in_gpu_reset = 0; mutex_unlock(&adev->lock_reset); } - /** * amdgpu_device_gpu_recover - reset the asic and recover scheduler * @@ -3774,11 +3767,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, struct amdgpu_hive_info *hive = NULL; struct amdgpu_device *tmp_adev = NULL; int i, r = 0; + bool in_ras_intr = amdgpu_ras_intr_triggered(); need_full_reset = job_signaled = false; INIT_LIST_HEAD(&device_list); - dev_info(adev->dev, "GPU reset begin!\n"); + dev_info(adev->dev, "GPU %s begin!\n", in_ras_intr ? 
"jobs stop":"reset"); cancel_delayed_work_sync(&adev->delayed_init_work); @@ -3805,9 +3799,16 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, return 0; } + /* Block kfd: SRIOV would do it separately */ + if (!amdgpu_sriov_vf(adev)) + amdgpu_amdkfd_pre_reset(adev); + /* Build list of devices to reset */ if (adev->gmc.xgmi.num_physical_nodes > 1) { if (!hive) { + /*unlock kfd: SRIOV would do it separately */ + if (!amdgpu_sriov_vf(adev)) + amdgpu_amdkfd_post_reset(adev); amdgpu_device_unlock_adev(adev); return -ENODEV; } @@ -3825,8 +3826,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, /* block all schedulers and reset given job's ring */ list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { - if (tmp_adev != adev) + if (tmp_adev != adev) { amdgpu_device_lock_adev(tmp_adev, false); + if (!amdgpu_sriov_vf(tmp_adev)) + amdgpu_amdkfd_pre_reset(tmp_adev); + } + /* * Mark these ASICs to be reseted as untracked first * And add them back after reset completed @@ -3834,7 +3839,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, amdgpu_unregister_gpu_instance(tmp_adev); /* disable ras on ALL IPs */ - if (amdgpu_device_ip_need_full_reset(tmp_adev)) + if (!in_ras_intr && amdgpu_device_ip_need_full_reset(tmp_adev)) amdgpu_ras_suspend(tmp_adev); for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { @@ -3844,10 +3849,16 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, continue; drm_sched_stop(&ring->sched, job ? &job->base : NULL); + + if (in_ras_intr) + amdgpu_job_stop_all_jobs_on_sched(&ring->sched); } } + if (in_ras_intr) + goto skip_sched_resume; + /* * Must check guilty signal here since after this point all old * HW fences are force signaled. @@ -3906,6 +3917,7 @@ skip_hw_reset: /* Post ASIC reset for all devs .*/ list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { + for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { struct amdgpu_ring *ring = tmp_adev->rings[i]; @@ -3932,7 +3944,13 @@ skip_hw_reset: } else { dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); } + } +skip_sched_resume: + list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) { + /*unlock kfd: SRIOV would do it separately */ + if (!in_ras_intr && !amdgpu_sriov_vf(tmp_adev)) + amdgpu_amdkfd_post_reset(tmp_adev); amdgpu_device_unlock_adev(tmp_adev); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 48a2070e72f2..62fe102ed39e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -42,6 +42,8 @@ #include "amdgpu_amdkfd.h" +#include "amdgpu_ras.h" + /* * KMS wrapper. * - 3.0.0 - initial driver @@ -1098,6 +1100,9 @@ amdgpu_pci_shutdown(struct pci_dev *pdev) struct drm_device *dev = pci_get_drvdata(pdev); struct amdgpu_device *adev = dev->dev_private; + if (amdgpu_ras_intr_triggered()) + return; + /* if we are running in a VM, make sure the device * torn down properly on reboot/shutdown. 
* unfortunately we can't detect certain diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index 9d76e0923a5a..e1bad992e83b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -246,6 +246,44 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job) return fence; } +#define to_drm_sched_job(sched_job) \ + container_of((sched_job), struct drm_sched_job, queue_node) + +void amdgpu_job_stop_all_jobs_on_sched(struct drm_gpu_scheduler *sched) +{ + struct drm_sched_job *s_job; + struct drm_sched_entity *s_entity = NULL; + int i; + + /* Signal all jobs not yet scheduled */ + for (i = DRM_SCHED_PRIORITY_MAX - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) { + struct drm_sched_rq *rq = &sched->sched_rq[i]; + + if (!rq) + continue; + + spin_lock(&rq->lock); + list_for_each_entry(s_entity, &rq->entities, list) { + while ((s_job = to_drm_sched_job(spsc_queue_pop(&s_entity->job_queue)))) { + struct drm_sched_fence *s_fence = s_job->s_fence; + + dma_fence_signal(&s_fence->scheduled); + dma_fence_set_error(&s_fence->finished, -EHWPOISON); + dma_fence_signal(&s_fence->finished); + } + } + spin_unlock(&rq->lock); + } + + /* Signal all jobs already scheduled to HW */ + list_for_each_entry(s_job, &sched->ring_mirror_list, node) { + struct drm_sched_fence *s_fence = s_job->s_fence; + + dma_fence_set_error(&s_fence->finished, -EHWPOISON); + dma_fence_signal(&s_fence->finished); + } +} + const struct drm_sched_backend_ops amdgpu_sched_ops = { .dependency = amdgpu_job_dependency, .run_job = amdgpu_job_run, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h index 51e62504c279..dc7ee9358dcd 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h @@ -76,4 +76,7 @@ int amdgpu_job_submit(struct amdgpu_job *job, struct drm_sched_entity *entity, void *owner, struct dma_fence **f); int amdgpu_job_submit_direct(struct amdgpu_job *job, struct amdgpu_ring *ring, struct dma_fence **fence); + +void amdgpu_job_stop_all_jobs_on_sched(struct drm_gpu_scheduler *sched); + #endif diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c index c28dc079a0a1..e42fe034aacd 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c @@ -1004,6 +1004,12 @@ int amdgpu_driver_open_kms(struct drm_device *dev, struct drm_file *file_priv) /* Ensure IB tests are run on ring */ flush_delayed_work(&adev->delayed_init_work); + + if (amdgpu_ras_intr_triggered()) { + DRM_ERROR("RAS Intr triggered, device disabled!!"); + return -EHWPOISON; + } + file_priv->driver_priv = NULL; r = pm_runtime_get_sync(dev->dev); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 0b466d101f53..d7bf8fc10869 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -25,6 +25,8 @@ #include #include #include +#include +#include #include "amdgpu.h" #include "amdgpu_ras.h" @@ -66,6 +68,9 @@ const char *ras_block_string[] = { /* inject address is 52 bits */ #define RAS_UMC_INJECT_ADDR_LIMIT (0x1ULL << 52) + +atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0); + static int amdgpu_ras_reserve_vram(struct amdgpu_device *adev, uint64_t offset, uint64_t size, struct amdgpu_bo **bo_ptr); @@ -190,6 +195,10 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, return 0; } + +static struct ras_manager 
*amdgpu_ras_find_obj(struct amdgpu_device *adev, + struct ras_common_if *head); + /** * DOC: AMDGPU RAS debugfs control interface * @@ -629,12 +638,14 @@ int amdgpu_ras_error_query(struct amdgpu_device *adev, info->ue_count = obj->err_data.ue_count; info->ce_count = obj->err_data.ce_count; - if (err_data.ce_count) + if (err_data.ce_count) { dev_info(adev->dev, "%ld correctable errors detected in %s block\n", obj->err_data.ce_count, ras_block_str(info->head.block)); - if (err_data.ue_count) + } + if (err_data.ue_count) { dev_info(adev->dev, "%ld uncorrectable errors detected in %s block\n", obj->err_data.ue_count, ras_block_str(info->head.block)); + } return 0; } @@ -1731,3 +1742,10 @@ int amdgpu_ras_fini(struct amdgpu_device *adev) return 0; } + +void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev) +{ + if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) { + DRM_WARN("RAS event of type ERREVENT_ATHUB_INTERRUPT detected! Stopping all GPU jobs.\n"); + } +} diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 66b71525446e..6fda96b29f1f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -606,4 +606,14 @@ int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev, int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev, struct ras_dispatch_if *info); + +extern atomic_t amdgpu_ras_in_intr; + +static inline bool amdgpu_ras_intr_triggered(void) +{ + return !!atomic_read(&amdgpu_ras_in_intr); +} + +void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev); + #endif diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index 6065f363fa85..196a14236445 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -5685,10 +5685,12 @@ static int gfx_v9_0_process_ras_data_cb(struct amdgpu_device *adev, struct amdgpu_iv_entry *entry) { /* TODO ue will trigger an interrupt. 
*/ - kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); - if (adev->gfx.funcs->query_ras_error_count) - adev->gfx.funcs->query_ras_error_count(adev, err_data); - amdgpu_ras_reset_gpu(adev, 0); + if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) { + kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); + if (adev->gfx.funcs->query_ras_error_count) + adev->gfx.funcs->query_ras_error_count(adev, err_data); + amdgpu_ras_reset_gpu(adev, 0); + } return AMDGPU_RAS_SUCCESS; } diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index 6a61e5c5b2ce..f1300d5f4f87 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -247,18 +247,20 @@ static int gmc_v9_0_process_ras_data_cb(struct amdgpu_device *adev, struct ras_err_data *err_data, struct amdgpu_iv_entry *entry) { - kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); - if (adev->umc.funcs->query_ras_error_count) - adev->umc.funcs->query_ras_error_count(adev, err_data); - /* umc query_ras_error_address is also responsible for clearing - * error status - */ - if (adev->umc.funcs->query_ras_error_address) - adev->umc.funcs->query_ras_error_address(adev, err_data); + if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) { + kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); + if (adev->umc.funcs->query_ras_error_count) + adev->umc.funcs->query_ras_error_count(adev, err_data); + /* umc query_ras_error_address is also responsible for clearing + * error status + */ + if (adev->umc.funcs->query_ras_error_address) + adev->umc.funcs->query_ras_error_address(adev, err_data); - /* only uncorrectable error needs gpu reset */ - if (err_data->ue_count) - amdgpu_ras_reset_gpu(adev, 0); + /* only uncorrectable error needs gpu reset */ + if (err_data->ue_count) + amdgpu_ras_reset_gpu(adev, 0); + } return AMDGPU_RAS_SUCCESS; } diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c index 5e784bbd2d7f..27eeab143ad7 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c @@ -30,6 +30,7 @@ #include "nbio/nbio_7_4_0_smn.h" #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h" #include +#include "amdgpu_ras.h" #define smnNBIF_MGCG_CTRL_LCLK 0x1013a21c @@ -329,6 +330,8 @@ static void nbio_v7_4_handle_ras_controller_intr_no_bifring(struct amdgpu_device BIF_DOORBELL_INT_CNTL, RAS_CNTLR_INTERRUPT_CLEAR, 1); WREG32_SOC15(NBIO, 0, mmBIF_DOORBELL_INT_CNTL, bif_doorbell_intr_cntl); + + amdgpu_ras_global_ras_isr(adev); } } @@ -344,6 +347,8 @@ static void nbio_v7_4_handle_ras_err_event_athub_intr_no_bifring(struct amdgpu_d BIF_DOORBELL_INT_CNTL, RAS_ATHUB_ERR_EVENT_INTERRUPT_CLEAR, 1); WREG32_SOC15(NBIO, 0, mmBIF_DOORBELL_INT_CNTL, bif_doorbell_intr_cntl); + + amdgpu_ras_global_ras_isr(adev); } } diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c index 4830382ab8f2..4aabb0d9bae5 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c @@ -1979,24 +1979,26 @@ static int sdma_v4_0_process_ras_data_cb(struct amdgpu_device *adev, uint32_t err_source; int instance; - instance = sdma_v4_0_irq_id_to_seq(entry->client_id); - if (instance < 0) - return 0; + if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) { + instance = sdma_v4_0_irq_id_to_seq(entry->client_id); + if (instance < 0) + return 0; - switch (entry->src_id) { - case SDMA0_4_0__SRCID__SDMA_SRAM_ECC: - err_source = 0; - break; - case SDMA0_4_0__SRCID__SDMA_ECC: - err_source = 1; - break; - default: - return 0; - } + switch 
(entry->src_id) { + case SDMA0_4_0__SRCID__SDMA_SRAM_ECC: + err_source = 0; + break; + case SDMA0_4_0__SRCID__SDMA_ECC: + err_source = 1; + break; + default: + return 0; + } - kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); + kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); - amdgpu_ras_reset_gpu(adev, 0); + amdgpu_ras_reset_gpu(adev, 0); + } return AMDGPU_RAS_SUCCESS; } -- cgit v1.2.3 From d5ea093eebf022ec69970107db45dc06318d7e5a Mon Sep 17 00:00:00 2001 From: Andrey Grodzovsky Date: Thu, 22 Aug 2019 15:01:37 -0400 Subject: dmr/amdgpu: Add system auto reboot to RAS. In case of RAS error allow user configure auto system reboot through ras_ctrl. This is also part of the temproray work around for the RAS hang problem. v4: Use latest kernel API for disk sync. Signed-off-by: Andrey Grodzovsky Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 14 ++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 9 ++++++++- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 2 +- 3 files changed, 23 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 98ff987ae940..e89aa2dc5c11 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -65,6 +65,8 @@ #include "amdgpu_ras.h" #include "amdgpu_pmu.h" +#include + MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin"); MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin"); MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin"); @@ -3769,6 +3771,18 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, int i, r = 0; bool in_ras_intr = amdgpu_ras_intr_triggered(); + /* + * Flush RAM to disk so that after reboot + * the user can read log and see why the system rebooted. + */ + if (in_ras_intr && amdgpu_ras_get_context(adev)->reboot) { + + DRM_WARN("Emergency reboot."); + + ksys_sync_helper(); + emergency_restart(); + } + need_full_reset = job_signaled = false; INIT_LIST_HEAD(&device_list); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index d7bf8fc10869..270110db128f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -156,6 +156,8 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, op = 1; else if (sscanf(str, "inject %32s %8s", block_name, err) == 2) op = 2; + else if (sscanf(str, "reboot %32s", block_name) == 1) + op = 3; else if (str[0] && str[1] && str[2] && str[3]) /* ascii string, but commands are not matched. */ return -EINVAL; @@ -289,6 +291,9 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user * /* data.inject.address is offset instead of absolute gpu address */ ret = amdgpu_ras_error_inject(adev, &data.inject); break; + case 3: + amdgpu_ras_get_context(adev)->reboot = true; + break; default: ret = -EINVAL; break; @@ -1746,6 +1751,8 @@ int amdgpu_ras_fini(struct amdgpu_device *adev) void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev) { if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) { - DRM_WARN("RAS event of type ERREVENT_ATHUB_INTERRUPT detected! 
Stopping all GPU jobs.\n"); + DRM_WARN("RAS event of type ERREVENT_ATHUB_INTERRUPT detected!\n"); + + amdgpu_ras_reset_gpu(adev, false); } } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 6fda96b29f1f..f487038ba331 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -334,7 +334,7 @@ struct amdgpu_ras { struct mutex recovery_lock; uint32_t flags; - + bool reboot; struct amdgpu_ras_eeprom_control eeprom_control; }; -- cgit v1.2.3 From 050091ab6e832f7df0b700f1b9b596613643ada4 Mon Sep 17 00:00:00 2001 From: Yong Zhao Date: Tue, 3 Sep 2019 17:55:30 -0400 Subject: drm/amdkfd: Query kfd device info by CHIP id instead of pci device id This optimizes out the pci device id usage in KFD and makes the code more maintainable. Signed-off-by: Yong Zhao Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 6 +- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 3 +- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +- drivers/gpu/drm/amd/amdkfd/kfd_device.c | 175 +++++------------------------ include/drm/amd_asic_type.h | 2 + 5 files changed, 37 insertions(+), 151 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index 07eb29885372..9f061fe55d09 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c @@ -64,6 +64,7 @@ void amdgpu_amdkfd_fini(void) void amdgpu_amdkfd_device_probe(struct amdgpu_device *adev) { const struct kfd2kgd_calls *kfd2kgd; + bool vf = amdgpu_sriov_vf(adev); switch (adev->asic_type) { #ifdef CONFIG_DRM_AMDGPU_CIK @@ -101,7 +102,7 @@ void amdgpu_amdkfd_device_probe(struct amdgpu_device *adev) } adev->kfd.dev = kgd2kfd_probe((struct kgd_dev *)adev, - adev->pdev, kfd2kgd); + adev->pdev, kfd2kgd, adev->asic_type, vf); if (adev->kfd.dev) amdgpu_amdkfd_total_mem_size += adev->gmc.real_vram_size; @@ -735,7 +736,8 @@ struct kfd2kgd_calls *amdgpu_amdkfd_gfx_10_0_get_functions(void) } struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd, struct pci_dev *pdev, - const struct kfd2kgd_calls *f2g) + const struct kfd2kgd_calls *f2g, + unsigned int asic_type, bool vf) { return NULL; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index c003d9275837..e39c106ac634 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -248,7 +248,8 @@ void amdgpu_amdkfd_unreserve_memory_limit(struct amdgpu_bo *bo); int kgd2kfd_init(void); void kgd2kfd_exit(void); struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd, struct pci_dev *pdev, - const struct kfd2kgd_calls *f2g); + const struct kfd2kgd_calls *f2g, + unsigned int asic_type, bool vf); bool kgd2kfd_device_init(struct kfd_dev *kfd, const struct kgd2kfd_shared_resources *gpu_resources); void kgd2kfd_device_exit(struct kfd_dev *kfd); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index e89aa2dc5c11..3ad034aa0e3c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -80,7 +80,7 @@ MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin"); #define AMDGPU_RESUME_MS 2000 -static const char *amdgpu_asic_name[] = { +const char *amdgpu_asic_name[] = { "TAHITI", "PITCAIRN", "VERDE", diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index 0dc1084b5e82..267eb2e01bec 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -369,133 +369,23 @@ static const struct kfd_device_info navi10_device_info = { .num_sdma_queues_per_engine = 8, }; -struct kfd_deviceid { - unsigned short did; - const struct kfd_device_info *device_info; -}; - -static const struct kfd_deviceid supported_devices[] = { -#ifdef KFD_SUPPORT_IOMMU_V2 - { 0x1304, &kaveri_device_info }, /* Kaveri */ - { 0x1305, &kaveri_device_info }, /* Kaveri */ - { 0x1306, &kaveri_device_info }, /* Kaveri */ - { 0x1307, &kaveri_device_info }, /* Kaveri */ - { 0x1309, &kaveri_device_info }, /* Kaveri */ - { 0x130A, &kaveri_device_info }, /* Kaveri */ - { 0x130B, &kaveri_device_info }, /* Kaveri */ - { 0x130C, &kaveri_device_info }, /* Kaveri */ - { 0x130D, &kaveri_device_info }, /* Kaveri */ - { 0x130E, &kaveri_device_info }, /* Kaveri */ - { 0x130F, &kaveri_device_info }, /* Kaveri */ - { 0x1310, &kaveri_device_info }, /* Kaveri */ - { 0x1311, &kaveri_device_info }, /* Kaveri */ - { 0x1312, &kaveri_device_info }, /* Kaveri */ - { 0x1313, &kaveri_device_info }, /* Kaveri */ - { 0x1315, &kaveri_device_info }, /* Kaveri */ - { 0x1316, &kaveri_device_info }, /* Kaveri */ - { 0x1317, &kaveri_device_info }, /* Kaveri */ - { 0x1318, &kaveri_device_info }, /* Kaveri */ - { 0x131B, &kaveri_device_info }, /* Kaveri */ - { 0x131C, &kaveri_device_info }, /* Kaveri */ - { 0x131D, &kaveri_device_info }, /* Kaveri */ - { 0x9870, &carrizo_device_info }, /* Carrizo */ - { 0x9874, &carrizo_device_info }, /* Carrizo */ - { 0x9875, &carrizo_device_info }, /* Carrizo */ - { 0x9876, &carrizo_device_info }, /* Carrizo */ - { 0x9877, &carrizo_device_info }, /* Carrizo */ - { 0x15DD, &raven_device_info }, /* Raven */ - { 0x15D8, &raven_device_info }, /* Raven */ -#endif - { 0x67A0, &hawaii_device_info }, /* Hawaii */ - { 0x67A1, &hawaii_device_info }, /* Hawaii */ - { 0x67A2, &hawaii_device_info }, /* Hawaii */ - { 0x67A8, &hawaii_device_info }, /* Hawaii */ - { 0x67A9, &hawaii_device_info }, /* Hawaii */ - { 0x67AA, &hawaii_device_info }, /* Hawaii */ - { 0x67B0, &hawaii_device_info }, /* Hawaii */ - { 0x67B1, &hawaii_device_info }, /* Hawaii */ - { 0x67B8, &hawaii_device_info }, /* Hawaii */ - { 0x67B9, &hawaii_device_info }, /* Hawaii */ - { 0x67BA, &hawaii_device_info }, /* Hawaii */ - { 0x67BE, &hawaii_device_info }, /* Hawaii */ - { 0x6920, &tonga_device_info }, /* Tonga */ - { 0x6921, &tonga_device_info }, /* Tonga */ - { 0x6928, &tonga_device_info }, /* Tonga */ - { 0x6929, &tonga_device_info }, /* Tonga */ - { 0x692B, &tonga_device_info }, /* Tonga */ - { 0x6938, &tonga_device_info }, /* Tonga */ - { 0x6939, &tonga_device_info }, /* Tonga */ - { 0x7300, &fiji_device_info }, /* Fiji */ - { 0x730F, &fiji_vf_device_info }, /* Fiji vf*/ - { 0x67C0, &polaris10_device_info }, /* Polaris10 */ - { 0x67C1, &polaris10_device_info }, /* Polaris10 */ - { 0x67C2, &polaris10_device_info }, /* Polaris10 */ - { 0x67C4, &polaris10_device_info }, /* Polaris10 */ - { 0x67C7, &polaris10_device_info }, /* Polaris10 */ - { 0x67C8, &polaris10_device_info }, /* Polaris10 */ - { 0x67C9, &polaris10_device_info }, /* Polaris10 */ - { 0x67CA, &polaris10_device_info }, /* Polaris10 */ - { 0x67CC, &polaris10_device_info }, /* Polaris10 */ - { 0x67CF, &polaris10_device_info }, /* Polaris10 */ - { 0x67D0, &polaris10_vf_device_info }, /* Polaris10 vf*/ - { 0x67DF, &polaris10_device_info }, /* Polaris10 */ - { 0x6FDF, 
&polaris10_device_info }, /* Polaris10 */ - { 0x67E0, &polaris11_device_info }, /* Polaris11 */ - { 0x67E1, &polaris11_device_info }, /* Polaris11 */ - { 0x67E3, &polaris11_device_info }, /* Polaris11 */ - { 0x67E7, &polaris11_device_info }, /* Polaris11 */ - { 0x67E8, &polaris11_device_info }, /* Polaris11 */ - { 0x67E9, &polaris11_device_info }, /* Polaris11 */ - { 0x67EB, &polaris11_device_info }, /* Polaris11 */ - { 0x67EF, &polaris11_device_info }, /* Polaris11 */ - { 0x67FF, &polaris11_device_info }, /* Polaris11 */ - { 0x6980, &polaris12_device_info }, /* Polaris12 */ - { 0x6981, &polaris12_device_info }, /* Polaris12 */ - { 0x6985, &polaris12_device_info }, /* Polaris12 */ - { 0x6986, &polaris12_device_info }, /* Polaris12 */ - { 0x6987, &polaris12_device_info }, /* Polaris12 */ - { 0x6995, &polaris12_device_info }, /* Polaris12 */ - { 0x6997, &polaris12_device_info }, /* Polaris12 */ - { 0x699F, &polaris12_device_info }, /* Polaris12 */ - { 0x694C, &vegam_device_info }, /* VegaM */ - { 0x694E, &vegam_device_info }, /* VegaM */ - { 0x694F, &vegam_device_info }, /* VegaM */ - { 0x6860, &vega10_device_info }, /* Vega10 */ - { 0x6861, &vega10_device_info }, /* Vega10 */ - { 0x6862, &vega10_device_info }, /* Vega10 */ - { 0x6863, &vega10_device_info }, /* Vega10 */ - { 0x6864, &vega10_device_info }, /* Vega10 */ - { 0x6867, &vega10_device_info }, /* Vega10 */ - { 0x6868, &vega10_device_info }, /* Vega10 */ - { 0x6869, &vega10_device_info }, /* Vega10 */ - { 0x686A, &vega10_device_info }, /* Vega10 */ - { 0x686B, &vega10_device_info }, /* Vega10 */ - { 0x686C, &vega10_vf_device_info }, /* Vega10 vf*/ - { 0x686D, &vega10_device_info }, /* Vega10 */ - { 0x686E, &vega10_device_info }, /* Vega10 */ - { 0x686F, &vega10_device_info }, /* Vega10 */ - { 0x687F, &vega10_device_info }, /* Vega10 */ - { 0x69A0, &vega12_device_info }, /* Vega12 */ - { 0x69A1, &vega12_device_info }, /* Vega12 */ - { 0x69A2, &vega12_device_info }, /* Vega12 */ - { 0x69A3, &vega12_device_info }, /* Vega12 */ - { 0x69AF, &vega12_device_info }, /* Vega12 */ - { 0x66a0, &vega20_device_info }, /* Vega20 */ - { 0x66a1, &vega20_device_info }, /* Vega20 */ - { 0x66a2, &vega20_device_info }, /* Vega20 */ - { 0x66a3, &vega20_device_info }, /* Vega20 */ - { 0x66a4, &vega20_device_info }, /* Vega20 */ - { 0x66a7, &vega20_device_info }, /* Vega20 */ - { 0x66af, &vega20_device_info }, /* Vega20 */ - { 0x738C, &arcturus_device_info }, /* Arcturus */ - { 0x7388, &arcturus_device_info }, /* Arcturus */ - { 0x738E, &arcturus_device_info }, /* Arcturus */ - { 0x7390, &arcturus_device_info }, /* Arcturus vf */ - { 0x7310, &navi10_device_info }, /* Navi10 */ - { 0x7312, &navi10_device_info }, /* Navi10 */ - { 0x7318, &navi10_device_info }, /* Navi10 */ - { 0x731a, &navi10_device_info }, /* Navi10 */ - { 0x731f, &navi10_device_info }, /* Navi10 */ +/* For each entry, [0] is regular and [1] is virtualisation device. 
*/ +static const struct kfd_device_info *kfd_supported_devices[][2] = { + [CHIP_KAVERI] = {&kaveri_device_info, NULL}, + [CHIP_HAWAII] = {&hawaii_device_info, NULL}, + [CHIP_TONGA] = {&tonga_device_info, NULL}, + [CHIP_FIJI] = {&fiji_device_info, &fiji_vf_device_info}, + [CHIP_CARRIZO] = {&carrizo_device_info, NULL}, + [CHIP_POLARIS10] = {&polaris10_device_info, &polaris10_vf_device_info}, + [CHIP_POLARIS11] = {&polaris11_device_info, NULL}, + [CHIP_POLARIS12] = {&polaris12_device_info, NULL}, + [CHIP_VEGAM] = {&vegam_device_info, NULL}, + [CHIP_VEGA10] = {&vega10_device_info, &vega10_vf_device_info}, + [CHIP_VEGA12] = {&vega12_device_info, NULL}, + [CHIP_VEGA20] = {&vega20_device_info, NULL}, + [CHIP_RAVEN] = {&raven_device_info, NULL}, + [CHIP_ARCTURUS] = {&arcturus_device_info, &arcturus_device_info}, + [CHIP_NAVI10] = {&navi10_device_info, NULL}, }; static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size, @@ -504,32 +394,23 @@ static void kfd_gtt_sa_fini(struct kfd_dev *kfd); static int kfd_resume(struct kfd_dev *kfd); -static const struct kfd_device_info *lookup_device_info(unsigned short did) +struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd, + struct pci_dev *pdev, const struct kfd2kgd_calls *f2g, + unsigned int asic_type, bool vf) { - size_t i; + struct kfd_dev *kfd; + const struct kfd_device_info *device_info; - for (i = 0; i < ARRAY_SIZE(supported_devices); i++) { - if (supported_devices[i].did == did) { - WARN_ON(!supported_devices[i].device_info); - return supported_devices[i].device_info; - } + if (asic_type >= sizeof(kfd_supported_devices) / (sizeof(void *) * 2)) { + dev_err(kfd_device, "asic_type %d out of range\n", asic_type); + return NULL; /* asic_type out of range */ } - dev_warn(kfd_device, "DID %04x is missing in supported_devices\n", - did); - - return NULL; -} - -struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd, - struct pci_dev *pdev, const struct kfd2kgd_calls *f2g) -{ - struct kfd_dev *kfd; - const struct kfd_device_info *device_info = - lookup_device_info(pdev->device); + device_info = kfd_supported_devices[asic_type][vf]; if (!device_info) { - dev_err(kfd_device, "kgd2kfd_probe failed\n"); + dev_err(kfd_device, "%s %s not supported in kfd\n", + amdgpu_asic_name[asic_type], vf ? "VF" : ""); return NULL; } diff --git a/include/drm/amd_asic_type.h b/include/drm/amd_asic_type.h index 296aab724677..8d79940cb602 100644 --- a/include/drm/amd_asic_type.h +++ b/include/drm/amd_asic_type.h @@ -57,4 +57,6 @@ enum amd_asic_type { CHIP_LAST, }; +extern const char *amdgpu_asic_name[]; + #endif /*__AMD_ASIC_TYPE_H__ */ -- cgit v1.2.3 From 1a6fc071e1991321d3b6a00e0e7c733a462a4418 Mon Sep 17 00:00:00 2001 From: Tao Zhou Date: Fri, 30 Aug 2019 19:50:39 +0800 Subject: drm/amdgpu: move the call of ras recovery_init and bad page reserve to proper place ras recovery_init should be called after ttm init, bad page reserve should be put in front of gpu reset since i2c may be unstable during gpu reset. add cleanup for recovery_init and recovery_fini v2: add more comment and print. remove cancel_work_sync in recovery_init. 
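The added cleanup follows the usual kernel goto-unwind shape; the sketch below compresses the flow added to amdgpu_ras_recovery_init() in the hunk that follows. The stub helpers stand in for the real amdgpu_ras_eeprom_init() and bad-page calls, and the eh_data bookkeeping is elided.

```c
/* Illustrative sketch of the unwind order only. */
#include <linux/slab.h>
#include <linux/errno.h>
#include <linux/printk.h>

static int eeprom_init_stub(void)        { return 0; }	/* stands in for amdgpu_ras_eeprom_init() */
static int reserve_bad_pages_stub(void)  { return 0; }	/* stands in for amdgpu_ras_reserve_bad_pages() */
static void release_bad_pages_stub(void) { }		/* stands in for amdgpu_ras_release_bad_pages() */

static int recovery_init_sketch(void)
{
	void *data;
	int ret;

	data = kzalloc(64, GFP_KERNEL);		/* stands in for the eh_data allocation */
	if (!data) {
		ret = -ENOMEM;
		goto out;
	}

	ret = eeprom_init_stub();		/* i2c/eeprom access happens here ... */
	if (ret)
		goto free;

	ret = reserve_bad_pages_stub();		/* ... then retired pages get reserved */
	if (ret)
		goto release;

	return 0;

release:
	release_bad_pages_stub();		/* undo the reservation */
free:
	kfree(data);				/* undo the allocation */
out:
	pr_warn("Failed to initialize ras recovery!\n");
	return ret;
}
```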
Signed-off-by: Tao Zhou Reviewed-by: Guchun Chen Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 5 ---- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 39 ++++++++++++++++++++---------- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 5 ++++ drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 12 +++++++++ 4 files changed, 43 insertions(+), 18 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 3ad034aa0e3c..3268291babf8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3630,11 +3630,6 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, break; } } - - list_for_each_entry(tmp_adev, device_list_handle, - gmc.xgmi.head) { - amdgpu_ras_reserve_bad_pages(tmp_adev); - } } } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 53540e067d15..e9bd40ea7ce0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1493,16 +1493,17 @@ out: return 0; } -static int amdgpu_ras_recovery_init(struct amdgpu_device *adev) +int amdgpu_ras_recovery_init(struct amdgpu_device *adev) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_err_handler_data **data = &con->eh_data; int ret; - *data = kmalloc(sizeof(**data), - GFP_KERNEL|__GFP_ZERO); - if (!*data) - return -ENOMEM; + *data = kmalloc(sizeof(**data), GFP_KERNEL | __GFP_ZERO); + if (!*data) { + ret = -ENOMEM; + goto out; + } mutex_init(&con->recovery_lock); INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery); @@ -1511,18 +1512,30 @@ static int amdgpu_ras_recovery_init(struct amdgpu_device *adev) ret = amdgpu_ras_eeprom_init(&adev->psp.ras.ras->eeprom_control); if (ret) - return ret; + goto free; if (adev->psp.ras.ras->eeprom_control.num_recs) { ret = amdgpu_ras_load_bad_pages(adev); if (ret) - return ret; + goto free; ret = amdgpu_ras_reserve_bad_pages(adev); if (ret) - return ret; + goto release; } return 0; + +release: + amdgpu_ras_release_bad_pages(adev); +free: + con->eh_data = NULL; + kfree((*data)->bps); + kfree((*data)->bps_bo); + kfree(*data); +out: + DRM_WARN("Failed to initialize ras recovery!\n"); + + return ret; } static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev) @@ -1530,12 +1543,17 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev) struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_err_handler_data *data = con->eh_data; + /* recovery_init failed to init it, fini is useless */ + if (!data) + return 0; + cancel_work_sync(&con->recovery_work); amdgpu_ras_release_bad_pages(adev); mutex_lock(&con->recovery_lock); con->eh_data = NULL; kfree(data->bps); + kfree(data->bps_bo); kfree(data); mutex_unlock(&con->recovery_lock); @@ -1627,9 +1645,6 @@ int amdgpu_ras_init(struct amdgpu_device *adev) return r; } - if (amdgpu_ras_recovery_init(adev)) - goto recovery_out; - amdgpu_ras_mask &= AMDGPU_RAS_BLOCK_MASK; if (amdgpu_ras_fs_init(adev)) @@ -1644,8 +1659,6 @@ int amdgpu_ras_init(struct amdgpu_device *adev) con->hw_supported, con->supported); return 0; fs_out: - amdgpu_ras_recovery_fini(adev); -recovery_out: amdgpu_ras_set_context(adev, NULL); kfree(con); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 96210e18191e..012034d2ae06 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -480,6 +480,7 @@ 
static inline int amdgpu_ras_is_supported(struct amdgpu_device *adev, return ras && (ras->supported & (1 << block)); } +int amdgpu_ras_recovery_init(struct amdgpu_device *adev); int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev, unsigned int block); @@ -500,6 +501,10 @@ static inline int amdgpu_ras_reset_gpu(struct amdgpu_device *adev, { struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); + /* save bad page to eeprom before gpu reset, + * i2c may be unstable in gpu reset + */ + amdgpu_ras_reserve_bad_pages(adev); if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0) schedule_work(&ras->recovery_work); return 0; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index 13b144c8f67d..54e6dacc34a4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -54,6 +54,7 @@ #include "amdgpu_trace.h" #include "amdgpu_amdkfd.h" #include "amdgpu_sdma.h" +#include "amdgpu_ras.h" #include "bif/bif_4_1_d.h" static int amdgpu_map_buffer(struct ttm_buffer_object *bo, @@ -1777,6 +1778,17 @@ int amdgpu_ttm_init(struct amdgpu_device *adev) adev->gmc.visible_vram_size); #endif + /* + * retired pages will be loaded from eeprom and reserved here, + * it should be called after ttm init since new bo may be created, + * recovery_init may fail, but it can free all resources allocated by + * itself and its failure should not stop amdgpu init process. + * + * Note: theoretically, this should be called before all vram allocations + * to protect retired page from abusing + */ + amdgpu_ras_recovery_init(adev); + /* *The reserved vram for firmware must be pinned to the specified *place on the VRAM, so reserve it early. -- cgit v1.2.3 From 4e66d7d2156243bf6866fc83cda0c4b115627e6d Mon Sep 17 00:00:00 2001 From: Yong Zhao Date: Fri, 30 Aug 2019 18:09:10 -0400 Subject: drm/amdgpu: Add a kernel parameter for specifying the asic type As more and more new asics start to reuse the old device IDs before launch, there is a need to quickly override the existing asic type corresponding to the reused device ID through a kernel parameter. With this, engineers no longer need to rely on local hack patches, facilitating cooperation across teams. 
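Reduced to its moving parts, the override is a read-only module parameter plus a range check before it replaces the PCI-flag-derived type. A hedged sketch, condensed from the hunks below (the wrapper function is illustrative; the real check sits inline in amdgpu_device_init()):

```c
#include <linux/moduleparam.h>
#include "amdgpu.h"	/* assumed driver headers: struct amdgpu_device, CHIP_LAST, AMD_ASIC_MASK */

int amdgpu_force_asic_type = -1;	/* -1 keeps normal detection from the PCI flags */
module_param_named(force_asic_type, amdgpu_force_asic_type, int, 0444);
MODULE_PARM_DESC(force_asic_type,
		 "A non negative value used to specify the asic type for all supported GPUs");

/* Honour the override only if it names a valid chip index. */
static void pick_asic_type_sketch(struct amdgpu_device *adev, uint32_t flags)
{
	if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
		adev->asic_type = amdgpu_force_asic_type;	/* user-supplied override */
	else
		adev->asic_type = flags & AMD_ASIC_MASK;	/* normal PCI-based path */
}
```

At load time this would typically be driven by something like amdgpu.force_asic_type=<N> on the kernel command line or via modprobe, with <N> matching the numeric indices that the amd_asic_type.h hunk below annotates (e.g. 25 for CHIP_NAVI10).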
Signed-off-by: Yong Zhao Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 7 +++- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 11 ++++++ include/drm/amd_asic_type.h | 54 +++++++++++++++--------------- 4 files changed, 45 insertions(+), 28 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index dd9e18f155f0..7723f4ed0130 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -168,6 +168,7 @@ extern int amdgpu_mcbp; extern int amdgpu_discovery; extern int amdgpu_mes; extern int amdgpu_noretry; +extern int amdgpu_force_asic_type; #ifdef CONFIG_DRM_AMDGPU_SI extern int amdgpu_si_support; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 3268291babf8..b09237b6531c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2585,7 +2585,12 @@ int amdgpu_device_init(struct amdgpu_device *adev, adev->ddev = ddev; adev->pdev = pdev; adev->flags = flags; - adev->asic_type = flags & AMD_ASIC_MASK; + + if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST) + adev->asic_type = amdgpu_force_asic_type; + else + adev->asic_type = flags & AMD_ASIC_MASK; + adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT; if (amdgpu_emu_mode == 1) adev->usec_timeout *= 2; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 62fe102ed39e..0f0d6fb6de32 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -146,6 +146,7 @@ int amdgpu_mcbp = 0; int amdgpu_discovery = -1; int amdgpu_mes = 0; int amdgpu_noretry = 1; +int amdgpu_force_asic_type = -1; struct amdgpu_mgpu_info mgpu_info = { .mutex = __MUTEX_INITIALIZER(mgpu_info.mutex), @@ -616,6 +617,16 @@ MODULE_PARM_DESC(noretry, "Disable retry faults (0 = retry enabled, 1 = retry disabled (default))"); module_param_named(noretry, amdgpu_noretry, int, 0644); +/** + * DOC: force_asic_type (int) + * A non negative value used to specify the asic type for all supported GPUs. 
+ */ +MODULE_PARM_DESC(force_asic_type, + "A non negative value used to specify the asic type for all supported GPUs"); +module_param_named(force_asic_type, amdgpu_force_asic_type, int, 0444); + + + #ifdef CONFIG_HSA_AMD /** * DOC: sched_policy (int) diff --git a/include/drm/amd_asic_type.h b/include/drm/amd_asic_type.h index 8d79940cb602..b1230e33d506 100644 --- a/include/drm/amd_asic_type.h +++ b/include/drm/amd_asic_type.h @@ -27,33 +27,33 @@ */ enum amd_asic_type { CHIP_TAHITI = 0, - CHIP_PITCAIRN, - CHIP_VERDE, - CHIP_OLAND, - CHIP_HAINAN, - CHIP_BONAIRE, - CHIP_KAVERI, - CHIP_KABINI, - CHIP_HAWAII, - CHIP_MULLINS, - CHIP_TOPAZ, - CHIP_TONGA, - CHIP_FIJI, - CHIP_CARRIZO, - CHIP_STONEY, - CHIP_POLARIS10, - CHIP_POLARIS11, - CHIP_POLARIS12, - CHIP_VEGAM, - CHIP_VEGA10, - CHIP_VEGA12, - CHIP_VEGA20, - CHIP_RAVEN, - CHIP_ARCTURUS, - CHIP_RENOIR, - CHIP_NAVI10, - CHIP_NAVI14, - CHIP_NAVI12, + CHIP_PITCAIRN, /* 1 */ + CHIP_VERDE, /* 2 */ + CHIP_OLAND, /* 3 */ + CHIP_HAINAN, /* 4 */ + CHIP_BONAIRE, /* 5 */ + CHIP_KAVERI, /* 6 */ + CHIP_KABINI, /* 7 */ + CHIP_HAWAII, /* 8 */ + CHIP_MULLINS, /* 9 */ + CHIP_TOPAZ, /* 10 */ + CHIP_TONGA, /* 11 */ + CHIP_FIJI, /* 12 */ + CHIP_CARRIZO, /* 13 */ + CHIP_STONEY, /* 14 */ + CHIP_POLARIS10, /* 15 */ + CHIP_POLARIS11, /* 16 */ + CHIP_POLARIS12, /* 17 */ + CHIP_VEGAM, /* 18 */ + CHIP_VEGA10, /* 19 */ + CHIP_VEGA12, /* 20 */ + CHIP_VEGA20, /* 21 */ + CHIP_RAVEN, /* 22 */ + CHIP_ARCTURUS, /* 23 */ + CHIP_RENOIR, /* 24 */ + CHIP_NAVI10, /* 25 */ + CHIP_NAVI14, /* 26 */ + CHIP_NAVI12, /* 27 */ CHIP_LAST, }; -- cgit v1.2.3 From aa978594cf7fa0ad91d0671bb1a33ec38fbd3d25 Mon Sep 17 00:00:00 2001 From: Huang Rui Date: Mon, 2 Sep 2019 23:34:32 +0800 Subject: drm/amdgpu: disable gfxoff while use no H/W scheduling policy While gfxoff is enabled, the mmVM_XXX registers will be 0xfffffff while the GFX is in "off" state. KFD queue creattion doesn't use ring based method, so it will trigger a VM fault. 
Signed-off-by: Huang Rui Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 7723f4ed0130..733331837d92 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -169,6 +169,7 @@ extern int amdgpu_discovery; extern int amdgpu_mes; extern int amdgpu_noretry; extern int amdgpu_force_asic_type; +extern int sched_policy; #ifdef CONFIG_DRM_AMDGPU_SI extern int amdgpu_si_support; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index b09237b6531c..1affaa4ee1ee 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -1624,7 +1624,7 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) } adev->pm.pp_feature = amdgpu_pp_feature_mask; - if (amdgpu_sriov_vf(adev)) + if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) adev->pm.pp_feature &= ~PP_GFXOFF_MASK; for (i = 0; i < adev->num_ip_blocks; i++) { -- cgit v1.2.3 From 8c9f69bc5cc40ca949a9f0b12e05d81bb5a9f446 Mon Sep 17 00:00:00 2001 From: Shirish S Date: Tue, 10 Sep 2019 12:45:01 +0530 Subject: drm/amdgpu: fix build error without CONFIG_HSA_AMD If CONFIG_HSA_AMD is not set, build fails: drivers/gpu/drm/amd/amdgpu/amdgpu_device.o: In function `amdgpu_device_ip_early_init': drivers/gpu/drm/amd/amdgpu/amdgpu_device.c:1626: undefined reference to `sched_policy' Use CONFIG_HSA_AMD to guard this. Fixes: 1abb680ad371 ("drm/amdgpu: disable gfxoff while use no H/W scheduling policy") Signed-off-by: Shirish S Reviewed-by: Huang Rui Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 6 +++++- 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 733331837d92..d1bd0c5d81a1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -169,7 +169,9 @@ extern int amdgpu_discovery; extern int amdgpu_mes; extern int amdgpu_noretry; extern int amdgpu_force_asic_type; +#ifdef CONFIG_HSA_AMD extern int sched_policy; +#endif #ifdef CONFIG_DRM_AMDGPU_SI extern int amdgpu_si_support; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 1affaa4ee1ee..b893ec935b84 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -1624,7 +1624,11 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) } adev->pm.pp_feature = amdgpu_pp_feature_mask; - if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) + if (amdgpu_sriov_vf(adev) + #ifdef CONFIG_HSA_AMD + || sched_policy == KFD_SCHED_POLICY_NO_HWS + #endif + ) adev->pm.pp_feature &= ~PP_GFXOFF_MASK; for (i = 0; i < adev->num_ip_blocks; i++) { -- cgit v1.2.3 From a35ad98bf9d37bdcb46d918edebf87334bfad321 Mon Sep 17 00:00:00 2001 From: Shirish S Date: Thu, 12 Sep 2019 12:03:55 +0530 Subject: drm/amdgpu: remove needless usage of #ifdef define sched_policy in case CONFIG_HSA_AMD is not enabled, with this there is no need to check for CONFIG_HSA_AMD else where in driver code. 
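The trick, shown in the amdgpu.h hunk below, is a header-level constant fallback so that call sites need no preprocessor guards of their own; schematically (a sketch, not a literal copy, with KFD_SCHED_POLICY_* assumed to come from the KFD headers):

```c
#ifdef CONFIG_HSA_AMD
extern int sched_policy;			/* real knob provided by the KFD code */
#else
static const int sched_policy = KFD_SCHED_POLICY_HWS;	/* harmless constant default */
#endif

/*
 * A call site such as amdgpu_device_ip_early_init() then builds in both
 * configurations without any #ifdef:
 *
 *	if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
 *		adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
 */
```

When CONFIG_HSA_AMD is disabled, the comparison against KFD_SCHED_POLICY_NO_HWS is a compile-time false, so the compiler can fold that half of the condition away.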
Suggested-by: Felix Kuehling Signed-off-by: Shirish S Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 6 +----- 2 files changed, 3 insertions(+), 5 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index d1bd0c5d81a1..20a37ce56950 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -171,6 +171,8 @@ extern int amdgpu_noretry; extern int amdgpu_force_asic_type; #ifdef CONFIG_HSA_AMD extern int sched_policy; +#else +static const int sched_policy = KFD_SCHED_POLICY_HWS; #endif #ifdef CONFIG_DRM_AMDGPU_SI diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index b893ec935b84..1affaa4ee1ee 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -1624,11 +1624,7 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) } adev->pm.pp_feature = amdgpu_pp_feature_mask; - if (amdgpu_sriov_vf(adev) - #ifdef CONFIG_HSA_AMD - || sched_policy == KFD_SCHED_POLICY_NO_HWS - #endif - ) + if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS) adev->pm.pp_feature &= ~PP_GFXOFF_MASK; for (i = 0; i < adev->num_ip_blocks; i++) { -- cgit v1.2.3 From 0e0b89c0d7cd1d7a53d0e7538a9d07c5692cf04d Mon Sep 17 00:00:00 2001 From: Evan Quan Date: Wed, 11 Sep 2019 19:35:45 +0800 Subject: drm/amd/powerplay: properly set mp1 state for SW SMU suspend/reset routine Set mp1 state properly for SW SMU suspend/reset routine. Signed-off-by: Evan Quan Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 ++++---- drivers/gpu/drm/amd/powerplay/amdgpu_smu.c | 40 ++++++++++++++++++++++++++ drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h | 2 ++ 3 files changed, 48 insertions(+), 6 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 1affaa4ee1ee..d3524f19d79a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2233,17 +2233,17 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) /* handle putting the SMC in the appropriate state */ if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { if (is_support_sw_smu(adev)) { - /* todo */ + r = smu_set_mp1_state(&adev->smu, adev->mp1_state); } else if (adev->powerplay.pp_funcs && adev->powerplay.pp_funcs->set_mp1_state) { r = adev->powerplay.pp_funcs->set_mp1_state( adev->powerplay.pp_handle, adev->mp1_state); - if (r) { - DRM_ERROR("SMC failed to set mp1 state %d, %d\n", - adev->mp1_state, r); - return r; - } + } + if (r) { + DRM_ERROR("SMC failed to set mp1 state %d, %d\n", + adev->mp1_state, r); + return r; } } diff --git a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c index 03896e667c14..be01b88db3ec 100644 --- a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c +++ b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c @@ -1788,6 +1788,46 @@ int smu_force_clk_levels(struct smu_context *smu, return ret; } +int smu_set_mp1_state(struct smu_context *smu, + enum pp_mp1_state mp1_state) +{ + uint16_t msg; + int ret; + + /* + * The SMC is not fully ready. That may be + * expected as the IP may be masked. + * So, just return without error. 
+ */ + if (!smu->pm_enabled) + return 0; + + switch (mp1_state) { + case PP_MP1_STATE_SHUTDOWN: + msg = SMU_MSG_PrepareMp1ForShutdown; + break; + case PP_MP1_STATE_UNLOAD: + msg = SMU_MSG_PrepareMp1ForUnload; + break; + case PP_MP1_STATE_RESET: + msg = SMU_MSG_PrepareMp1ForReset; + break; + case PP_MP1_STATE_NONE: + default: + return 0; + } + + /* some asics may not support those messages */ + if (smu_msg_get_index(smu, msg) < 0) + return 0; + + ret = smu_send_smc_msg(smu, msg); + if (ret) + pr_err("[PrepareMp1] Failed!\n"); + + return ret; +} + const struct amd_ip_funcs smu_ip_funcs = { .name = "smu", .early_init = smu_early_init, diff --git a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h index 88f1ee9a2f1d..45da21dc2356 100644 --- a/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h +++ b/drivers/gpu/drm/amd/powerplay/inc/amdgpu_smu.h @@ -836,5 +836,7 @@ int smu_sys_set_pp_feature_mask(struct smu_context *smu, uint64_t new_mask); int smu_force_clk_levels(struct smu_context *smu, enum smu_clk_type clk_type, uint32_t mask); +int smu_set_mp1_state(struct smu_context *smu, + enum pp_mp1_state mp1_state); #endif -- cgit v1.2.3 From ec51d3facd3eb2128d9ea914e461885630b4ad07 Mon Sep 17 00:00:00 2001 From: Xiaojie Yuan Date: Wed, 11 Sep 2019 11:41:05 +0800 Subject: drm/amdgpu/discovery: get gpu info from ip discovery table except soc_bounding_box which is not integrated in discovery table yet Signed-off-by: Xiaojie Yuan Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index d3524f19d79a..9f916a331459 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -1471,6 +1471,9 @@ static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data + le32_to_cpu(hdr->header.ucode_array_offset_bytes)); + if (amdgpu_discovery && adev->asic_type >= CHIP_NAVI10) + goto parse_soc_bounding_box; + adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se); adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh); adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se); @@ -1498,7 +1501,13 @@ static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev) adev->gfx.config.num_packer_per_sc = le32_to_cpu(gpu_info_fw->num_packer_per_sc); } + +parse_soc_bounding_box: #ifdef CONFIG_DRM_AMD_DC_DCN2_0 + /* + * soc bounding box info is not integrated in disocovery table, + * we always need to parse it from gpu info firmware. + */ if (hdr->version_minor == 2) { const struct gpu_info_firmware_v1_2 *gpu_info_fw = (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data + @@ -1615,6 +1624,9 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev) if (r) return r; + if (amdgpu_discovery && adev->asic_type >= CHIP_NAVI10) + amdgpu_discovery_get_gfx_info(adev); + amdgpu_amdkfd_device_probe(adev); if (amdgpu_sriov_vf(adev)) { -- cgit v1.2.3 From df99ac0fcc507cb680c5b75b679cda1195a99a42 Mon Sep 17 00:00:00 2001 From: Jesse Zhang Date: Tue, 30 Jul 2019 19:15:42 +0800 Subject: drm/amd/amdgpu:Fix compute ring unable to detect hang. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When compute fence did not signal, compute ring cannot detect hardware hang because its timeout value is set to be infinite by default. In SR-IOV and passthrough mode, if user does not declare custome timeout value for compute ring, then use gfx ring timeout value as default. So that when there is a ture hardware hang, compute ring can detect it. Signed-off-by: Jesse Zhang Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 ++++++------ drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 7 ++++++- drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 13 +------------ 3 files changed, 13 insertions(+), 19 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 9f916a331459..1364a2be68e0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -1025,12 +1025,6 @@ static int amdgpu_device_check_arguments(struct amdgpu_device *adev) amdgpu_device_check_block_size(adev); - ret = amdgpu_device_get_job_timeout_settings(adev); - if (ret) { - dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); - return ret; - } - adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type); return ret; @@ -2745,6 +2739,12 @@ int amdgpu_device_init(struct amdgpu_device *adev, if (r) return r; + r = amdgpu_device_get_job_timeout_settings(adev); + if (r) { + dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n"); + return r; + } + /* doorbell bar mapping and doorbell index init*/ amdgpu_device_doorbell_init(adev); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index e2f166a32c92..864f7858d630 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -1341,10 +1341,15 @@ int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) /* * By default timeout for non compute jobs is 10000. * And there is no timeout enforced on compute jobs. + * In SR-IOV or passthrough mode, timeout for compute + * jobs are 10000 by default. */ adev->gfx_timeout = msecs_to_jiffies(10000); adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; - adev->compute_timeout = MAX_SCHEDULE_TIMEOUT; + if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) + adev->compute_timeout = adev->gfx_timeout; + else + adev->compute_timeout = MAX_SCHEDULE_TIMEOUT; if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENTH)) { while ((timeout_setting = strsep(&input, ",")) && diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c index 23085b352cf2..377fe20bce23 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c @@ -462,18 +462,7 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring, timeout = adev->gfx_timeout; break; case AMDGPU_RING_TYPE_COMPUTE: - /* - * For non-sriov case, no timeout enforce - * on compute ring by default. Unless user - * specifies a timeout for compute ring. 
- * - * For sriov case, always use the timeout - * as gfx ring - */ - if (!amdgpu_sriov_vf(ring->adev)) - timeout = adev->compute_timeout; - else - timeout = adev->gfx_timeout; + timeout = adev->compute_timeout; break; case AMDGPU_RING_TYPE_SDMA: timeout = adev->sdma_timeout; -- cgit v1.2.3 From f8d2d39eb406946546d71e5869b04d99fa094834 Mon Sep 17 00:00:00 2001 From: Lyude Paul Date: Tue, 3 Sep 2019 16:46:01 -0400 Subject: drm/amdgpu: Iterate through DRM connectors correctly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, every single piece of code in amdgpu that loops through connectors does it incorrectly and doesn't use the proper list iteration helpers, drm_connector_list_iter_begin() and drm_connector_list_iter_end(). Yeesh. So, do that. Cc: Juston Li Cc: Imre Deak Cc: Ville Syrjälä Cc: Harry Wentland Cc: Daniel Vetter Reviewed-by: Alex Deucher Signed-off-by: Lyude Paul Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_connectors.c | 13 +++++-- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 20 +++++++---- drivers/gpu/drm/amd/amdgpu/amdgpu_display.c | 5 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_encoders.c | 40 +++++++++++++++------- drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c | 5 ++- drivers/gpu/drm/amd/amdgpu/dce_v10_0.c | 34 +++++++++++++----- drivers/gpu/drm/amd/amdgpu/dce_v11_0.c | 34 +++++++++++++----- drivers/gpu/drm/amd/amdgpu/dce_v6_0.c | 40 ++++++++++++++++------ drivers/gpu/drm/amd/amdgpu/dce_v8_0.c | 34 +++++++++++++----- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 33 ++++++++++-------- .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_irq.c | 10 ++++-- 11 files changed, 195 insertions(+), 73 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_connectors.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_connectors.c index ece55c8fa673..bd31bb595c04 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_connectors.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_connectors.c @@ -1022,8 +1022,12 @@ amdgpu_connector_dvi_detect(struct drm_connector *connector, bool force) */ if (amdgpu_connector->shared_ddc && (ret == connector_status_connected)) { struct drm_connector *list_connector; + struct drm_connector_list_iter iter; struct amdgpu_connector *list_amdgpu_connector; - list_for_each_entry(list_connector, &dev->mode_config.connector_list, head) { + + drm_connector_list_iter_begin(dev, &iter); + drm_for_each_connector_iter(list_connector, + &iter) { if (connector == list_connector) continue; list_amdgpu_connector = to_amdgpu_connector(list_connector); @@ -1040,6 +1044,7 @@ amdgpu_connector_dvi_detect(struct drm_connector *connector, bool force) } } } + drm_connector_list_iter_end(&iter); } } } @@ -1501,6 +1506,7 @@ amdgpu_connector_add(struct amdgpu_device *adev, { struct drm_device *dev = adev->ddev; struct drm_connector *connector; + struct drm_connector_list_iter iter; struct amdgpu_connector *amdgpu_connector; struct amdgpu_connector_atom_dig *amdgpu_dig_connector; struct drm_encoder *encoder; @@ -1515,10 +1521,12 @@ amdgpu_connector_add(struct amdgpu_device *adev, return; /* see if we already added it */ - list_for_each_entry(connector, &dev->mode_config.connector_list, head) { + drm_connector_list_iter_begin(dev, &iter); + drm_for_each_connector_iter(connector, &iter) { amdgpu_connector = to_amdgpu_connector(connector); if (amdgpu_connector->connector_id == connector_id) { amdgpu_connector->devices |= supported_device; + drm_connector_list_iter_end(&iter); return; } if 
(amdgpu_connector->ddc_bus && i2c_bus->valid) { @@ -1533,6 +1541,7 @@ amdgpu_connector_add(struct amdgpu_device *adev, } } } + drm_connector_list_iter_end(&iter); /* check if it's a dp bridge */ list_for_each_entry(encoder, &dev->mode_config.encoder_list, head) { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 1364a2be68e0..62955156653c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3026,6 +3026,7 @@ int amdgpu_device_suspend(struct drm_device *dev, bool suspend, bool fbcon) struct amdgpu_device *adev; struct drm_crtc *crtc; struct drm_connector *connector; + struct drm_connector_list_iter iter; int r; if (dev == NULL || dev->dev_private == NULL) { @@ -3048,9 +3049,11 @@ int amdgpu_device_suspend(struct drm_device *dev, bool suspend, bool fbcon) if (!amdgpu_device_has_dc_support(adev)) { /* turn off display hw */ drm_modeset_lock_all(dev); - list_for_each_entry(connector, &dev->mode_config.connector_list, head) { - drm_helper_connector_dpms(connector, DRM_MODE_DPMS_OFF); - } + drm_connector_list_iter_begin(dev, &iter); + drm_for_each_connector_iter(connector, &iter) + drm_helper_connector_dpms(connector, + DRM_MODE_DPMS_OFF); + drm_connector_list_iter_end(&iter); drm_modeset_unlock_all(dev); /* unpin the front buffers and cursors */ list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) { @@ -3129,6 +3132,7 @@ int amdgpu_device_suspend(struct drm_device *dev, bool suspend, bool fbcon) int amdgpu_device_resume(struct drm_device *dev, bool resume, bool fbcon) { struct drm_connector *connector; + struct drm_connector_list_iter iter; struct amdgpu_device *adev = dev->dev_private; struct drm_crtc *crtc; int r = 0; @@ -3199,9 +3203,13 @@ int amdgpu_device_resume(struct drm_device *dev, bool resume, bool fbcon) /* turn on display hw */ drm_modeset_lock_all(dev); - list_for_each_entry(connector, &dev->mode_config.connector_list, head) { - drm_helper_connector_dpms(connector, DRM_MODE_DPMS_ON); - } + + drm_connector_list_iter_begin(dev, &iter); + drm_for_each_connector_iter(connector, &iter) + drm_helper_connector_dpms(connector, + DRM_MODE_DPMS_ON); + drm_connector_list_iter_end(&iter); + drm_modeset_unlock_all(dev); } amdgpu_fbdev_set_suspend(adev, 0); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c index 1d4aaa9580f4..d2dd59a95e8a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c @@ -370,11 +370,13 @@ void amdgpu_display_print_display_setup(struct drm_device *dev) struct amdgpu_connector *amdgpu_connector; struct drm_encoder *encoder; struct amdgpu_encoder *amdgpu_encoder; + struct drm_connector_list_iter iter; uint32_t devices; int i = 0; + drm_connector_list_iter_begin(dev, &iter); DRM_INFO("AMDGPU Display Connectors\n"); - list_for_each_entry(connector, &dev->mode_config.connector_list, head) { + drm_for_each_connector_iter(connector, &iter) { amdgpu_connector = to_amdgpu_connector(connector); DRM_INFO("Connector %d:\n", i); DRM_INFO(" %s\n", connector->name); @@ -438,6 +440,7 @@ void amdgpu_display_print_display_setup(struct drm_device *dev) } i++; } + drm_connector_list_iter_end(&iter); } /** diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_encoders.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_encoders.c index 571a6dfb473e..61fcf247a638 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_encoders.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_encoders.c @@ -37,12 +37,14 @@ 
amdgpu_link_encoder_connector(struct drm_device *dev) { struct amdgpu_device *adev = dev->dev_private; struct drm_connector *connector; + struct drm_connector_list_iter iter; struct amdgpu_connector *amdgpu_connector; struct drm_encoder *encoder; struct amdgpu_encoder *amdgpu_encoder; + drm_connector_list_iter_begin(dev, &iter); /* walk the list and link encoders to connectors */ - list_for_each_entry(connector, &dev->mode_config.connector_list, head) { + drm_for_each_connector_iter(connector, &iter) { amdgpu_connector = to_amdgpu_connector(connector); list_for_each_entry(encoder, &dev->mode_config.encoder_list, head) { amdgpu_encoder = to_amdgpu_encoder(encoder); @@ -55,6 +57,7 @@ amdgpu_link_encoder_connector(struct drm_device *dev) } } } + drm_connector_list_iter_end(&iter); } void amdgpu_encoder_set_active_device(struct drm_encoder *encoder) @@ -62,8 +65,10 @@ void amdgpu_encoder_set_active_device(struct drm_encoder *encoder) struct drm_device *dev = encoder->dev; struct amdgpu_encoder *amdgpu_encoder = to_amdgpu_encoder(encoder); struct drm_connector *connector; + struct drm_connector_list_iter iter; - list_for_each_entry(connector, &dev->mode_config.connector_list, head) { + drm_connector_list_iter_begin(dev, &iter); + drm_for_each_connector_iter(connector, &iter) { if (connector->encoder == encoder) { struct amdgpu_connector *amdgpu_connector = to_amdgpu_connector(connector); amdgpu_encoder->active_device = amdgpu_encoder->devices & amdgpu_connector->devices; @@ -72,6 +77,7 @@ void amdgpu_encoder_set_active_device(struct drm_encoder *encoder) amdgpu_connector->devices, encoder->encoder_type); } } + drm_connector_list_iter_end(&iter); } struct drm_connector * @@ -79,15 +85,20 @@ amdgpu_get_connector_for_encoder(struct drm_encoder *encoder) { struct drm_device *dev = encoder->dev; struct amdgpu_encoder *amdgpu_encoder = to_amdgpu_encoder(encoder); - struct drm_connector *connector; + struct drm_connector *connector, *found = NULL; + struct drm_connector_list_iter iter; struct amdgpu_connector *amdgpu_connector; - list_for_each_entry(connector, &dev->mode_config.connector_list, head) { + drm_connector_list_iter_begin(dev, &iter); + drm_for_each_connector_iter(connector, &iter) { amdgpu_connector = to_amdgpu_connector(connector); - if (amdgpu_encoder->active_device & amdgpu_connector->devices) - return connector; + if (amdgpu_encoder->active_device & amdgpu_connector->devices) { + found = connector; + break; + } } - return NULL; + drm_connector_list_iter_end(&iter); + return found; } struct drm_connector * @@ -95,15 +106,20 @@ amdgpu_get_connector_for_encoder_init(struct drm_encoder *encoder) { struct drm_device *dev = encoder->dev; struct amdgpu_encoder *amdgpu_encoder = to_amdgpu_encoder(encoder); - struct drm_connector *connector; + struct drm_connector *connector, *found = NULL; + struct drm_connector_list_iter iter; struct amdgpu_connector *amdgpu_connector; - list_for_each_entry(connector, &dev->mode_config.connector_list, head) { + drm_connector_list_iter_begin(dev, &iter); + drm_for_each_connector_iter(connector, &iter) { amdgpu_connector = to_amdgpu_connector(connector); - if (amdgpu_encoder->devices & amdgpu_connector->devices) - return connector; + if (amdgpu_encoder->devices & amdgpu_connector->devices) { + found = connector; + break; + } } - return NULL; + drm_connector_list_iter_end(&iter); + return found; } struct drm_encoder *amdgpu_get_external_encoder(struct drm_encoder *encoder) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c index 8f49d8131d63..d1d5e7f9b9be 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c @@ -87,10 +87,13 @@ static void amdgpu_hotplug_work_func(struct work_struct *work) struct drm_device *dev = adev->ddev; struct drm_mode_config *mode_config = &dev->mode_config; struct drm_connector *connector; + struct drm_connector_list_iter iter; mutex_lock(&mode_config->mutex); - list_for_each_entry(connector, &mode_config->connector_list, head) + drm_connector_list_iter_begin(dev, &iter); + drm_for_each_connector_iter(connector, &iter) amdgpu_connector_hotplug(connector); + drm_connector_list_iter_end(&iter); mutex_unlock(&mode_config->mutex); /* Just fire off a uevent and let userspace tell us what to do */ drm_helper_hpd_irq_event(dev); diff --git a/drivers/gpu/drm/amd/amdgpu/dce_v10_0.c b/drivers/gpu/drm/amd/amdgpu/dce_v10_0.c index f7aaa3fa5c95..40d2ac723dd6 100644 --- a/drivers/gpu/drm/amd/amdgpu/dce_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/dce_v10_0.c @@ -330,9 +330,11 @@ static void dce_v10_0_hpd_init(struct amdgpu_device *adev) { struct drm_device *dev = adev->ddev; struct drm_connector *connector; + struct drm_connector_list_iter iter; u32 tmp; - list_for_each_entry(connector, &dev->mode_config.connector_list, head) { + drm_connector_list_iter_begin(dev, &iter); + drm_for_each_connector_iter(connector, &iter) { struct amdgpu_connector *amdgpu_connector = to_amdgpu_connector(connector); if (amdgpu_connector->hpd.hpd >= adev->mode_info.num_hpd) @@ -368,6 +370,7 @@ static void dce_v10_0_hpd_init(struct amdgpu_device *adev) amdgpu_irq_get(adev, &adev->hpd_irq, amdgpu_connector->hpd.hpd); } + drm_connector_list_iter_end(&iter); } /** @@ -382,9 +385,11 @@ static void dce_v10_0_hpd_fini(struct amdgpu_device *adev) { struct drm_device *dev = adev->ddev; struct drm_connector *connector; + struct drm_connector_list_iter iter; u32 tmp; - list_for_each_entry(connector, &dev->mode_config.connector_list, head) { + drm_connector_list_iter_begin(dev, &iter); + drm_for_each_connector_iter(connector, &iter) { struct amdgpu_connector *amdgpu_connector = to_amdgpu_connector(connector); if (amdgpu_connector->hpd.hpd >= adev->mode_info.num_hpd) @@ -397,6 +402,7 @@ static void dce_v10_0_hpd_fini(struct amdgpu_device *adev) amdgpu_irq_put(adev, &adev->hpd_irq, amdgpu_connector->hpd.hpd); } + drm_connector_list_iter_end(&iter); } static u32 dce_v10_0_hpd_get_gpio_reg(struct amdgpu_device *adev) @@ -1219,10 +1225,12 @@ static void dce_v10_0_afmt_audio_select_pin(struct drm_encoder *encoder) static void dce_v10_0_audio_write_latency_fields(struct drm_encoder *encoder, struct drm_display_mode *mode) { - struct amdgpu_device *adev = encoder->dev->dev_private; + struct drm_device *dev = encoder->dev; + struct amdgpu_device *adev = dev->dev_private; struct amdgpu_encoder *amdgpu_encoder = to_amdgpu_encoder(encoder); struct amdgpu_encoder_atom_dig *dig = amdgpu_encoder->enc_priv; struct drm_connector *connector; + struct drm_connector_list_iter iter; struct amdgpu_connector *amdgpu_connector = NULL; u32 tmp; int interlace = 0; @@ -1230,12 +1238,14 @@ static void dce_v10_0_audio_write_latency_fields(struct drm_encoder *encoder, if (!dig || !dig->afmt || !dig->afmt->pin) return; - list_for_each_entry(connector, &encoder->dev->mode_config.connector_list, head) { + drm_connector_list_iter_begin(dev, &iter); + drm_for_each_connector_iter(connector, &iter) { if (connector->encoder == encoder) { amdgpu_connector = 
to_amdgpu_connector(connector); break; } } + drm_connector_list_iter_end(&iter); if (!amdgpu_connector) { DRM_ERROR("Couldn't find encoder's connector\n"); @@ -1261,10 +1271,12 @@ static void dce_v10_0_audio_write_latency_fields(struct drm_encoder *encoder, static void dce_v10_0_audio_write_speaker_allocation(struct drm_encoder *encoder) { - struct amdgpu_device *adev = encoder->dev->dev_private; + struct drm_device *dev = encoder->dev; + struct amdgpu_device *adev = dev->dev_private; struct amdgpu_encoder *amdgpu_encoder = to_amdgpu_encoder(encoder); struct amdgpu_encoder_atom_dig *dig = amdgpu_encoder->enc_priv; struct drm_connector *connector; + struct drm_connector_list_iter iter; struct amdgpu_connector *amdgpu_connector = NULL; u32 tmp; u8 *sadb = NULL; @@ -1273,12 +1285,14 @@ static void dce_v10_0_audio_write_speaker_allocation(struct drm_encoder *encoder if (!dig || !dig->afmt || !dig->afmt->pin) return; - list_for_each_entry(connector, &encoder->dev->mode_config.connector_list, head) { + drm_connector_list_iter_begin(dev, &iter); + drm_for_each_connector_iter(connector, &iter) { if (connector->encoder == encoder) { amdgpu_connector = to_amdgpu_connector(connector); break; } } + drm_connector_list_iter_end(&iter); if (!amdgpu_connector) { DRM_ERROR("Couldn't find encoder's connector\n"); @@ -1313,10 +1327,12 @@ static void dce_v10_0_audio_write_speaker_allocation(struct drm_encoder *encoder static void dce_v10_0_audio_write_sad_regs(struct drm_encoder *encoder) { - struct amdgpu_device *adev = encoder->dev->dev_private; + struct drm_device *dev = encoder->dev; + struct amdgpu_device *adev = dev->dev_private; struct amdgpu_encoder *amdgpu_encoder = to_amdgpu_encoder(encoder); struct amdgpu_encoder_atom_dig *dig = amdgpu_encoder->enc_priv; struct drm_connector *connector; + struct drm_connector_list_iter iter; struct amdgpu_connector *amdgpu_connector = NULL; struct cea_sad *sads; int i, sad_count; @@ -1339,12 +1355,14 @@ static void dce_v10_0_audio_write_sad_regs(struct drm_encoder *encoder) if (!dig || !dig->afmt || !dig->afmt->pin) return; - list_for_each_entry(connector, &encoder->dev->mode_config.connector_list, head) { + drm_connector_list_iter_begin(dev, &iter); + drm_for_each_connector_iter(connector, &iter) { if (connector->encoder == encoder) { amdgpu_connector = to_amdgpu_connector(connector); break; } } + drm_connector_list_iter_end(&iter); if (!amdgpu_connector) { DRM_ERROR("Couldn't find encoder's connector\n"); diff --git a/drivers/gpu/drm/amd/amdgpu/dce_v11_0.c b/drivers/gpu/drm/amd/amdgpu/dce_v11_0.c index ed1021b4af3c..898ef72d423c 100644 --- a/drivers/gpu/drm/amd/amdgpu/dce_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/dce_v11_0.c @@ -348,9 +348,11 @@ static void dce_v11_0_hpd_init(struct amdgpu_device *adev) { struct drm_device *dev = adev->ddev; struct drm_connector *connector; + struct drm_connector_list_iter iter; u32 tmp; - list_for_each_entry(connector, &dev->mode_config.connector_list, head) { + drm_connector_list_iter_begin(dev, &iter); + drm_for_each_connector_iter(connector, &iter) { struct amdgpu_connector *amdgpu_connector = to_amdgpu_connector(connector); if (amdgpu_connector->hpd.hpd >= adev->mode_info.num_hpd) @@ -385,6 +387,7 @@ static void dce_v11_0_hpd_init(struct amdgpu_device *adev) dce_v11_0_hpd_set_polarity(adev, amdgpu_connector->hpd.hpd); amdgpu_irq_get(adev, &adev->hpd_irq, amdgpu_connector->hpd.hpd); } + drm_connector_list_iter_end(&iter); } /** @@ -399,9 +402,11 @@ static void dce_v11_0_hpd_fini(struct amdgpu_device *adev) { struct drm_device 
*dev = adev->ddev; struct drm_connector *connector; + struct drm_connector_list_iter iter; u32 tmp; - list_for_each_entry(connector, &dev->mode_config.connector_list, head) { + drm_connector_list_iter_begin(dev, &iter); + drm_for_each_connector_iter(connector, &iter) { struct amdgpu_connector *amdgpu_connector = to_amdgpu_connector(connector); if (amdgpu_connector->hpd.hpd >= adev->mode_info.num_hpd) @@ -413,6 +418,7 @@ static void dce_v11_0_hpd_fini(struct amdgpu_device *adev) amdgpu_irq_put(adev, &adev->hpd_irq, amdgpu_connector->hpd.hpd); } + drm_connector_list_iter_end(&iter); } static u32 dce_v11_0_hpd_get_gpio_reg(struct amdgpu_device *adev) @@ -1245,10 +1251,12 @@ static void dce_v11_0_afmt_audio_select_pin(struct drm_encoder *encoder) static void dce_v11_0_audio_write_latency_fields(struct drm_encoder *encoder, struct drm_display_mode *mode) { - struct amdgpu_device *adev = encoder->dev->dev_private; + struct drm_device *dev = encoder->dev; + struct amdgpu_device *adev = dev->dev_private; struct amdgpu_encoder *amdgpu_encoder = to_amdgpu_encoder(encoder); struct amdgpu_encoder_atom_dig *dig = amdgpu_encoder->enc_priv; struct drm_connector *connector; + struct drm_connector_list_iter iter; struct amdgpu_connector *amdgpu_connector = NULL; u32 tmp; int interlace = 0; @@ -1256,12 +1264,14 @@ static void dce_v11_0_audio_write_latency_fields(struct drm_encoder *encoder, if (!dig || !dig->afmt || !dig->afmt->pin) return; - list_for_each_entry(connector, &encoder->dev->mode_config.connector_list, head) { + drm_connector_list_iter_begin(dev, &iter); + drm_for_each_connector_iter(connector, &iter) { if (connector->encoder == encoder) { amdgpu_connector = to_amdgpu_connector(connector); break; } } + drm_connector_list_iter_end(&iter); if (!amdgpu_connector) { DRM_ERROR("Couldn't find encoder's connector\n"); @@ -1287,10 +1297,12 @@ static void dce_v11_0_audio_write_latency_fields(struct drm_encoder *encoder, static void dce_v11_0_audio_write_speaker_allocation(struct drm_encoder *encoder) { - struct amdgpu_device *adev = encoder->dev->dev_private; + struct drm_device *dev = encoder->dev; + struct amdgpu_device *adev = dev->dev_private; struct amdgpu_encoder *amdgpu_encoder = to_amdgpu_encoder(encoder); struct amdgpu_encoder_atom_dig *dig = amdgpu_encoder->enc_priv; struct drm_connector *connector; + struct drm_connector_list_iter iter; struct amdgpu_connector *amdgpu_connector = NULL; u32 tmp; u8 *sadb = NULL; @@ -1299,12 +1311,14 @@ static void dce_v11_0_audio_write_speaker_allocation(struct drm_encoder *encoder if (!dig || !dig->afmt || !dig->afmt->pin) return; - list_for_each_entry(connector, &encoder->dev->mode_config.connector_list, head) { + drm_connector_list_iter_begin(dev, &iter); + drm_for_each_connector_iter(connector, &iter) { if (connector->encoder == encoder) { amdgpu_connector = to_amdgpu_connector(connector); break; } } + drm_connector_list_iter_end(&iter); if (!amdgpu_connector) { DRM_ERROR("Couldn't find encoder's connector\n"); @@ -1339,10 +1353,12 @@ static void dce_v11_0_audio_write_speaker_allocation(struct drm_encoder *encoder static void dce_v11_0_audio_write_sad_regs(struct drm_encoder *encoder) { - struct amdgpu_device *adev = encoder->dev->dev_private; + struct drm_device *dev = encoder->dev; + struct amdgpu_device *adev = dev->dev_private; struct amdgpu_encoder *amdgpu_encoder = to_amdgpu_encoder(encoder); struct amdgpu_encoder_atom_dig *dig = amdgpu_encoder->enc_priv; struct drm_connector *connector; + struct drm_connector_list_iter iter; struct amdgpu_connector 
*amdgpu_connector = NULL; struct cea_sad *sads; int i, sad_count; @@ -1365,12 +1381,14 @@ static void dce_v11_0_audio_write_sad_regs(struct drm_encoder *encoder) if (!dig || !dig->afmt || !dig->afmt->pin) return; - list_for_each_entry(connector, &encoder->dev->mode_config.connector_list, head) { + drm_connector_list_iter_begin(dev, &iter); + drm_for_each_connector_iter(connector, &iter) { if (connector->encoder == encoder) { amdgpu_connector = to_amdgpu_connector(connector); break; } } + drm_connector_list_iter_end(&iter); if (!amdgpu_connector) { DRM_ERROR("Couldn't find encoder's connector\n"); diff --git a/drivers/gpu/drm/amd/amdgpu/dce_v6_0.c b/drivers/gpu/drm/amd/amdgpu/dce_v6_0.c index 59774bc8adf3..db15a112becc 100644 --- a/drivers/gpu/drm/amd/amdgpu/dce_v6_0.c +++ b/drivers/gpu/drm/amd/amdgpu/dce_v6_0.c @@ -281,9 +281,11 @@ static void dce_v6_0_hpd_init(struct amdgpu_device *adev) { struct drm_device *dev = adev->ddev; struct drm_connector *connector; + struct drm_connector_list_iter iter; u32 tmp; - list_for_each_entry(connector, &dev->mode_config.connector_list, head) { + drm_connector_list_iter_begin(dev, &iter); + drm_for_each_connector_iter(connector, &iter) { struct amdgpu_connector *amdgpu_connector = to_amdgpu_connector(connector); if (amdgpu_connector->hpd.hpd >= adev->mode_info.num_hpd) @@ -309,7 +311,7 @@ static void dce_v6_0_hpd_init(struct amdgpu_device *adev) dce_v6_0_hpd_set_polarity(adev, amdgpu_connector->hpd.hpd); amdgpu_irq_get(adev, &adev->hpd_irq, amdgpu_connector->hpd.hpd); } - + drm_connector_list_iter_end(&iter); } /** @@ -324,9 +326,11 @@ static void dce_v6_0_hpd_fini(struct amdgpu_device *adev) { struct drm_device *dev = adev->ddev; struct drm_connector *connector; + struct drm_connector_list_iter iter; u32 tmp; - list_for_each_entry(connector, &dev->mode_config.connector_list, head) { + drm_connector_list_iter_begin(dev, &iter); + drm_for_each_connector_iter(connector, &iter) { struct amdgpu_connector *amdgpu_connector = to_amdgpu_connector(connector); if (amdgpu_connector->hpd.hpd >= adev->mode_info.num_hpd) @@ -338,6 +342,7 @@ static void dce_v6_0_hpd_fini(struct amdgpu_device *adev) amdgpu_irq_put(adev, &adev->hpd_irq, amdgpu_connector->hpd.hpd); } + drm_connector_list_iter_end(&iter); } static u32 dce_v6_0_hpd_get_gpio_reg(struct amdgpu_device *adev) @@ -1124,20 +1129,24 @@ static void dce_v6_0_audio_select_pin(struct drm_encoder *encoder) static void dce_v6_0_audio_write_latency_fields(struct drm_encoder *encoder, struct drm_display_mode *mode) { - struct amdgpu_device *adev = encoder->dev->dev_private; + struct drm_device *dev = encoder->dev; + struct amdgpu_device *adev = dev->dev_private; struct amdgpu_encoder *amdgpu_encoder = to_amdgpu_encoder(encoder); struct amdgpu_encoder_atom_dig *dig = amdgpu_encoder->enc_priv; struct drm_connector *connector; + struct drm_connector_list_iter iter; struct amdgpu_connector *amdgpu_connector = NULL; int interlace = 0; u32 tmp; - list_for_each_entry(connector, &encoder->dev->mode_config.connector_list, head) { + drm_connector_list_iter_begin(dev, &iter); + drm_for_each_connector_iter(connector, &iter) { if (connector->encoder == encoder) { amdgpu_connector = to_amdgpu_connector(connector); break; } } + drm_connector_list_iter_end(&iter); if (!amdgpu_connector) { DRM_ERROR("Couldn't find encoder's connector\n"); @@ -1164,21 +1173,25 @@ static void dce_v6_0_audio_write_latency_fields(struct drm_encoder *encoder, static void dce_v6_0_audio_write_speaker_allocation(struct drm_encoder *encoder) { - struct 
amdgpu_device *adev = encoder->dev->dev_private; + struct drm_device *dev = encoder->dev; + struct amdgpu_device *adev = dev->dev_private; struct amdgpu_encoder *amdgpu_encoder = to_amdgpu_encoder(encoder); struct amdgpu_encoder_atom_dig *dig = amdgpu_encoder->enc_priv; struct drm_connector *connector; + struct drm_connector_list_iter iter; struct amdgpu_connector *amdgpu_connector = NULL; u8 *sadb = NULL; int sad_count; u32 tmp; - list_for_each_entry(connector, &encoder->dev->mode_config.connector_list, head) { + drm_connector_list_iter_begin(dev, &iter); + drm_for_each_connector_iter(connector, &iter) { if (connector->encoder == encoder) { amdgpu_connector = to_amdgpu_connector(connector); break; } } + drm_connector_list_iter_end(&iter); if (!amdgpu_connector) { DRM_ERROR("Couldn't find encoder's connector\n"); @@ -1221,10 +1234,12 @@ static void dce_v6_0_audio_write_speaker_allocation(struct drm_encoder *encoder) static void dce_v6_0_audio_write_sad_regs(struct drm_encoder *encoder) { - struct amdgpu_device *adev = encoder->dev->dev_private; + struct drm_device *dev = encoder->dev; + struct amdgpu_device *adev = dev->dev_private; struct amdgpu_encoder *amdgpu_encoder = to_amdgpu_encoder(encoder); struct amdgpu_encoder_atom_dig *dig = amdgpu_encoder->enc_priv; struct drm_connector *connector; + struct drm_connector_list_iter iter; struct amdgpu_connector *amdgpu_connector = NULL; struct cea_sad *sads; int i, sad_count; @@ -1244,12 +1259,14 @@ static void dce_v6_0_audio_write_sad_regs(struct drm_encoder *encoder) { ixAZALIA_F0_CODEC_PIN_CONTROL_AUDIO_DESCRIPTOR13, HDMI_AUDIO_CODING_TYPE_WMA_PRO }, }; - list_for_each_entry(connector, &encoder->dev->mode_config.connector_list, head) { + drm_connector_list_iter_begin(dev, &iter); + drm_for_each_connector_iter(connector, &iter) { if (connector->encoder == encoder) { amdgpu_connector = to_amdgpu_connector(connector); break; } } + drm_connector_list_iter_end(&iter); if (!amdgpu_connector) { DRM_ERROR("Couldn't find encoder's connector\n"); @@ -1632,6 +1649,7 @@ static void dce_v6_0_afmt_setmode(struct drm_encoder *encoder, struct amdgpu_encoder *amdgpu_encoder = to_amdgpu_encoder(encoder); struct amdgpu_encoder_atom_dig *dig = amdgpu_encoder->enc_priv; struct drm_connector *connector; + struct drm_connector_list_iter iter; struct amdgpu_connector *amdgpu_connector = NULL; int em = amdgpu_atombios_encoder_get_encoder_mode(encoder); int bpc = 8; @@ -1639,12 +1657,14 @@ static void dce_v6_0_afmt_setmode(struct drm_encoder *encoder, if (!dig || !dig->afmt) return; - list_for_each_entry(connector, &encoder->dev->mode_config.connector_list, head) { + drm_connector_list_iter_begin(dev, &iter); + drm_for_each_connector_iter(connector, &iter) { if (connector->encoder == encoder) { amdgpu_connector = to_amdgpu_connector(connector); break; } } + drm_connector_list_iter_end(&iter); if (!amdgpu_connector) { DRM_ERROR("Couldn't find encoder's connector\n"); diff --git a/drivers/gpu/drm/amd/amdgpu/dce_v8_0.c b/drivers/gpu/drm/amd/amdgpu/dce_v8_0.c index fba00c08f3e7..f06c9022c1fd 100644 --- a/drivers/gpu/drm/amd/amdgpu/dce_v8_0.c +++ b/drivers/gpu/drm/amd/amdgpu/dce_v8_0.c @@ -275,9 +275,11 @@ static void dce_v8_0_hpd_init(struct amdgpu_device *adev) { struct drm_device *dev = adev->ddev; struct drm_connector *connector; + struct drm_connector_list_iter iter; u32 tmp; - list_for_each_entry(connector, &dev->mode_config.connector_list, head) { + drm_connector_list_iter_begin(dev, &iter); + drm_for_each_connector_iter(connector, &iter) { struct amdgpu_connector 
*amdgpu_connector = to_amdgpu_connector(connector); if (amdgpu_connector->hpd.hpd >= adev->mode_info.num_hpd) @@ -303,6 +305,7 @@ static void dce_v8_0_hpd_init(struct amdgpu_device *adev) dce_v8_0_hpd_set_polarity(adev, amdgpu_connector->hpd.hpd); amdgpu_irq_get(adev, &adev->hpd_irq, amdgpu_connector->hpd.hpd); } + drm_connector_list_iter_end(&iter); } /** @@ -317,9 +320,11 @@ static void dce_v8_0_hpd_fini(struct amdgpu_device *adev) { struct drm_device *dev = adev->ddev; struct drm_connector *connector; + struct drm_connector_list_iter iter; u32 tmp; - list_for_each_entry(connector, &dev->mode_config.connector_list, head) { + drm_connector_list_iter_begin(dev, &iter); + drm_for_each_connector_iter(connector, &iter) { struct amdgpu_connector *amdgpu_connector = to_amdgpu_connector(connector); if (amdgpu_connector->hpd.hpd >= adev->mode_info.num_hpd) @@ -331,6 +336,7 @@ static void dce_v8_0_hpd_fini(struct amdgpu_device *adev) amdgpu_irq_put(adev, &adev->hpd_irq, amdgpu_connector->hpd.hpd); } + drm_connector_list_iter_end(&iter); } static u32 dce_v8_0_hpd_get_gpio_reg(struct amdgpu_device *adev) @@ -1157,10 +1163,12 @@ static void dce_v8_0_afmt_audio_select_pin(struct drm_encoder *encoder) static void dce_v8_0_audio_write_latency_fields(struct drm_encoder *encoder, struct drm_display_mode *mode) { - struct amdgpu_device *adev = encoder->dev->dev_private; + struct drm_device *dev = encoder->dev; + struct amdgpu_device *adev = dev->dev_private; struct amdgpu_encoder *amdgpu_encoder = to_amdgpu_encoder(encoder); struct amdgpu_encoder_atom_dig *dig = amdgpu_encoder->enc_priv; struct drm_connector *connector; + struct drm_connector_list_iter iter; struct amdgpu_connector *amdgpu_connector = NULL; u32 tmp = 0, offset; @@ -1169,12 +1177,14 @@ static void dce_v8_0_audio_write_latency_fields(struct drm_encoder *encoder, offset = dig->afmt->pin->offset; - list_for_each_entry(connector, &encoder->dev->mode_config.connector_list, head) { + drm_connector_list_iter_begin(dev, &iter); + drm_for_each_connector_iter(connector, &iter) { if (connector->encoder == encoder) { amdgpu_connector = to_amdgpu_connector(connector); break; } } + drm_connector_list_iter_end(&iter); if (!amdgpu_connector) { DRM_ERROR("Couldn't find encoder's connector\n"); @@ -1214,10 +1224,12 @@ static void dce_v8_0_audio_write_latency_fields(struct drm_encoder *encoder, static void dce_v8_0_audio_write_speaker_allocation(struct drm_encoder *encoder) { - struct amdgpu_device *adev = encoder->dev->dev_private; + struct drm_device *dev = encoder->dev; + struct amdgpu_device *adev = dev->dev_private; struct amdgpu_encoder *amdgpu_encoder = to_amdgpu_encoder(encoder); struct amdgpu_encoder_atom_dig *dig = amdgpu_encoder->enc_priv; struct drm_connector *connector; + struct drm_connector_list_iter iter; struct amdgpu_connector *amdgpu_connector = NULL; u32 offset, tmp; u8 *sadb = NULL; @@ -1228,12 +1240,14 @@ static void dce_v8_0_audio_write_speaker_allocation(struct drm_encoder *encoder) offset = dig->afmt->pin->offset; - list_for_each_entry(connector, &encoder->dev->mode_config.connector_list, head) { + drm_connector_list_iter_begin(dev, &iter); + drm_for_each_connector_iter(connector, &iter) { if (connector->encoder == encoder) { amdgpu_connector = to_amdgpu_connector(connector); break; } } + drm_connector_list_iter_end(&iter); if (!amdgpu_connector) { DRM_ERROR("Couldn't find encoder's connector\n"); @@ -1263,11 +1277,13 @@ static void dce_v8_0_audio_write_speaker_allocation(struct drm_encoder *encoder) static void 
dce_v8_0_audio_write_sad_regs(struct drm_encoder *encoder) { - struct amdgpu_device *adev = encoder->dev->dev_private; + struct drm_device *dev = encoder->dev; + struct amdgpu_device *adev = dev->dev_private; struct amdgpu_encoder *amdgpu_encoder = to_amdgpu_encoder(encoder); struct amdgpu_encoder_atom_dig *dig = amdgpu_encoder->enc_priv; u32 offset; struct drm_connector *connector; + struct drm_connector_list_iter iter; struct amdgpu_connector *amdgpu_connector = NULL; struct cea_sad *sads; int i, sad_count; @@ -1292,12 +1308,14 @@ static void dce_v8_0_audio_write_sad_regs(struct drm_encoder *encoder) offset = dig->afmt->pin->offset; - list_for_each_entry(connector, &encoder->dev->mode_config.connector_list, head) { + drm_connector_list_iter_begin(dev, &iter); + drm_for_each_connector_iter(connector, &iter) { if (connector->encoder == encoder) { amdgpu_connector = to_amdgpu_connector(connector); break; } } + drm_connector_list_iter_end(&iter); if (!amdgpu_connector) { DRM_ERROR("Couldn't find encoder's connector\n"); diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 2c71951c761a..de1cea64c779 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -946,27 +946,29 @@ static int detect_mst_link_for_all_connectors(struct drm_device *dev) { struct amdgpu_dm_connector *aconnector; struct drm_connector *connector; + struct drm_connector_list_iter iter; int ret = 0; - drm_modeset_lock(&dev->mode_config.connection_mutex, NULL); - - list_for_each_entry(connector, &dev->mode_config.connector_list, head) { + drm_connector_list_iter_begin(dev, &iter); + drm_for_each_connector_iter(connector, &iter) { aconnector = to_amdgpu_dm_connector(connector); if (aconnector->dc_link->type == dc_connection_mst_branch && aconnector->mst_mgr.aux) { DRM_DEBUG_DRIVER("DM_MST: starting TM on aconnector: %p [id: %d]\n", - aconnector, aconnector->base.base.id); + aconnector, + aconnector->base.base.id); ret = drm_dp_mst_topology_mgr_set_mst(&aconnector->mst_mgr, true); if (ret < 0) { DRM_ERROR("DM_MST: Failed to start MST\n"); - ((struct dc_link *)aconnector->dc_link)->type = dc_connection_single; - return ret; - } + aconnector->dc_link->type = + dc_connection_single; + break; } + } } + drm_connector_list_iter_end(&iter); - drm_modeset_unlock(&dev->mode_config.connection_mutex); return ret; } @@ -1009,14 +1011,13 @@ static void s3_handle_mst(struct drm_device *dev, bool suspend) { struct amdgpu_dm_connector *aconnector; struct drm_connector *connector; + struct drm_connector_list_iter iter; struct drm_dp_mst_topology_mgr *mgr; int ret; bool need_hotplug = false; - drm_modeset_lock(&dev->mode_config.connection_mutex, NULL); - - list_for_each_entry(connector, &dev->mode_config.connector_list, - head) { + drm_connector_list_iter_begin(dev, &iter); + drm_for_each_connector_iter(connector, &iter) { aconnector = to_amdgpu_dm_connector(connector); if (aconnector->dc_link->type != dc_connection_mst_branch || aconnector->mst_port) @@ -1034,8 +1035,7 @@ static void s3_handle_mst(struct drm_device *dev, bool suspend) } } } - - drm_modeset_unlock(&dev->mode_config.connection_mutex); + drm_connector_list_iter_end(&iter); if (need_hotplug) drm_kms_helper_hotplug_event(dev); @@ -1217,6 +1217,7 @@ static int dm_resume(void *handle) struct amdgpu_display_manager *dm = &adev->dm; struct amdgpu_dm_connector *aconnector; struct drm_connector *connector; + struct drm_connector_list_iter iter; struct 
drm_crtc *crtc; struct drm_crtc_state *new_crtc_state; struct dm_crtc_state *dm_new_crtc_state; @@ -1249,7 +1250,8 @@ static int dm_resume(void *handle) s3_handle_mst(ddev, false); /* Do detection*/ - list_for_each_entry(connector, &ddev->mode_config.connector_list, head) { + drm_connector_list_iter_begin(ddev, &iter); + drm_for_each_connector_iter(connector, &iter) { aconnector = to_amdgpu_dm_connector(connector); /* @@ -1277,6 +1279,7 @@ static int dm_resume(void *handle) amdgpu_dm_update_connector_after_detect(aconnector); mutex_unlock(&aconnector->hpd_lock); } + drm_connector_list_iter_end(&iter); /* Force mode set in atomic commit */ for_each_new_crtc_in_state(dm->cached_state, crtc, new_crtc_state, i) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_irq.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_irq.c index fa5d503d379c..64445c4cc4c2 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_irq.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_irq.c @@ -732,8 +732,10 @@ void amdgpu_dm_hpd_init(struct amdgpu_device *adev) { struct drm_device *dev = adev->ddev; struct drm_connector *connector; + struct drm_connector_list_iter iter; - list_for_each_entry(connector, &dev->mode_config.connector_list, head) { + drm_connector_list_iter_begin(dev, &iter); + drm_for_each_connector_iter(connector, &iter) { struct amdgpu_dm_connector *amdgpu_dm_connector = to_amdgpu_dm_connector(connector); @@ -751,6 +753,7 @@ void amdgpu_dm_hpd_init(struct amdgpu_device *adev) true); } } + drm_connector_list_iter_end(&iter); } /** @@ -765,8 +768,10 @@ void amdgpu_dm_hpd_fini(struct amdgpu_device *adev) { struct drm_device *dev = adev->ddev; struct drm_connector *connector; + struct drm_connector_list_iter iter; - list_for_each_entry(connector, &dev->mode_config.connector_list, head) { + drm_connector_list_iter_begin(dev, &iter); + drm_for_each_connector_iter(connector, &iter) { struct amdgpu_dm_connector *amdgpu_dm_connector = to_amdgpu_dm_connector(connector); const struct dc_link *dc_link = amdgpu_dm_connector->dc_link; @@ -779,4 +784,5 @@ void amdgpu_dm_hpd_fini(struct amdgpu_device *adev) false); } } + drm_connector_list_iter_end(&iter); } -- cgit v1.2.3 From 71f98027f2c399805ad513f6675b88c92eea3302 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Fri, 4 Oct 2019 13:36:09 -0500 Subject: drm/amdgpu: move amdgpu_device_get_job_timeout_settings It's only used in amdgpu_device.c and the naming also reflects that. Move it there. Reviewed-by: Andrey Grodzovsky Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 4 +- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 64 ++++++++++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 67 ------------------------------ 3 files changed, 67 insertions(+), 68 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 578cd81814a0..c82579c93265 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -107,6 +107,8 @@ struct amdgpu_mgpu_info uint32_t num_apu; }; +#define AMDGPU_MAX_TIMEOUT_PARAM_LENTH 256 + /* * Modules parameters. 
*/ @@ -123,6 +125,7 @@ extern int amdgpu_disp_priority; extern int amdgpu_hw_i2c; extern int amdgpu_pcie_gen2; extern int amdgpu_msi; +extern char amdgpu_lockup_timeout[AMDGPU_MAX_TIMEOUT_PARAM_LENTH]; extern int amdgpu_dpm; extern int amdgpu_fw_load_type; extern int amdgpu_aspm; @@ -427,7 +430,6 @@ struct amdgpu_fpriv { }; int amdgpu_file_to_fpriv(struct file *filp, struct amdgpu_fpriv **fpriv); -int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev); int amdgpu_ib_get(struct amdgpu_device *adev, struct amdgpu_vm *vm, unsigned size, struct amdgpu_ib *ib); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 62955156653c..f25275abf408 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2564,6 +2564,70 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) adev->asic_reset_res, adev->ddev->unique); } +static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) +{ + char *input = amdgpu_lockup_timeout; + char *timeout_setting = NULL; + int index = 0; + long timeout; + int ret = 0; + + /* + * By default timeout for non compute jobs is 10000. + * And there is no timeout enforced on compute jobs. + * In SR-IOV or passthrough mode, timeout for compute + * jobs are 10000 by default. + */ + adev->gfx_timeout = msecs_to_jiffies(10000); + adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; + if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) + adev->compute_timeout = adev->gfx_timeout; + else + adev->compute_timeout = MAX_SCHEDULE_TIMEOUT; + + if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENTH)) { + while ((timeout_setting = strsep(&input, ",")) && + strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENTH)) { + ret = kstrtol(timeout_setting, 0, &timeout); + if (ret) + return ret; + + if (timeout == 0) { + index++; + continue; + } else if (timeout < 0) { + timeout = MAX_SCHEDULE_TIMEOUT; + } else { + timeout = msecs_to_jiffies(timeout); + } + + switch (index++) { + case 0: + adev->gfx_timeout = timeout; + break; + case 1: + adev->compute_timeout = timeout; + break; + case 2: + adev->sdma_timeout = timeout; + break; + case 3: + adev->video_timeout = timeout; + break; + default: + break; + } + } + /* + * There is only one value specified and + * it should apply to all non-compute jobs. + */ + if (index == 1) + adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; + } + + return ret; +} /** * amdgpu_device_init - initialize the driver diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 2ef72309e5ab..d43c46de7807 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -88,8 +88,6 @@ #define KMS_DRIVER_MINOR 35 #define KMS_DRIVER_PATCHLEVEL 0 -#define AMDGPU_MAX_TIMEOUT_PARAM_LENTH 256 - int amdgpu_vram_limit = 0; int amdgpu_vis_vram_limit = 0; int amdgpu_gart_size = -1; /* auto */ @@ -1293,71 +1291,6 @@ int amdgpu_file_to_fpriv(struct file *filp, struct amdgpu_fpriv **fpriv) return 0; } -int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) -{ - char *input = amdgpu_lockup_timeout; - char *timeout_setting = NULL; - int index = 0; - long timeout; - int ret = 0; - - /* - * By default timeout for non compute jobs is 10000. - * And there is no timeout enforced on compute jobs. - * In SR-IOV or passthrough mode, timeout for compute - * jobs are 10000 by default. 
- */ - adev->gfx_timeout = msecs_to_jiffies(10000); - adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; - if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) - adev->compute_timeout = adev->gfx_timeout; - else - adev->compute_timeout = MAX_SCHEDULE_TIMEOUT; - - if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENTH)) { - while ((timeout_setting = strsep(&input, ",")) && - strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENTH)) { - ret = kstrtol(timeout_setting, 0, &timeout); - if (ret) - return ret; - - if (timeout == 0) { - index++; - continue; - } else if (timeout < 0) { - timeout = MAX_SCHEDULE_TIMEOUT; - } else { - timeout = msecs_to_jiffies(timeout); - } - - switch (index++) { - case 0: - adev->gfx_timeout = timeout; - break; - case 1: - adev->compute_timeout = timeout; - break; - case 2: - adev->sdma_timeout = timeout; - break; - case 3: - adev->video_timeout = timeout; - break; - default: - break; - } - } - /* - * There is only one value specified and - * it should apply to all non-compute jobs. - */ - if (index == 1) - adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; - } - - return ret; -} - static bool amdgpu_get_crtc_scanout_position(struct drm_device *dev, unsigned int pipe, bool in_vblank_irq, int *vpos, int *hpos, -- cgit v1.2.3 From bcccee89f48c1e96b2aea067507e89f3581693bb Mon Sep 17 00:00:00 2001 From: Emily Deng Date: Tue, 15 Oct 2019 10:08:22 +0800 Subject: drm/amdgpu: Fix tdr3 could hang with slow compute issue When index is 1, need to set compute ring timeout for sriov and passthrough. Signed-off-by: Emily Deng Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 5 ++++- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 6 ++++-- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index f25275abf408..75a0db394e21 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2622,8 +2622,11 @@ static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) * There is only one value specified and * it should apply to all non-compute jobs. */ - if (index == 1) + if (index == 1) { adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout; + if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev)) + adev->compute_timeout = adev->gfx_timeout; + } } return ret; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 0473329548aa..62256fe70abd 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -249,9 +249,11 @@ module_param_named(msi, amdgpu_msi, int, 0444); * By default(with no lockup_timeout settings), the timeout for all non-compute(GFX, SDMA and Video) * jobs is 10000. And there is no timeout enforced on compute jobs. */ -MODULE_PARM_DESC(lockup_timeout, "GPU lockup timeout in ms (default: 10000 for non-compute jobs and infinity timeout for compute jobs." +MODULE_PARM_DESC(lockup_timeout, "GPU lockup timeout in ms (default: for bare metal 10000 for non-compute jobs and infinity timeout for compute jobs; " + "for passthrough or sriov, 10000 for all jobs." " 0: keep default value. 
negative: infinity timeout), " - "format is [Non-Compute] or [GFX,Compute,SDMA,Video]"); + "format: for bare metal [Non-Compute] or [GFX,Compute,SDMA,Video]; " + "for passthrough or sriov [all jobs] or [GFX,Compute,SDMA,Video]."); module_param_string(lockup_timeout, amdgpu_lockup_timeout, sizeof(amdgpu_lockup_timeout), 0444); /** -- cgit v1.2.3 From 803cc26d5cb6a3c93eab9124c1e218dc93066a6c Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Fri, 4 Oct 2019 14:33:39 -0500 Subject: drm/amdgpu: move pci_save_state into suspend path for amdgpu_device_suspend. This follows the logic in the resume path. Reviewed-by: Evan Quan Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 75a0db394e21..9532338ae164 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3171,8 +3171,8 @@ int amdgpu_device_suspend(struct drm_device *dev, bool suspend, bool fbcon) */ amdgpu_bo_evict_vram(adev); - pci_save_state(dev->pdev); if (suspend) { + pci_save_state(dev->pdev); /* Shut down the device */ pci_disable_device(dev->pdev); pci_set_power_state(dev->pdev, PCI_D3hot); -- cgit v1.2.3 From 897483d8a0e0896b214bda374e67373ee01166a6 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Fri, 4 Oct 2019 14:57:21 -0500 Subject: drm/amdgpu: move gpu reset out of amdgpu_device_suspend Move it into the caller. There are cases were we don't want it. We need it for hibernation, but we don't need it for runtime pm, so drop it for runtime pm. Reviewed-by: Evan Quan Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 ---- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 7 ++++++- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 9532338ae164..b29bdf22c8bf 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3176,10 +3176,6 @@ int amdgpu_device_suspend(struct drm_device *dev, bool suspend, bool fbcon) /* Shut down the device */ pci_disable_device(dev->pdev); pci_set_power_state(dev->pdev, PCI_D3hot); - } else { - r = amdgpu_asic_reset(adev); - if (r) - DRM_ERROR("amdgpu asic reset failed\n"); } return 0; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 62256fe70abd..1094a98fb6f9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -1155,8 +1155,13 @@ static int amdgpu_pmops_resume(struct device *dev) static int amdgpu_pmops_freeze(struct device *dev) { struct drm_device *drm_dev = dev_get_drvdata(dev); + struct amdgpu_device *adev = drm_dev->dev_private; + int r; - return amdgpu_device_suspend(drm_dev, false, true); + r = amdgpu_device_suspend(drm_dev, false, true); + if (r) + return r; + return amdgpu_asic_reset(adev); } static int amdgpu_pmops_thaw(struct device *dev) -- cgit v1.2.3 From e35e2b117f4dba2761d96e378c28734f6807f227 Mon Sep 17 00:00:00 2001 From: "Tianci.Yin" Date: Mon, 30 Sep 2019 13:33:50 +0800 Subject: drm/amdgpu: add a generic fb accessing helper function(v3) add a generic helper function for accessing framebuffer via MMIO Reviewed-by: Alex Deucher Reviewed-by: Luben Tuikov Signed-off-by: 
Tianci.Yin Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 30 +++++++++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c | 12 +---------- 3 files changed, 33 insertions(+), 11 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index b42fa21de806..ef195f293000 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -981,6 +981,8 @@ int amdgpu_device_init(struct amdgpu_device *adev, void amdgpu_device_fini(struct amdgpu_device *adev); int amdgpu_gpu_wait_for_idle(struct amdgpu_device *adev); +void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos, + uint32_t *buf, size_t size, bool write); uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg, uint32_t acc_flags); void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index b29bdf22c8bf..46218b36dc9e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -153,6 +153,36 @@ bool amdgpu_device_is_px(struct drm_device *dev) return false; } +/** + * VRAM access helper functions. + * + * amdgpu_device_vram_access - read/write a buffer in vram + * + * @adev: amdgpu_device pointer + * @pos: offset of the buffer in vram + * @buf: virtual address of the buffer in system memory + * @size: read/write size, sizeof(@buf) must > @size + * @write: true - write to vram, otherwise - read from vram + */ +void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos, + uint32_t *buf, size_t size, bool write) +{ + uint64_t last; + unsigned long flags; + + last = size - 4; + for (last += pos; pos <= last; pos += 4) { + spin_lock_irqsave(&adev->mmio_idx_lock, flags); + WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000); + WREG32_NO_KIQ(mmMM_INDEX_HI, pos >> 31); + if (write) + WREG32_NO_KIQ(mmMM_DATA, *buf++); + else + *buf++ = RREG32_NO_KIQ(mmMM_DATA); + spin_unlock_irqrestore(&adev->mmio_idx_lock, flags); + } +} + /* * MMIO register access helper functions. */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c index ddd8364102a2..f95092741c38 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c @@ -134,20 +134,10 @@ static int hw_id_map[MAX_HWIP] = { static int amdgpu_discovery_read_binary(struct amdgpu_device *adev, uint8_t *binary) { - uint32_t *p = (uint32_t *)binary; uint64_t vram_size = (uint64_t)RREG32(mmRCC_CONFIG_MEMSIZE) << 20; uint64_t pos = vram_size - DISCOVERY_TMR_SIZE; - unsigned long flags; - - while (pos < vram_size) { - spin_lock_irqsave(&adev->mmio_idx_lock, flags); - WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000); - WREG32_NO_KIQ(mmMM_INDEX_HI, pos >> 31); - *p++ = RREG32_NO_KIQ(mmMM_DATA); - spin_unlock_irqrestore(&adev->mmio_idx_lock, flags); - pos += 4; - } + amdgpu_device_vram_access(adev, pos, (uint32_t *)binary, DISCOVERY_TMR_SIZE, false); return 0; } -- cgit v1.2.3 From 121a2bc6ae786f59c7327a2b54c8bed5779e67e6 Mon Sep 17 00:00:00 2001 From: Andrey Grodzovsky Date: Fri, 18 Oct 2019 16:15:04 -0400 Subject: drm/amdgpu: Move amdgpu_ras_recovery_init to after SMU ready. 
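As a usage illustration for the amdgpu_device_vram_access() helper introduced above (the offset and buffer here are hypothetical, chosen only for the example; the signature is the one added by that patch):

	/* Read 64 bytes of VRAM, 4 bytes at a time, through the MM_INDEX/MM_DATA window. */
	uint32_t buf[16];
	uint64_t pos = 0x100000;	/* hypothetical VRAM offset */

	amdgpu_device_vram_access(adev, pos, buf, sizeof(buf), false);	/* false = read */

Passing true as the last argument writes the buffer back to VRAM instead, as the discovery-table reader above now does for reads.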
For Arcturus the I2C traffic is done through SMU tables and so we must postpone RAS recovery init to after they are ready which is in amdgpu_device_ip_hw_init_phase2. Signed-off-by: Andrey Grodzovsky Reviewed-by: Alex Deucher Reviewed-by: Evan Quan Reviewed-by: Guchun Chen Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 13 +++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 11 ----------- 2 files changed, 13 insertions(+), 11 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 46218b36dc9e..83d80070f0e9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -1877,6 +1877,19 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) if (r) goto init_failed; + /* + * retired pages will be loaded from eeprom and reserved here, + * it should be called after amdgpu_device_ip_hw_init_phase2 since + * for some ASICs the RAS EEPROM code relies on SMU fully functioning + * for I2C communication which only true at this point. + * recovery_init may fail, but it can free all resources allocated by + * itself and its failure should not stop amdgpu init process. + * + * Note: theoretically, this should be called before all vram allocations + * to protect retired page from abusing + */ + amdgpu_ras_recovery_init(adev); + if (adev->gmc.xgmi.num_physical_nodes > 1) amdgpu_xgmi_add_device(adev); amdgpu_amdkfd_device_init(adev); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index 69e54308f080..41eb2af59445 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -1802,17 +1802,6 @@ int amdgpu_ttm_init(struct amdgpu_device *adev) adev->gmc.visible_vram_size); #endif - /* - * retired pages will be loaded from eeprom and reserved here, - * it should be called after ttm init since new bo may be created, - * recovery_init may fail, but it can free all resources allocated by - * itself and its failure should not stop amdgpu init process. - * - * Note: theoretically, this should be called before all vram allocations - * to protect retired page from abusing - */ - amdgpu_ras_recovery_init(adev); - /* *The reserved vram for firmware must be pinned to the specified *place on the VRAM, so reserve it early. -- cgit v1.2.3 From f440ff44b101796beec8ac63252f254de7f2de1c Mon Sep 17 00:00:00 2001 From: Wambui Karuga Date: Mon, 28 Oct 2019 12:20:05 +0300 Subject: drm/amd: correct "_LENTH" mispelling in constant Correct the "_LENTH" mispelling in the AMDGPU_MAX_TIMEOUT_PARAM_LENGTH constant. Signed-off-by: Wambui Karuga Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 4 ++-- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 ++-- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index c9acf824f0be..0bb08e0e1611 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -107,7 +107,7 @@ struct amdgpu_mgpu_info uint32_t num_apu; }; -#define AMDGPU_MAX_TIMEOUT_PARAM_LENTH 256 +#define AMDGPU_MAX_TIMEOUT_PARAM_LENGTH 256 /* * Modules parameters. 
@@ -125,7 +125,7 @@ extern int amdgpu_disp_priority; extern int amdgpu_hw_i2c; extern int amdgpu_pcie_gen2; extern int amdgpu_msi; -extern char amdgpu_lockup_timeout[AMDGPU_MAX_TIMEOUT_PARAM_LENTH]; +extern char amdgpu_lockup_timeout[AMDGPU_MAX_TIMEOUT_PARAM_LENGTH]; extern int amdgpu_dpm; extern int amdgpu_fw_load_type; extern int amdgpu_aspm; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 83d80070f0e9..c03089503b0f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2628,9 +2628,9 @@ static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev) else adev->compute_timeout = MAX_SCHEDULE_TIMEOUT; - if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENTH)) { + if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { while ((timeout_setting = strsep(&input, ",")) && - strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENTH)) { + strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) { ret = kstrtol(timeout_setting, 0, &timeout); if (ret) return ret; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 3378b4209162..ab753e491937 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -101,7 +101,7 @@ int amdgpu_disp_priority = 0; int amdgpu_hw_i2c = 0; int amdgpu_pcie_gen2 = -1; int amdgpu_msi = -1; -char amdgpu_lockup_timeout[AMDGPU_MAX_TIMEOUT_PARAM_LENTH]; +char amdgpu_lockup_timeout[AMDGPU_MAX_TIMEOUT_PARAM_LENGTH]; int amdgpu_dpm = -1; int amdgpu_fw_load_type = -1; int amdgpu_aspm = -1; -- cgit v1.2.3 From bff77e86a3776fab6859bc168ecda6ccf56bfbd2 Mon Sep 17 00:00:00 2001 From: Le Ma Date: Fri, 25 Oct 2019 17:48:52 +0800 Subject: drm/amdgpu: bypass some cleanup work after err_event_athub (v2) PSP lost connection when err_event_athub occurs. These cleanup work can be skipped in BACO reset. 
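The recurring pattern in the hunks below is a single guard on the RAS interrupt flag around each teardown step that needs a working PSP/SMU; condensed here for illustration (taken from the amdgpu_device_ip_suspend_phase2() hunk that follows):

	/* PSP lost connection after err_event_athub: skip the HW handshake and
	 * just mark the IP block as powered down so the reset path can proceed.
	 */
	if (amdgpu_ras_intr_triggered() &&
	    adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
		adev->ip_blocks[i].status.hw = false;
		continue;
	}
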
v2: squash in missing include (Alex) Signed-off-by: Le Ma Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 6 ++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 9 +++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 20 +++++++++++--------- drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 6 ++++-- 4 files changed, 30 insertions(+), 11 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index c03089503b0f..d36d2b093539 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2271,6 +2271,12 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) /* displays are handled in phase1 */ if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) continue; + /* PSP lost connection when err_event_athub occurs */ + if (amdgpu_ras_intr_triggered() && + adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) { + adev->ip_blocks[i].status.hw = false; + continue; + } /* XXX handle errors */ r = adev->ip_blocks[i].version->funcs->suspend(adev); /* XXX handle errors */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c index fd7a73f4fa70..bbe9ac7e843f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c @@ -34,6 +34,8 @@ #include "psp_v11_0.h" #include "psp_v12_0.h" +#include "amdgpu_ras.h" + static void psp_set_funcs(struct amdgpu_device *adev); static int psp_early_init(void *handle) @@ -167,6 +169,13 @@ psp_cmd_submit_buf(struct psp_context *psp, while (*((unsigned int *)psp->fence_buf) != index) { if (--timeout == 0) break; + /* + * Shouldn't wait for timeout when err_event_athub occurs, + * because gpu reset thread triggered and lock resource should + * be released for psp resume sequence. + */ + if (amdgpu_ras_intr_triggered()) + break; msleep(1); amdgpu_asic_invalidate_hdp(psp->adev, NULL); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 796326b36e00..dab90c280476 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -558,15 +558,17 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev, if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head))) return 0; - ret = psp_ras_enable_features(&adev->psp, &info, enable); - if (ret) { - DRM_ERROR("RAS ERROR: %s %s feature failed ret %d\n", - enable ? "enable":"disable", - ras_block_str(head->block), - ret); - if (ret == TA_RAS_STATUS__RESET_NEEDED) - return -EAGAIN; - return -EINVAL; + if (!amdgpu_ras_intr_triggered()) { + ret = psp_ras_enable_features(&adev->psp, &info, enable); + if (ret) { + DRM_ERROR("RAS ERROR: %s %s feature failed ret %d\n", + enable ? 
"enable":"disable", + ras_block_str(head->block), + ret); + if (ret == TA_RAS_STATUS__RESET_NEEDED) + return -EAGAIN; + return -EINVAL; + } } /* setup the obj */ diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index 088c6a734a1a..d694be9a8c39 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -3736,8 +3736,10 @@ static int gfx_v9_0_hw_fini(void *handle) amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0); amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0); - /* disable KCQ to avoid CPC touch memory not valid anymore */ - gfx_v9_0_kcq_disable(adev); + /* DF freeze and kcq disable will fail */ + if (!amdgpu_ras_intr_triggered()) + /* disable KCQ to avoid CPC touch memory not valid anymore */ + gfx_v9_0_kcq_disable(adev); if (amdgpu_sriov_vf(adev)) { gfx_v9_0_cp_gfx_enable(adev, false); -- cgit v1.2.3 From 39ea6e5f9e2d6e094b7a48592ecae6c69a82e471 Mon Sep 17 00:00:00 2001 From: Evan Quan Date: Thu, 31 Oct 2019 14:10:27 +0800 Subject: drm/amdgpu: change pstate only after all XGMI device initialized Pstate settings should be performed after all device of the XGMI setup get initialized. Signed-off-by: Evan Quan Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index d36d2b093539..40ff06b99e5b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2082,9 +2082,6 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) if (r) DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); - /* set to low pstate by default */ - amdgpu_xgmi_set_pstate(adev, 0); - return 0; } @@ -2197,6 +2194,18 @@ static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) r = amdgpu_ib_ring_tests(adev); if (r) DRM_ERROR("ib ring test failed (%d).\n", r); + + /* + * set to low pstate by default + * This should be performed after all devices from + * XGMI finish their initializations. Thus it's moved + * to here. + * The time delay is 2S. TODO: confirm whether that + * is enough for all possible XGMI setups. + */ + r = amdgpu_xgmi_set_pstate(adev, 0); + if (r) + DRM_ERROR("pstate setting failed (%d).\n", r); } static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) -- cgit v1.2.3 From b0adca4d50169d9a05912e208a89cdd615da40d4 Mon Sep 17 00:00:00 2001 From: Evan Quan Date: Tue, 5 Nov 2019 18:13:49 +0800 Subject: drm/amdgpu: register gpu instance before fan boost feature enablment Otherwise, the feature enablement will be skipped due to wrong count. 
Fixes: beff74bc6e0fa91 ("drm/amdgpu: fix a race in GPU reset with IB test (v2)") Signed-off-by: Evan Quan Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 7 +++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 1 - 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 40ff06b99e5b..7b33058c4e03 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3029,6 +3029,13 @@ fence_driver_init: DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n"); } + /* + * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost. + * Otherwise the mgpu fan boost feature will be skipped due to the + * gpu instance is counted less. + */ + amdgpu_register_gpu_instance(adev); + /* enable clockgating, etc. after ib tests, etc. since some blocks require * explicit gating rather than handling it automatically. */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c index 137a8573d556..82c46c4faaad 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c @@ -190,7 +190,6 @@ int amdgpu_driver_load_kms(struct drm_device *dev, unsigned long flags) pm_runtime_put_autosuspend(dev->dev); } - amdgpu_register_gpu_instance(adev); out: if (r) { /* balance pm_runtime_get_sync in amdgpu_driver_unload_kms */ -- cgit v1.2.3 From 60599a03638a611574a177fa6e6d7394abad0d7f Mon Sep 17 00:00:00 2001 From: Evan Quan Date: Tue, 5 Nov 2019 15:15:33 +0800 Subject: drm/amdgpu: perform p-state switch after the whole hive initialized P-state switch should be performed after all devices from the hive get initialized. Signed-off-by: Evan Quan Reviewed-by: Feifei Xu Reviewed-by: Jonathan Kim Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 47 ++++++++++++++++++++++-------- 1 file changed, 35 insertions(+), 12 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 7b33058c4e03..58f6b3b92831 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2057,6 +2057,7 @@ out: */ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) { + struct amdgpu_gpu_instance *gpu_instance; int i = 0, r; for (i = 0; i < adev->num_ip_blocks; i++) { @@ -2082,6 +2083,40 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev) if (r) DRM_ERROR("enable mgpu fan boost failed (%d).\n", r); + + if (adev->gmc.xgmi.num_physical_nodes > 1) { + mutex_lock(&mgpu_info.mutex); + + /* + * Reset device p-state to low as this was booted with high. + * + * This should be performed only after all devices from the same + * hive get initialized. + * + * However, it's unknown how many device in the hive in advance. + * As this is counted one by one during devices initializations. + * + * So, we wait for all XGMI interlinked devices initialized. + * This may bring some delays as those devices may come from + * different hives. But that should be OK. 
+ */ + if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) { + for (i = 0; i < mgpu_info.num_gpu; i++) { + gpu_instance = &(mgpu_info.gpu_ins[i]); + if (gpu_instance->adev->flags & AMD_IS_APU) + continue; + + r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 0); + if (r) { + DRM_ERROR("pstate setting failed (%d).\n", r); + break; + } + } + } + + mutex_unlock(&mgpu_info.mutex); + } + return 0; } @@ -2194,18 +2229,6 @@ static void amdgpu_device_delayed_init_work_handler(struct work_struct *work) r = amdgpu_ib_ring_tests(adev); if (r) DRM_ERROR("ib ring test failed (%d).\n", r); - - /* - * set to low pstate by default - * This should be performed after all devices from - * XGMI finish their initializations. Thus it's moved - * to here. - * The time delay is 2S. TODO: confirm whether that - * is enough for all possible XGMI setups. - */ - r = amdgpu_xgmi_set_pstate(adev, 0); - if (r) - DRM_ERROR("pstate setting failed (%d).\n", r); } static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work) -- cgit v1.2.3 From 9f87516764a9f2a6a6a9552844efb41d9748afec Mon Sep 17 00:00:00 2001 From: Jesse Zhang Date: Fri, 8 Nov 2019 18:06:07 +0800 Subject: drm/amd/amdgpu: finish delay works before release resources MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Flush/cancel delayed works before doing finalization to avoid concurrent requests. Signed-off-by: Jesse Zhang Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 +++ drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c | 1 + 3 files changed, 5 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 58f6b3b92831..8ff69a5c2327 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3110,6 +3110,9 @@ void amdgpu_device_fini(struct amdgpu_device *adev) DRM_INFO("amdgpu: finishing device.\n"); adev->shutdown = true; + + flush_delayed_work(&adev->delayed_init_work); + /* disable all interrupts */ amdgpu_irq_disable_all(adev); if (adev->mode_info.mode_config_initialized){ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c index b4dd89af6f09..e324bfe6c58f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c @@ -299,6 +299,7 @@ int amdgpu_uvd_sw_fini(struct amdgpu_device *adev) { int i, j; + cancel_delayed_work_sync(&adev->uvd.idle_work); drm_sched_entity_destroy(&adev->uvd.entity); for (j = 0; j < adev->uvd.num_uvd_inst; ++j) { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c index 703677f2ff6f..46b590af2fd2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c @@ -216,6 +216,7 @@ int amdgpu_vce_sw_fini(struct amdgpu_device *adev) if (adev->vce.vcpu_bo == NULL) return 0; + cancel_delayed_work_sync(&adev->vce.idle_work); drm_sched_entity_destroy(&adev->vce.entity); amdgpu_bo_free_kernel(&adev->vce.vcpu_bo, &adev->vce.gpu_addr, -- cgit v1.2.3 From c0e21ea1d0b557bdedd5b54d529162f74e7ef407 Mon Sep 17 00:00:00 2001 From: Yintian Tao Date: Mon, 18 Nov 2019 16:06:00 +0800 Subject: drm/amdgpu: put flush_delayed_work at first MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is one regression from commit 042f3d7b745cd76aa: putting
flush_delayed_work after adev->shutdown = true makes amdgpu_ih_process stop responding to the IRQ, so in the end all IB ring tests fail, as shown below: [drm] amdgpu: finishing device. [drm] Fence fallback timer expired on ring gfx [drm] Fence fallback timer expired on ring comp_1.0.0 [drm] Fence fallback timer expired on ring comp_1.1.0 [drm] Fence fallback timer expired on ring comp_1.2.0 [drm] Fence fallback timer expired on ring comp_1.3.0 [drm] Fence fallback timer expired on ring comp_1.0.1 amdgpu 0000:00:07.0: [drm:amdgpu_ib_ring_tests [amdgpu]] *ERROR* IB test failed on comp_1.1.1 (-110). amdgpu 0000:00:07.0: [drm:amdgpu_ib_ring_tests [amdgpu]] *ERROR* IB test failed on comp_1.2.1 (-110). amdgpu 0000:00:07.0: [drm:amdgpu_ib_ring_tests [amdgpu]] *ERROR* IB test failed on comp_1.3.1 (-110). amdgpu 0000:00:07.0: [drm:amdgpu_ib_ring_tests [amdgpu]] *ERROR* IB test failed on sdma0 (-110). amdgpu 0000:00:07.0: [drm:amdgpu_ib_ring_tests [amdgpu]] *ERROR* IB test failed on sdma1 (-110). amdgpu 0000:00:07.0: [drm:amdgpu_ib_ring_tests [amdgpu]] *ERROR* IB test failed on uvd_enc_0.0 (-110). amdgpu 0000:00:07.0: [drm:amdgpu_ib_ring_tests [amdgpu]] *ERROR* IB test failed on vce0 (-110). [drm:amdgpu_device_delayed_init_work_handler [amdgpu]] *ERROR* ib ring test failed (-110). v2: replace cancel_delayed_work_sync() with flush_delayed_work() Signed-off-by: Yintian Tao Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 8ff69a5c2327..4f76beafb2fa 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3109,9 +3109,8 @@ void amdgpu_device_fini(struct amdgpu_device *adev) int r; DRM_INFO("amdgpu: finishing device.\n"); - adev->shutdown = true; - flush_delayed_work(&adev->delayed_init_work); + adev->shutdown = true; /* disable all interrupts */ amdgpu_irq_disable_all(adev); @@ -3130,7 +3129,6 @@ void amdgpu_device_fini(struct amdgpu_device *adev) adev->firmware.gpu_info_fw = NULL; } adev->accel_working = false; - cancel_delayed_work_sync(&adev->delayed_init_work); /* free i2c buses */ if (!amdgpu_device_has_dc_support(adev)) amdgpu_i2c_fini(adev); -- cgit v1.2.3 From 62914a99dee5ac51253a84e7d4a05c18f0c77535 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 12 Nov 2019 16:22:28 -0400 Subject: drm/amdgpu: Use mmu_interval_insert instead of hmm_mirror Remove the interval tree in the driver and rely on the tree maintained by the mmu_notifier for delivering mmu_notifier invalidation callbacks. For some reason amdgpu has a very complicated arrangement where it tries to prevent duplicate entries in the interval_tree; this is not necessary, as each amdgpu_bo can be its own standalone entry. interval_tree already allows duplicates and overlaps in the tree. Also, there is no need to remove entries upon a release callback; the mmu_interval API safely allows objects to remain registered beyond the lifetime of the mm. The driver only has to stop touching the pages during release.
Link: https://lore.kernel.org/r/20191112202231.3856-12-jgg@ziepe.ca Reviewed-by: Philip Yang Tested-by: Philip Yang Signed-off-by: Jason Gunthorpe --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 + drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 5 +- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c | 333 +++++------------------ drivers/gpu/drm/amd/amdgpu/amdgpu_mn.h | 4 - drivers/gpu/drm/amd/amdgpu/amdgpu_object.h | 13 +- 6 files changed, 77 insertions(+), 281 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index bd37df5dd6d0..60591a5d4200 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1006,6 +1006,8 @@ struct amdgpu_device { struct mutex lock_reset; struct amdgpu_doorbell_index doorbell_index; + struct mutex notifier_lock; + int asic_reset_res; struct work_struct xgmi_reset_work; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 6d021ecc8d59..47700302a08b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -481,8 +481,7 @@ static void remove_kgd_mem_from_kfd_bo_list(struct kgd_mem *mem, * * Returns 0 for success, negative errno for errors. */ -static int init_user_pages(struct kgd_mem *mem, struct mm_struct *mm, - uint64_t user_addr) +static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr) { struct amdkfd_process_info *process_info = mem->process_info; struct amdgpu_bo *bo = mem->bo; @@ -1195,7 +1194,7 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu( add_kgd_mem_to_kfd_bo_list(*mem, avm->process_info, user_addr); if (user_addr) { - ret = init_user_pages(*mem, current->mm, user_addr); + ret = init_user_pages(*mem, user_addr); if (ret) goto allocate_init_user_pages_failed; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 5a1939dbd4e3..38f97998aadd 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2633,6 +2633,7 @@ int amdgpu_device_init(struct amdgpu_device *adev, mutex_init(&adev->virt.vf_errors.lock); hash_init(adev->mn_hash); mutex_init(&adev->lock_reset); + mutex_init(&adev->notifier_lock); mutex_init(&adev->virt.dpm_mutex); mutex_init(&adev->psp.mutex); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c index 31d4deb5d294..9fe1c31ce17a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c @@ -50,66 +50,6 @@ #include "amdgpu.h" #include "amdgpu_amdkfd.h" -/** - * struct amdgpu_mn_node - * - * @it: interval node defining start-last of the affected address range - * @bos: list of all BOs in the affected address range - * - * Manages all BOs which are affected of a certain range of address space. 
- */ -struct amdgpu_mn_node { - struct interval_tree_node it; - struct list_head bos; -}; - -/** - * amdgpu_mn_destroy - destroy the HMM mirror - * - * @work: previously sheduled work item - * - * Lazy destroys the notifier from a work item - */ -static void amdgpu_mn_destroy(struct work_struct *work) -{ - struct amdgpu_mn *amn = container_of(work, struct amdgpu_mn, work); - struct amdgpu_device *adev = amn->adev; - struct amdgpu_mn_node *node, *next_node; - struct amdgpu_bo *bo, *next_bo; - - mutex_lock(&adev->mn_lock); - down_write(&amn->lock); - hash_del(&amn->node); - rbtree_postorder_for_each_entry_safe(node, next_node, - &amn->objects.rb_root, it.rb) { - list_for_each_entry_safe(bo, next_bo, &node->bos, mn_list) { - bo->mn = NULL; - list_del_init(&bo->mn_list); - } - kfree(node); - } - up_write(&amn->lock); - mutex_unlock(&adev->mn_lock); - - hmm_mirror_unregister(&amn->mirror); - kfree(amn); -} - -/** - * amdgpu_hmm_mirror_release - callback to notify about mm destruction - * - * @mirror: the HMM mirror (mm) this callback is about - * - * Shedule a work item to lazy destroy HMM mirror. - */ -static void amdgpu_hmm_mirror_release(struct hmm_mirror *mirror) -{ - struct amdgpu_mn *amn = container_of(mirror, struct amdgpu_mn, mirror); - - INIT_WORK(&amn->work, amdgpu_mn_destroy); - schedule_work(&amn->work); -} - /** * amdgpu_mn_lock - take the write side lock for this notifier * @@ -133,157 +73,80 @@ void amdgpu_mn_unlock(struct amdgpu_mn *mn) } /** - * amdgpu_mn_read_lock - take the read side lock for this notifier - * - * @amn: our notifier - */ -static int amdgpu_mn_read_lock(struct amdgpu_mn *amn, bool blockable) -{ - if (blockable) - down_read(&amn->lock); - else if (!down_read_trylock(&amn->lock)) - return -EAGAIN; - - return 0; -} - -/** - * amdgpu_mn_read_unlock - drop the read side lock for this notifier - * - * @amn: our notifier - */ -static void amdgpu_mn_read_unlock(struct amdgpu_mn *amn) -{ - up_read(&amn->lock); -} - -/** - * amdgpu_mn_invalidate_node - unmap all BOs of a node + * amdgpu_mn_invalidate_gfx - callback to notify about mm change * - * @node: the node with the BOs to unmap - * @start: start of address range affected - * @end: end of address range affected + * @mni: the range (mm) is about to update + * @range: details on the invalidation + * @cur_seq: Value to pass to mmu_interval_set_seq() * * Block for operations on BOs to finish and mark pages as accessed and * potentially dirty. 
*/ -static void amdgpu_mn_invalidate_node(struct amdgpu_mn_node *node, - unsigned long start, - unsigned long end) +static bool amdgpu_mn_invalidate_gfx(struct mmu_interval_notifier *mni, + const struct mmu_notifier_range *range, + unsigned long cur_seq) { - struct amdgpu_bo *bo; + struct amdgpu_bo *bo = container_of(mni, struct amdgpu_bo, notifier); + struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev); long r; - list_for_each_entry(bo, &node->bos, mn_list) { - - if (!amdgpu_ttm_tt_affect_userptr(bo->tbo.ttm, start, end)) - continue; + if (!mmu_notifier_range_blockable(range)) + return false; - r = dma_resv_wait_timeout_rcu(bo->tbo.base.resv, - true, false, MAX_SCHEDULE_TIMEOUT); - if (r <= 0) - DRM_ERROR("(%ld) failed to wait for user bo\n", r); - } + mutex_lock(&adev->notifier_lock); + r = dma_resv_wait_timeout_rcu(bo->tbo.base.resv, true, false, + MAX_SCHEDULE_TIMEOUT); + mutex_unlock(&adev->notifier_lock); + if (r <= 0) + DRM_ERROR("(%ld) failed to wait for user bo\n", r); + return true; } +static const struct mmu_interval_notifier_ops amdgpu_mn_gfx_ops = { + .invalidate = amdgpu_mn_invalidate_gfx, +}; + /** - * amdgpu_mn_sync_pagetables_gfx - callback to notify about mm change + * amdgpu_mn_invalidate_hsa - callback to notify about mm change * - * @mirror: the hmm_mirror (mm) is about to update - * @update: the update start, end address + * @mni: the range (mm) is about to update + * @range: details on the invalidation + * @cur_seq: Value to pass to mmu_interval_set_seq() * - * Block for operations on BOs to finish and mark pages as accessed and - * potentially dirty. + * We temporarily evict the BO attached to this range. This necessitates + * evicting all user-mode queues of the process. */ -static int -amdgpu_mn_sync_pagetables_gfx(struct hmm_mirror *mirror, - const struct mmu_notifier_range *update) +static bool amdgpu_mn_invalidate_hsa(struct mmu_interval_notifier *mni, + const struct mmu_notifier_range *range, + unsigned long cur_seq) { - struct amdgpu_mn *amn = container_of(mirror, struct amdgpu_mn, mirror); - unsigned long start = update->start; - unsigned long end = update->end; - bool blockable = mmu_notifier_range_blockable(update); - struct interval_tree_node *it; - - /* notification is exclusive, but interval is inclusive */ - end -= 1; - - /* TODO we should be able to split locking for interval tree and - * amdgpu_mn_invalidate_node - */ - if (amdgpu_mn_read_lock(amn, blockable)) - return -EAGAIN; - - it = interval_tree_iter_first(&amn->objects, start, end); - while (it) { - struct amdgpu_mn_node *node; - - if (!blockable) { - amdgpu_mn_read_unlock(amn); - return -EAGAIN; - } - - node = container_of(it, struct amdgpu_mn_node, it); - it = interval_tree_iter_next(it, start, end); + struct amdgpu_bo *bo = container_of(mni, struct amdgpu_bo, notifier); + struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev); - amdgpu_mn_invalidate_node(node, start, end); - } + if (!mmu_notifier_range_blockable(range)) + return false; - amdgpu_mn_read_unlock(amn); + mutex_lock(&adev->notifier_lock); + amdgpu_amdkfd_evict_userptr(bo->kfd_bo, bo->notifier.mm); + mutex_unlock(&adev->notifier_lock); - return 0; + return true; } -/** - * amdgpu_mn_sync_pagetables_hsa - callback to notify about mm change - * - * @mirror: the hmm_mirror (mm) is about to update - * @update: the update start, end address - * - * We temporarily evict all BOs between start and end. This - * necessitates evicting all user-mode queues of the process. 
The BOs - * are restorted in amdgpu_mn_invalidate_range_end_hsa. - */ -static int -amdgpu_mn_sync_pagetables_hsa(struct hmm_mirror *mirror, - const struct mmu_notifier_range *update) +static const struct mmu_interval_notifier_ops amdgpu_mn_hsa_ops = { + .invalidate = amdgpu_mn_invalidate_hsa, +}; + +static int amdgpu_mn_sync_pagetables(struct hmm_mirror *mirror, + const struct mmu_notifier_range *update) { struct amdgpu_mn *amn = container_of(mirror, struct amdgpu_mn, mirror); - unsigned long start = update->start; - unsigned long end = update->end; - bool blockable = mmu_notifier_range_blockable(update); - struct interval_tree_node *it; - /* notification is exclusive, but interval is inclusive */ - end -= 1; - - if (amdgpu_mn_read_lock(amn, blockable)) + if (!mmu_notifier_range_blockable(update)) return -EAGAIN; - it = interval_tree_iter_first(&amn->objects, start, end); - while (it) { - struct amdgpu_mn_node *node; - struct amdgpu_bo *bo; - - if (!blockable) { - amdgpu_mn_read_unlock(amn); - return -EAGAIN; - } - - node = container_of(it, struct amdgpu_mn_node, it); - it = interval_tree_iter_next(it, start, end); - - list_for_each_entry(bo, &node->bos, mn_list) { - struct kgd_mem *mem = bo->kfd_bo; - - if (amdgpu_ttm_tt_affect_userptr(bo->tbo.ttm, - start, end)) - amdgpu_amdkfd_evict_userptr(mem, amn->mm); - } - } - - amdgpu_mn_read_unlock(amn); - + down_read(&amn->lock); + up_read(&amn->lock); return 0; } @@ -295,12 +158,10 @@ amdgpu_mn_sync_pagetables_hsa(struct hmm_mirror *mirror, static struct hmm_mirror_ops amdgpu_hmm_mirror_ops[] = { [AMDGPU_MN_TYPE_GFX] = { - .sync_cpu_device_pagetables = amdgpu_mn_sync_pagetables_gfx, - .release = amdgpu_hmm_mirror_release + .sync_cpu_device_pagetables = amdgpu_mn_sync_pagetables, }, [AMDGPU_MN_TYPE_HSA] = { - .sync_cpu_device_pagetables = amdgpu_mn_sync_pagetables_hsa, - .release = amdgpu_hmm_mirror_release + .sync_cpu_device_pagetables = amdgpu_mn_sync_pagetables, }, }; @@ -327,7 +188,8 @@ struct amdgpu_mn *amdgpu_mn_get(struct amdgpu_device *adev, } hash_for_each_possible(adev->mn_hash, amn, node, key) - if (AMDGPU_MN_KEY(amn->mm, amn->type) == key) + if (AMDGPU_MN_KEY(amn->mirror.hmm->mmu_notifier.mm, + amn->type) == key) goto release_locks; amn = kzalloc(sizeof(*amn), GFP_KERNEL); @@ -337,10 +199,8 @@ struct amdgpu_mn *amdgpu_mn_get(struct amdgpu_device *adev, } amn->adev = adev; - amn->mm = mm; init_rwsem(&amn->lock); amn->type = type; - amn->objects = RB_ROOT_CACHED; amn->mirror.ops = &amdgpu_hmm_mirror_ops[type]; r = hmm_mirror_register(&amn->mirror, mm); @@ -369,100 +229,33 @@ free_amn: * @bo: amdgpu buffer object * @addr: userptr addr we should monitor * - * Registers an HMM mirror for the given BO at the specified address. + * Registers a mmu_notifier for the given BO at the specified address. * Returns 0 on success, -ERRNO if anything goes wrong. */ int amdgpu_mn_register(struct amdgpu_bo *bo, unsigned long addr) { - unsigned long end = addr + amdgpu_bo_size(bo) - 1; - struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev); - enum amdgpu_mn_type type = - bo->kfd_bo ? 
AMDGPU_MN_TYPE_HSA : AMDGPU_MN_TYPE_GFX; - struct amdgpu_mn *amn; - struct amdgpu_mn_node *node = NULL, *new_node; - struct list_head bos; - struct interval_tree_node *it; - - amn = amdgpu_mn_get(adev, type); - if (IS_ERR(amn)) - return PTR_ERR(amn); - - new_node = kmalloc(sizeof(*new_node), GFP_KERNEL); - if (!new_node) - return -ENOMEM; - - INIT_LIST_HEAD(&bos); - - down_write(&amn->lock); - - while ((it = interval_tree_iter_first(&amn->objects, addr, end))) { - kfree(node); - node = container_of(it, struct amdgpu_mn_node, it); - interval_tree_remove(&node->it, &amn->objects); - addr = min(it->start, addr); - end = max(it->last, end); - list_splice(&node->bos, &bos); - } - - if (!node) - node = new_node; + if (bo->kfd_bo) + bo->notifier.ops = &amdgpu_mn_hsa_ops; else - kfree(new_node); - - bo->mn = amn; - - node->it.start = addr; - node->it.last = end; - INIT_LIST_HEAD(&node->bos); - list_splice(&bos, &node->bos); - list_add(&bo->mn_list, &node->bos); + bo->notifier.ops = &amdgpu_mn_gfx_ops; - interval_tree_insert(&node->it, &amn->objects); - - up_write(&amn->lock); - - return 0; + return mmu_interval_notifier_insert(&bo->notifier, addr, + amdgpu_bo_size(bo), current->mm); } /** - * amdgpu_mn_unregister - unregister a BO for HMM mirror updates + * amdgpu_mn_unregister - unregister a BO for notifier updates * * @bo: amdgpu buffer object * - * Remove any registration of HMM mirror updates from the buffer object. + * Remove any registration of mmu notifier updates from the buffer object. */ void amdgpu_mn_unregister(struct amdgpu_bo *bo) { - struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev); - struct amdgpu_mn *amn; - struct list_head *head; - - mutex_lock(&adev->mn_lock); - - amn = bo->mn; - if (amn == NULL) { - mutex_unlock(&adev->mn_lock); + if (!bo->notifier.mm) return; - } - - down_write(&amn->lock); - - /* save the next list entry for later */ - head = bo->mn_list.next; - - bo->mn = NULL; - list_del_init(&bo->mn_list); - - if (list_empty(head)) { - struct amdgpu_mn_node *node; - - node = container_of(head, struct amdgpu_mn_node, bos); - interval_tree_remove(&node->it, &amn->objects); - kfree(node); - } - - up_write(&amn->lock); - mutex_unlock(&adev->mn_lock); + mmu_interval_notifier_remove(&bo->notifier); + bo->notifier.mm = NULL; } /* flags used by HMM internal, not related to CPU/GPU PTE flags */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.h index b8ed68943625..d73ab2947b22 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.h @@ -39,12 +39,10 @@ enum amdgpu_mn_type { * struct amdgpu_mn * * @adev: amdgpu device pointer - * @mm: process address space * @type: type of MMU notifier * @work: destruction work item * @node: hash table node to find structure by adev and mn * @lock: rw semaphore protecting the notifier nodes - * @objects: interval tree containing amdgpu_mn_nodes * @mirror: HMM mirror function support * * Data for each amdgpu device and process address space. 
@@ -52,7 +50,6 @@ enum amdgpu_mn_type { struct amdgpu_mn { /* constant after initialisation */ struct amdgpu_device *adev; - struct mm_struct *mm; enum amdgpu_mn_type type; /* only used on destruction */ @@ -63,7 +60,6 @@ struct amdgpu_mn { /* objects protected by lock */ struct rw_semaphore lock; - struct rb_root_cached objects; #ifdef CONFIG_HMM_MIRROR /* HMM mirror */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h index 658f4c9779b7..2792c5c70fd1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h @@ -30,6 +30,9 @@ #include <drm/amdgpu_drm.h> #include "amdgpu.h" +#ifdef CONFIG_MMU_NOTIFIER +#include <linux/mmu_notifier.h> +#endif #define AMDGPU_BO_INVALID_OFFSET LONG_MAX #define AMDGPU_BO_MAX_PLACEMENTS 3 @@ -100,10 +103,12 @@ struct amdgpu_bo { struct ttm_bo_kmap_obj dma_buf_vmap; struct amdgpu_mn *mn; - union { - struct list_head mn_list; - struct list_head shadow_list; - }; + +#ifdef CONFIG_MMU_NOTIFIER + struct mmu_interval_notifier notifier; +#endif + + struct list_head shadow_list; struct kgd_mem *kfd_bo; }; -- cgit v1.2.3
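
The conversion in the last patch above leaves the sequence-count handshake between the new invalidate callbacks and the page-population path implicit. The sketch below shows that general pattern in isolation. It is a minimal, hedged illustration, not amdgpu code: it assumes the mmu_interval_read_begin()/mmu_interval_read_retry()/mmu_interval_set_seq() helpers of the mmu_interval_notifier API, and the struct my_userptr, driver_lock mutex and populate_pages() stub are invented names used only for illustration.

#include <linux/kernel.h>
#include <linux/mutex.h>
#include <linux/mmu_notifier.h>

/* Illustrative only: "my_userptr", "populate_pages" and "driver_lock"
 * are not amdgpu symbols. */
struct my_userptr {
	/* Registered earlier with mmu_interval_notifier_insert(), as in
	 * amdgpu_mn_register() in the patch above. */
	struct mmu_interval_notifier notifier;
	struct mutex driver_lock;	/* same role as adev->notifier_lock */
	/* ... page array, DMA mappings, etc. ... */
};

/* Placeholder for the real page population step (e.g. hmm_range_fault()). */
static int populate_pages(struct my_userptr *up)
{
	return 0;
}

/* Invalidate side: bump the sequence under the driver lock and stop GPU
 * access, mirroring the shape of amdgpu_mn_invalidate_gfx()/_hsa(). */
static bool my_userptr_invalidate(struct mmu_interval_notifier *mni,
				  const struct mmu_notifier_range *range,
				  unsigned long cur_seq)
{
	struct my_userptr *up = container_of(mni, struct my_userptr, notifier);

	if (!mmu_notifier_range_blockable(range))
		return false;

	mutex_lock(&up->driver_lock);
	mmu_interval_set_seq(mni, cur_seq);
	/* ... wait for or block GPU access to the affected pages here ... */
	mutex_unlock(&up->driver_lock);
	return true;
}

static const struct mmu_interval_notifier_ops my_userptr_ops = {
	.invalidate = my_userptr_invalidate,
};

/* Read side: populate pages, then confirm no invalidation raced with us. */
static int my_userptr_populate(struct my_userptr *up)
{
	unsigned long seq;
	int r;

retry:
	seq = mmu_interval_read_begin(&up->notifier);

	r = populate_pages(up);
	if (r)
		return r;

	mutex_lock(&up->driver_lock);
	if (mmu_interval_read_retry(&up->notifier, seq)) {
		/* An invalidation ran in between; the pages may be stale. */
		mutex_unlock(&up->driver_lock);
		goto retry;
	}
	/* Pages are known valid here; hand them to the GPU before unlocking. */
	mutex_unlock(&up->driver_lock);
	return 0;
}

The driver lock in this sketch is what makes the check reliable: the invalidate callback only advances the sequence while holding it, so a population run that raced with an invalidation is guaranteed to observe mmu_interval_read_retry() returning true and to start over, which is the same serialization job adev->notifier_lock performs in the patch above.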