diff options
author | Dennis Li <Dennis.Li@amd.com> | 2020-07-08 10:07:13 +0300 |
---|---|---|
committer | Alex Deucher <alexander.deucher@amd.com> | 2020-07-27 23:21:37 +0300 |
commit | df9c8d1aa278c435c30a69b8f2418b4a52fcb929 (patch) | |
tree | bb486ddef62e0179b0b97b0ba93a5fa9fcb0a207 /drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | |
parent | c5079f35c0ec315c944398e687821c1ceb6c6d4f (diff) | |
download | linux-df9c8d1aa278c435c30a69b8f2418b4a52fcb929.tar.xz |
drm/amdgpu: fix system hang issue during GPU reset
when GPU hang, driver has multi-paths to enter amdgpu_device_gpu_recover,
the atomic adev->in_gpu_reset and hive->in_reset are used to avoid
re-entering GPU recovery.
During GPU reset and resume, it is unsafe that other threads access GPU,
which maybe cause GPU reset failed. Therefore the new rw_semaphore
adev->reset_sem is introduced, which protect GPU from being accessed by
external threads during recovery.
v2:
1. add rwlock for some ioctls, debugfs and file-close function.
2. change to use dqm->is_resetting and dqm_lock for protection in kfd
driver.
3. remove try_lock and change adev->in_gpu_reset as atomic, to avoid
re-enter GPU recovery for the same GPU hang.
v3:
1. change back to use adev->reset_sem to protect kfd callback
functions, because dqm_lock couldn't protect all codes, for example:
free_mqd must be called outside of dqm_lock;
[ 1230.176199] Hardware name: Supermicro SYS-7049GP-TRT/X11DPG-QT, BIOS 3.1 05/23/2019
[ 1230.177221] Call Trace:
[ 1230.178249] dump_stack+0x98/0xd5
[ 1230.179443] amdgpu_virt_kiq_reg_write_reg_wait+0x181/0x190 [amdgpu]
[ 1230.180673] gmc_v9_0_flush_gpu_tlb+0xcc/0x310 [amdgpu]
[ 1230.181882] amdgpu_gart_unbind+0xa9/0xe0 [amdgpu]
[ 1230.183098] amdgpu_ttm_backend_unbind+0x46/0x180 [amdgpu]
[ 1230.184239] ? ttm_bo_put+0x171/0x5f0 [ttm]
[ 1230.185394] ttm_tt_unbind+0x21/0x40 [ttm]
[ 1230.186558] ttm_tt_destroy.part.12+0x12/0x60 [ttm]
[ 1230.187707] ttm_tt_destroy+0x13/0x20 [ttm]
[ 1230.188832] ttm_bo_cleanup_memtype_use+0x36/0x80 [ttm]
[ 1230.189979] ttm_bo_put+0x1be/0x5f0 [ttm]
[ 1230.191230] amdgpu_bo_unref+0x1e/0x30 [amdgpu]
[ 1230.192522] amdgpu_amdkfd_free_gtt_mem+0xaf/0x140 [amdgpu]
[ 1230.193833] free_mqd+0x25/0x40 [amdgpu]
[ 1230.195143] destroy_queue_cpsch+0x1a7/0x270 [amdgpu]
[ 1230.196475] pqm_destroy_queue+0x105/0x260 [amdgpu]
[ 1230.197819] kfd_ioctl_destroy_queue+0x37/0x70 [amdgpu]
[ 1230.199154] kfd_ioctl+0x277/0x500 [amdgpu]
[ 1230.200458] ? kfd_ioctl_get_clock_counters+0x60/0x60 [amdgpu]
[ 1230.201656] ? tomoyo_file_ioctl+0x19/0x20
[ 1230.202831] ksys_ioctl+0x98/0xb0
[ 1230.204004] __x64_sys_ioctl+0x1a/0x20
[ 1230.205174] do_syscall_64+0x5f/0x250
[ 1230.206339] entry_SYSCALL_64_after_hwframe+0x49/0xbe
2. remove try_lock and introduce atomic hive->in_reset, to avoid
re-enter GPU recovery.
v4:
1. remove an unnecessary whitespace change in kfd_chardev.c
2. remove comment codes in amdgpu_device.c
3. add more detailed comment in commit message
4. define a wrap function amdgpu_in_reset
v5:
1. Fix some style issues.
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Suggested-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
Suggested-by: Christian König <christian.koenig@amd.com>
Suggested-by: Felix Kuehling <Felix.Kuehling@amd.com>
Suggested-by: Lijo Lazar <Lijo.Lazar@amd.com>
Suggested-by: Luben Tukov <luben.tuikov@amd.com>
Signed-off-by: Dennis Li <Dennis.Li@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 40 |
1 files changed, 37 insertions, 3 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index 1b865fed74ca..a0ea663ecdbc 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c @@ -244,11 +244,14 @@ int amdgpu_amdkfd_alloc_gtt_mem(struct kgd_dev *kgd, size_t size, if (cp_mqd_gfx9) bp.flags |= AMDGPU_GEM_CREATE_CP_MQD_GFX9; + if (!down_read_trylock(&adev->reset_sem)) + return -EIO; + r = amdgpu_bo_create(adev, &bp, &bo); if (r) { dev_err(adev->dev, "failed to allocate BO for amdkfd (%d)\n", r); - return r; + goto err; } /* map the buffer */ @@ -283,6 +286,7 @@ int amdgpu_amdkfd_alloc_gtt_mem(struct kgd_dev *kgd, size_t size, amdgpu_bo_unreserve(bo); + up_read(&adev->reset_sem); return 0; allocate_mem_kmap_bo_failed: @@ -291,19 +295,25 @@ allocate_mem_pin_bo_failed: amdgpu_bo_unreserve(bo); allocate_mem_reserve_bo_failed: amdgpu_bo_unref(&bo); - +err: + up_read(&adev->reset_sem); return r; } void amdgpu_amdkfd_free_gtt_mem(struct kgd_dev *kgd, void *mem_obj) { + struct amdgpu_device *adev = (struct amdgpu_device *)kgd; struct amdgpu_bo *bo = (struct amdgpu_bo *) mem_obj; + down_read(&adev->reset_sem); + amdgpu_bo_reserve(bo, true); amdgpu_bo_kunmap(bo); amdgpu_bo_unpin(bo); amdgpu_bo_unreserve(bo); amdgpu_bo_unref(&(bo)); + + up_read(&adev->reset_sem); } int amdgpu_amdkfd_alloc_gws(struct kgd_dev *kgd, size_t size, @@ -335,9 +345,14 @@ int amdgpu_amdkfd_alloc_gws(struct kgd_dev *kgd, size_t size, void amdgpu_amdkfd_free_gws(struct kgd_dev *kgd, void *mem_obj) { + struct amdgpu_device *adev = (struct amdgpu_device *)kgd; struct amdgpu_bo *bo = (struct amdgpu_bo *)mem_obj; + down_read(&adev->reset_sem); + amdgpu_bo_unref(&bo); + + up_read(&adev->reset_sem); } uint32_t amdgpu_amdkfd_get_fw_version(struct kgd_dev *kgd, @@ -611,12 +626,19 @@ int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum kgd_engine_type engine, /* This works for NO_HWS. TODO: need to handle without knowing VMID */ job->vmid = vmid; + if (!down_read_trylock(&adev->reset_sem)) { + ret = -EIO; + goto err_ib_sched; + } + ret = amdgpu_ib_schedule(ring, 1, ib, job, &f); if (ret) { DRM_ERROR("amdgpu: failed to schedule IB.\n"); goto err_ib_sched; } + up_read(&adev->reset_sem); + ret = dma_fence_wait(f, false); err_ib_sched: @@ -647,6 +669,9 @@ int amdgpu_amdkfd_flush_gpu_tlb_vmid(struct kgd_dev *kgd, uint16_t vmid) { struct amdgpu_device *adev = (struct amdgpu_device *)kgd; + if (!down_read_trylock(&adev->reset_sem)) + return -EIO; + if (adev->family == AMDGPU_FAMILY_AI) { int i; @@ -656,6 +681,8 @@ int amdgpu_amdkfd_flush_gpu_tlb_vmid(struct kgd_dev *kgd, uint16_t vmid) amdgpu_gmc_flush_gpu_tlb(adev, vmid, AMDGPU_GFXHUB_0, 0); } + up_read(&adev->reset_sem); + return 0; } @@ -664,11 +691,18 @@ int amdgpu_amdkfd_flush_gpu_tlb_pasid(struct kgd_dev *kgd, uint16_t pasid) struct amdgpu_device *adev = (struct amdgpu_device *)kgd; const uint32_t flush_type = 0; bool all_hub = false; + int ret = -EIO; if (adev->family == AMDGPU_FAMILY_AI) all_hub = true; - return amdgpu_gmc_flush_gpu_tlb_pasid(adev, pasid, flush_type, all_hub); + if (down_read_trylock(&adev->reset_sem)) { + ret = amdgpu_gmc_flush_gpu_tlb_pasid(adev, + pasid, flush_type, all_hub); + up_read(&adev->reset_sem); + } + + return ret; } bool amdgpu_amdkfd_have_atomics_support(struct kgd_dev *kgd) |