summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
diff options
context:
space:
mode:
authorChristian König <christian.koenig@amd.com>2025-08-27 10:28:40 +0300
committerAlex Deucher <alexander.deucher@amd.com>2025-09-17 00:47:06 +0300
commit39203f5e6dcf2b0529ae526004e7a9c8ef453ae2 (patch)
tree5b6716a09aefeb0a2ae603692d742b61e52fc5ad /drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
parentd7ddcf921e7d0d8ebe82e89635bc9dc26ba9540d (diff)
downloadlinux-39203f5e6dcf2b0529ae526004e7a9c8ef453ae2.tar.xz
drm/amdgpu: fix userq VM validation v4
That was actually complete nonsense and not validating the BOs at all. The code just cleared all VM areas were it couldn't grab the lock for a BO. Try to fix this. Only compile tested at the moment. v2: fix fence slot reservation as well as pointed out by Sunil. also validate PDs, PTs, per VM BOs and update PDEs v3: grab the status_lock while working with the done list. v4: rename functions, add some comments, fix waiting for updates to complete. v4: rename amdgpu_vm_lock_done_list(), add some more comments Signed-off-by: Christian König <christian.koenig@amd.com> Reviewed-by: Sunil Khatri <sunil.khatri@amd.com> Reviewed-by: Alex Deucher <alexander.deucher@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c148
1 files changed, 73 insertions, 75 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
index d2e59aecb3eb..3dfe7f5f154b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
@@ -706,108 +706,106 @@ amdgpu_userq_restore_all(struct amdgpu_userq_mgr *uq_mgr)
return ret;
}
+static int amdgpu_userq_validate_vm(void *param, struct amdgpu_bo *bo)
+{
+ struct ttm_operation_ctx ctx = { false, false };
+
+ amdgpu_bo_placement_from_domain(bo, bo->allowed_domains);
+ return ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
+}
+
+/* Handle all BOs on the invalidated list, validate them and update the PTs */
static int
-amdgpu_userq_validate_vm_bo(void *_unused, struct amdgpu_bo *bo)
+amdgpu_userq_bo_validate(struct amdgpu_device *adev, struct drm_exec *exec,
+ struct amdgpu_vm *vm)
{
struct ttm_operation_ctx ctx = { false, false };
+ struct amdgpu_bo_va *bo_va;
+ struct amdgpu_bo *bo;
int ret;
- amdgpu_bo_placement_from_domain(bo, bo->allowed_domains);
+ spin_lock(&vm->status_lock);
+ while (!list_empty(&vm->invalidated)) {
+ bo_va = list_first_entry(&vm->invalidated,
+ struct amdgpu_bo_va,
+ base.vm_status);
+ spin_unlock(&vm->status_lock);
- ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
- if (ret)
- DRM_ERROR("Fail to validate\n");
+ bo = bo_va->base.bo;
+ ret = drm_exec_prepare_obj(exec, &bo->tbo.base, 2);
+ if (unlikely(ret))
+ return ret;
- return ret;
+ amdgpu_bo_placement_from_domain(bo, bo->allowed_domains);
+ ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
+ if (ret)
+ return ret;
+
+ /* This moves the bo_va to the done list */
+ ret = amdgpu_vm_bo_update(adev, bo_va, false);
+ if (ret)
+ return ret;
+
+ spin_lock(&vm->status_lock);
+ }
+ spin_unlock(&vm->status_lock);
+
+ return 0;
}
+/* Make sure the whole VM is ready to be used */
static int
-amdgpu_userq_validate_bos(struct amdgpu_userq_mgr *uq_mgr)
+amdgpu_userq_vm_validate(struct amdgpu_userq_mgr *uq_mgr)
{
struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr);
- struct amdgpu_vm *vm = &fpriv->vm;
struct amdgpu_device *adev = uq_mgr->adev;
+ struct amdgpu_vm *vm = &fpriv->vm;
struct amdgpu_bo_va *bo_va;
- struct ww_acquire_ctx *ticket;
struct drm_exec exec;
- struct amdgpu_bo *bo;
- struct dma_resv *resv;
- bool clear, unlock;
- int ret = 0;
+ int ret;
drm_exec_init(&exec, DRM_EXEC_IGNORE_DUPLICATES, 0);
drm_exec_until_all_locked(&exec) {
- ret = amdgpu_vm_lock_pd(vm, &exec, 2);
+ ret = amdgpu_vm_lock_pd(vm, &exec, 1);
drm_exec_retry_on_contention(&exec);
- if (unlikely(ret)) {
- drm_file_err(uq_mgr->file, "Failed to lock PD\n");
+ if (unlikely(ret))
goto unlock_all;
- }
-
- /* Lock the done list */
- list_for_each_entry(bo_va, &vm->done, base.vm_status) {
- bo = bo_va->base.bo;
- if (!bo)
- continue;
- ret = drm_exec_lock_obj(&exec, &bo->tbo.base);
- drm_exec_retry_on_contention(&exec);
- if (unlikely(ret))
- goto unlock_all;
- }
- }
-
- spin_lock(&vm->status_lock);
- while (!list_empty(&vm->moved)) {
- bo_va = list_first_entry(&vm->moved, struct amdgpu_bo_va,
- base.vm_status);
- spin_unlock(&vm->status_lock);
-
- /* Per VM BOs never need to bo cleared in the page tables */
- ret = amdgpu_vm_bo_update(adev, bo_va, false);
- if (ret)
+ ret = amdgpu_vm_lock_done_list(vm, &exec, 1);
+ drm_exec_retry_on_contention(&exec);
+ if (unlikely(ret))
goto unlock_all;
- spin_lock(&vm->status_lock);
- }
-
- ticket = &exec.ticket;
- while (!list_empty(&vm->invalidated)) {
- bo_va = list_first_entry(&vm->invalidated, struct amdgpu_bo_va,
- base.vm_status);
- resv = bo_va->base.bo->tbo.base.resv;
- spin_unlock(&vm->status_lock);
- bo = bo_va->base.bo;
- ret = amdgpu_userq_validate_vm_bo(NULL, bo);
- if (ret) {
- drm_file_err(uq_mgr->file, "Failed to validate BO\n");
+ /* This validates PDs, PTs and per VM BOs */
+ ret = amdgpu_vm_validate(adev, vm, NULL,
+ amdgpu_userq_validate_vm,
+ NULL);
+ if (unlikely(ret))
goto unlock_all;
- }
- /* Try to reserve the BO to avoid clearing its ptes */
- if (!adev->debug_vm && dma_resv_trylock(resv)) {
- clear = false;
- unlock = true;
- /* The caller is already holding the reservation lock */
- } else if (dma_resv_locking_ctx(resv) == ticket) {
- clear = false;
- unlock = false;
- /* Somebody else is using the BO right now */
- } else {
- clear = true;
- unlock = false;
- }
+ /* This locks and validates the remaining evicted BOs */
+ ret = amdgpu_userq_bo_validate(adev, &exec, vm);
+ drm_exec_retry_on_contention(&exec);
+ if (unlikely(ret))
+ goto unlock_all;
+ }
- ret = amdgpu_vm_bo_update(adev, bo_va, clear);
+ ret = amdgpu_vm_handle_moved(adev, vm, NULL);
+ if (ret)
+ goto unlock_all;
- if (unlock)
- dma_resv_unlock(resv);
- if (ret)
- goto unlock_all;
+ ret = amdgpu_vm_update_pdes(adev, vm, false);
+ if (ret)
+ goto unlock_all;
- spin_lock(&vm->status_lock);
- }
- spin_unlock(&vm->status_lock);
+ /*
+ * We need to wait for all VM updates to finish before restarting the
+ * queues. Using the done list like that is now ok since everything is
+ * locked in place.
+ */
+ list_for_each_entry(bo_va, &vm->done, base.vm_status)
+ dma_fence_wait(bo_va->last_pt_update, false);
+ dma_fence_wait(vm->last_update, false);
ret = amdgpu_eviction_fence_replace_fence(&fpriv->evf_mgr, &exec);
if (ret)
@@ -828,7 +826,7 @@ static void amdgpu_userq_restore_worker(struct work_struct *work)
mutex_lock(&uq_mgr->userq_mutex);
- ret = amdgpu_userq_validate_bos(uq_mgr);
+ ret = amdgpu_userq_vm_validate(uq_mgr);
if (ret) {
drm_file_err(uq_mgr->file, "Failed to validate BOs to restore\n");
goto unlock;