diff options
| author | Christian König <christian.koenig@amd.com> | 2025-08-27 12:45:45 +0300 |
|---|---|---|
| committer | Alex Deucher <alexander.deucher@amd.com> | 2025-09-18 23:59:14 +0300 |
| commit | 59e4405e9ee2b318342d252422a82dd863b89ef4 (patch) | |
| tree | 674721daa682adb6c53df107cb326d120cc46a46 /drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | |
| parent | 0aa09d8a6cab0f7d284e61eff28ac3be4d324c20 (diff) | |
| download | linux-59e4405e9ee2b318342d252422a82dd863b89ef4.tar.xz | |
drm/amdgpu: revert to old status lock handling v3
It turned out that protecting the status of each bo_va with a
spinlock was just hiding problems instead of solving them.
Revert the whole approach, add a separate stats_lock and lockdep
assertions that the correct reservation lock is held all over the place.
This not only allows for better checks if a state transition is properly
protected by a lock, but also switching back to using list macros to
iterate over the state of lists protected by the dma_resv lock of the
root PD.
v2: re-add missing check
v3: split into two patches
Signed-off-by: Christian König <christian.koenig@amd.com>
Acked-by: Sunil Khatri <sunil.khatri@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c')
| -rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 168 |
1 files changed, 80 insertions, 88 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 1ccd348bf48b..1f8b43253eea 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -128,6 +128,17 @@ struct amdgpu_vm_tlb_seq_struct { }; /** + * amdgpu_vm_assert_locked - check if VM is correctly locked + * @vm: the VM which schould be tested + * + * Asserts that the VM root PD is locked. + */ +static void amdgpu_vm_assert_locked(struct amdgpu_vm *vm) +{ + dma_resv_assert_held(vm->root.bo->tbo.base.resv); +} + +/** * amdgpu_vm_set_pasid - manage pasid and vm ptr mapping * * @adev: amdgpu_device pointer @@ -143,6 +154,8 @@ int amdgpu_vm_set_pasid(struct amdgpu_device *adev, struct amdgpu_vm *vm, { int r; + amdgpu_vm_assert_locked(vm); + if (vm->pasid == pasid) return 0; @@ -181,12 +194,11 @@ static void amdgpu_vm_bo_evicted(struct amdgpu_vm_bo_base *vm_bo) struct amdgpu_bo *bo = vm_bo->bo; vm_bo->moved = true; - spin_lock(&vm_bo->vm->status_lock); + amdgpu_vm_assert_locked(vm); if (bo->tbo.type == ttm_bo_type_kernel) list_move(&vm_bo->vm_status, &vm->evicted); else list_move_tail(&vm_bo->vm_status, &vm->evicted); - spin_unlock(&vm_bo->vm->status_lock); } /** * amdgpu_vm_bo_moved - vm_bo is moved @@ -198,9 +210,8 @@ static void amdgpu_vm_bo_evicted(struct amdgpu_vm_bo_base *vm_bo) */ static void amdgpu_vm_bo_moved(struct amdgpu_vm_bo_base *vm_bo) { - spin_lock(&vm_bo->vm->status_lock); + amdgpu_vm_assert_locked(vm_bo->vm); list_move(&vm_bo->vm_status, &vm_bo->vm->moved); - spin_unlock(&vm_bo->vm->status_lock); } /** @@ -213,9 +224,8 @@ static void amdgpu_vm_bo_moved(struct amdgpu_vm_bo_base *vm_bo) */ static void amdgpu_vm_bo_idle(struct amdgpu_vm_bo_base *vm_bo) { - spin_lock(&vm_bo->vm->status_lock); + amdgpu_vm_assert_locked(vm_bo->vm); list_move(&vm_bo->vm_status, &vm_bo->vm->idle); - spin_unlock(&vm_bo->vm->status_lock); vm_bo->moved = false; } @@ -229,9 +239,9 @@ static void amdgpu_vm_bo_idle(struct amdgpu_vm_bo_base *vm_bo) */ static void amdgpu_vm_bo_invalidated(struct amdgpu_vm_bo_base *vm_bo) { - spin_lock(&vm_bo->vm->status_lock); + spin_lock(&vm_bo->vm->invalidated_lock); list_move(&vm_bo->vm_status, &vm_bo->vm->invalidated); - spin_unlock(&vm_bo->vm->status_lock); + spin_unlock(&vm_bo->vm->invalidated_lock); } /** @@ -244,10 +254,9 @@ static void amdgpu_vm_bo_invalidated(struct amdgpu_vm_bo_base *vm_bo) */ static void amdgpu_vm_bo_evicted_user(struct amdgpu_vm_bo_base *vm_bo) { + amdgpu_vm_assert_locked(vm_bo->vm); vm_bo->moved = true; - spin_lock(&vm_bo->vm->status_lock); list_move(&vm_bo->vm_status, &vm_bo->vm->evicted_user); - spin_unlock(&vm_bo->vm->status_lock); } /** @@ -260,13 +269,11 @@ static void amdgpu_vm_bo_evicted_user(struct amdgpu_vm_bo_base *vm_bo) */ static void amdgpu_vm_bo_relocated(struct amdgpu_vm_bo_base *vm_bo) { - if (vm_bo->bo->parent) { - spin_lock(&vm_bo->vm->status_lock); + amdgpu_vm_assert_locked(vm_bo->vm); + if (vm_bo->bo->parent) list_move(&vm_bo->vm_status, &vm_bo->vm->relocated); - spin_unlock(&vm_bo->vm->status_lock); - } else { + else amdgpu_vm_bo_idle(vm_bo); - } } /** @@ -279,9 +286,8 @@ static void amdgpu_vm_bo_relocated(struct amdgpu_vm_bo_base *vm_bo) */ static void amdgpu_vm_bo_done(struct amdgpu_vm_bo_base *vm_bo) { - spin_lock(&vm_bo->vm->status_lock); + amdgpu_vm_assert_locked(vm_bo->vm); list_move(&vm_bo->vm_status, &vm_bo->vm->done); - spin_unlock(&vm_bo->vm->status_lock); } /** @@ -295,10 +301,13 @@ static void amdgpu_vm_bo_reset_state_machine(struct amdgpu_vm *vm) { struct amdgpu_vm_bo_base *vm_bo, *tmp; - spin_lock(&vm->status_lock); + spin_lock(&vm->invalidated_lock); list_splice_init(&vm->done, &vm->invalidated); list_for_each_entry(vm_bo, &vm->invalidated, vm_status) vm_bo->moved = true; + spin_unlock(&vm->invalidated_lock); + + amdgpu_vm_assert_locked(vm_bo->vm); list_for_each_entry_safe(vm_bo, tmp, &vm->idle, vm_status) { struct amdgpu_bo *bo = vm_bo->bo; @@ -308,14 +317,13 @@ static void amdgpu_vm_bo_reset_state_machine(struct amdgpu_vm *vm) else if (bo->parent) list_move(&vm_bo->vm_status, &vm_bo->vm->relocated); } - spin_unlock(&vm->status_lock); } /** * amdgpu_vm_update_shared - helper to update shared memory stat * @base: base structure for tracking BO usage in a VM * - * Takes the vm status_lock and updates the shared memory stat. If the basic + * Takes the vm stats_lock and updates the shared memory stat. If the basic * stat changed (e.g. buffer was moved) amdgpu_vm_update_stats need to be called * as well. */ @@ -327,7 +335,8 @@ static void amdgpu_vm_update_shared(struct amdgpu_vm_bo_base *base) uint32_t bo_memtype = amdgpu_bo_mem_stats_placement(bo); bool shared; - spin_lock(&vm->status_lock); + dma_resv_assert_held(bo->tbo.base.resv); + spin_lock(&vm->stats_lock); shared = drm_gem_object_is_shared_for_memory_stats(&bo->tbo.base); if (base->shared != shared) { base->shared = shared; @@ -339,7 +348,7 @@ static void amdgpu_vm_update_shared(struct amdgpu_vm_bo_base *base) vm->stats[bo_memtype].drm.private += size; } } - spin_unlock(&vm->status_lock); + spin_unlock(&vm->stats_lock); } /** @@ -364,11 +373,11 @@ void amdgpu_vm_bo_update_shared(struct amdgpu_bo *bo) * be bo->tbo.resource * @sign: if we should add (+1) or subtract (-1) from the stat * - * Caller need to have the vm status_lock held. Useful for when multiple update + * Caller need to have the vm stats_lock held. Useful for when multiple update * need to happen at the same time. */ static void amdgpu_vm_update_stats_locked(struct amdgpu_vm_bo_base *base, - struct ttm_resource *res, int sign) + struct ttm_resource *res, int sign) { struct amdgpu_vm *vm = base->vm; struct amdgpu_bo *bo = base->bo; @@ -392,7 +401,8 @@ static void amdgpu_vm_update_stats_locked(struct amdgpu_vm_bo_base *base, */ if (bo->flags & AMDGPU_GEM_CREATE_DISCARDABLE) vm->stats[res_memtype].drm.purgeable += size; - if (!(bo->preferred_domains & amdgpu_mem_type_to_domain(res_memtype))) + if (!(bo->preferred_domains & + amdgpu_mem_type_to_domain(res_memtype))) vm->stats[bo_memtype].evicted += size; } } @@ -411,9 +421,9 @@ void amdgpu_vm_update_stats(struct amdgpu_vm_bo_base *base, { struct amdgpu_vm *vm = base->vm; - spin_lock(&vm->status_lock); + spin_lock(&vm->stats_lock); amdgpu_vm_update_stats_locked(base, res, sign); - spin_unlock(&vm->status_lock); + spin_unlock(&vm->stats_lock); } /** @@ -439,10 +449,10 @@ void amdgpu_vm_bo_base_init(struct amdgpu_vm_bo_base *base, base->next = bo->vm_bo; bo->vm_bo = base; - spin_lock(&vm->status_lock); + spin_lock(&vm->stats_lock); base->shared = drm_gem_object_is_shared_for_memory_stats(&bo->tbo.base); amdgpu_vm_update_stats_locked(base, bo->tbo.resource, +1); - spin_unlock(&vm->status_lock); + spin_unlock(&vm->stats_lock); if (!amdgpu_vm_is_bo_always_valid(vm, bo)) return; @@ -501,10 +511,10 @@ int amdgpu_vm_lock_done_list(struct amdgpu_vm *vm, struct drm_exec *exec, int ret; /* We can only trust prev->next while holding the lock */ - spin_lock(&vm->status_lock); + spin_lock(&vm->invalidated_lock); while (!list_is_head(prev->next, &vm->done)) { bo_va = list_entry(prev->next, typeof(*bo_va), base.vm_status); - spin_unlock(&vm->status_lock); + spin_unlock(&vm->invalidated_lock); bo = bo_va->base.bo; if (bo) { @@ -512,10 +522,10 @@ int amdgpu_vm_lock_done_list(struct amdgpu_vm *vm, struct drm_exec *exec, if (unlikely(ret)) return ret; } - spin_lock(&vm->status_lock); + spin_lock(&vm->invalidated_lock); prev = prev->next; } - spin_unlock(&vm->status_lock); + spin_unlock(&vm->invalidated_lock); return 0; } @@ -611,7 +621,7 @@ int amdgpu_vm_validate(struct amdgpu_device *adev, struct amdgpu_vm *vm, void *param) { uint64_t new_vm_generation = amdgpu_vm_generation(adev, vm); - struct amdgpu_vm_bo_base *bo_base; + struct amdgpu_vm_bo_base *bo_base, *tmp; struct amdgpu_bo *bo; int r; @@ -624,13 +634,7 @@ int amdgpu_vm_validate(struct amdgpu_device *adev, struct amdgpu_vm *vm, return r; } - spin_lock(&vm->status_lock); - while (!list_empty(&vm->evicted)) { - bo_base = list_first_entry(&vm->evicted, - struct amdgpu_vm_bo_base, - vm_status); - spin_unlock(&vm->status_lock); - + list_for_each_entry_safe(bo_base, tmp, &vm->evicted, vm_status) { bo = bo_base->bo; r = validate(param, bo); @@ -643,26 +647,21 @@ int amdgpu_vm_validate(struct amdgpu_device *adev, struct amdgpu_vm *vm, vm->update_funcs->map_table(to_amdgpu_bo_vm(bo)); amdgpu_vm_bo_relocated(bo_base); } - spin_lock(&vm->status_lock); } - while (ticket && !list_empty(&vm->evicted_user)) { - bo_base = list_first_entry(&vm->evicted_user, - struct amdgpu_vm_bo_base, - vm_status); - spin_unlock(&vm->status_lock); - bo = bo_base->bo; - dma_resv_assert_held(bo->tbo.base.resv); - - r = validate(param, bo); - if (r) - return r; + if (ticket) { + list_for_each_entry_safe(bo_base, tmp, &vm->evicted_user, + vm_status) { + bo = bo_base->bo; + dma_resv_assert_held(bo->tbo.base.resv); - amdgpu_vm_bo_invalidated(bo_base); + r = validate(param, bo); + if (r) + return r; - spin_lock(&vm->status_lock); + amdgpu_vm_bo_invalidated(bo_base); + } } - spin_unlock(&vm->status_lock); amdgpu_vm_eviction_lock(vm); vm->evicting = false; @@ -685,13 +684,13 @@ bool amdgpu_vm_ready(struct amdgpu_vm *vm) { bool ret; + amdgpu_vm_assert_locked(vm); + amdgpu_vm_eviction_lock(vm); ret = !vm->evicting; amdgpu_vm_eviction_unlock(vm); - spin_lock(&vm->status_lock); ret &= list_empty(&vm->evicted); - spin_unlock(&vm->status_lock); spin_lock(&vm->immediate.lock); ret &= !vm->immediate.stopped; @@ -982,16 +981,13 @@ int amdgpu_vm_update_pdes(struct amdgpu_device *adev, struct amdgpu_vm *vm, bool immediate) { struct amdgpu_vm_update_params params; - struct amdgpu_vm_bo_base *entry; + struct amdgpu_vm_bo_base *entry, *tmp; bool flush_tlb_needed = false; - LIST_HEAD(relocated); int r, idx; - spin_lock(&vm->status_lock); - list_splice_init(&vm->relocated, &relocated); - spin_unlock(&vm->status_lock); + amdgpu_vm_assert_locked(vm); - if (list_empty(&relocated)) + if (list_empty(&vm->relocated)) return 0; if (!drm_dev_enter(adev_to_drm(adev), &idx)) @@ -1006,7 +1002,7 @@ int amdgpu_vm_update_pdes(struct amdgpu_device *adev, if (r) goto error; - list_for_each_entry(entry, &relocated, vm_status) { + list_for_each_entry(entry, &vm->relocated, vm_status) { /* vm_flush_needed after updating moved PDEs */ flush_tlb_needed |= entry->moved; @@ -1022,9 +1018,7 @@ int amdgpu_vm_update_pdes(struct amdgpu_device *adev, if (flush_tlb_needed) atomic64_inc(&vm->tlb_seq); - while (!list_empty(&relocated)) { - entry = list_first_entry(&relocated, struct amdgpu_vm_bo_base, - vm_status); + list_for_each_entry_safe(entry, tmp, &vm->relocated, vm_status) { amdgpu_vm_bo_idle(entry); } @@ -1250,9 +1244,9 @@ error_free: void amdgpu_vm_get_memory(struct amdgpu_vm *vm, struct amdgpu_mem_stats stats[__AMDGPU_PL_NUM]) { - spin_lock(&vm->status_lock); + spin_lock(&vm->stats_lock); memcpy(stats, vm->stats, sizeof(*stats) * __AMDGPU_PL_NUM); - spin_unlock(&vm->status_lock); + spin_unlock(&vm->stats_lock); } /** @@ -1619,29 +1613,24 @@ int amdgpu_vm_handle_moved(struct amdgpu_device *adev, struct amdgpu_vm *vm, struct ww_acquire_ctx *ticket) { - struct amdgpu_bo_va *bo_va; + struct amdgpu_bo_va *bo_va, *tmp; struct dma_resv *resv; bool clear, unlock; int r; - spin_lock(&vm->status_lock); - while (!list_empty(&vm->moved)) { - bo_va = list_first_entry(&vm->moved, struct amdgpu_bo_va, - base.vm_status); - spin_unlock(&vm->status_lock); - + list_for_each_entry_safe(bo_va, tmp, &vm->moved, base.vm_status) { /* Per VM BOs never need to bo cleared in the page tables */ r = amdgpu_vm_bo_update(adev, bo_va, false); if (r) return r; - spin_lock(&vm->status_lock); } + spin_lock(&vm->invalidated_lock); while (!list_empty(&vm->invalidated)) { bo_va = list_first_entry(&vm->invalidated, struct amdgpu_bo_va, base.vm_status); resv = bo_va->base.bo->tbo.base.resv; - spin_unlock(&vm->status_lock); + spin_unlock(&vm->invalidated_lock); /* Try to reserve the BO to avoid clearing its ptes */ if (!adev->debug_vm && dma_resv_trylock(resv)) { @@ -1673,9 +1662,9 @@ int amdgpu_vm_handle_moved(struct amdgpu_device *adev, bo_va->base.bo->tbo.resource->mem_type == TTM_PL_SYSTEM)) amdgpu_vm_bo_evicted_user(&bo_va->base); - spin_lock(&vm->status_lock); + spin_lock(&vm->invalidated_lock); } - spin_unlock(&vm->status_lock); + spin_unlock(&vm->invalidated_lock); return 0; } @@ -2204,9 +2193,9 @@ void amdgpu_vm_bo_del(struct amdgpu_device *adev, } } - spin_lock(&vm->status_lock); + spin_lock(&vm->invalidated_lock); list_del(&bo_va->base.vm_status); - spin_unlock(&vm->status_lock); + spin_unlock(&vm->invalidated_lock); list_for_each_entry_safe(mapping, next, &bo_va->valids, list) { list_del(&mapping->list); @@ -2314,10 +2303,10 @@ void amdgpu_vm_bo_move(struct amdgpu_bo *bo, struct ttm_resource *new_mem, for (bo_base = bo->vm_bo; bo_base; bo_base = bo_base->next) { struct amdgpu_vm *vm = bo_base->vm; - spin_lock(&vm->status_lock); + spin_lock(&vm->stats_lock); amdgpu_vm_update_stats_locked(bo_base, bo->tbo.resource, -1); amdgpu_vm_update_stats_locked(bo_base, new_mem, +1); - spin_unlock(&vm->status_lock); + spin_unlock(&vm->stats_lock); } amdgpu_vm_bo_invalidate(bo, evicted); @@ -2584,11 +2573,12 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm, INIT_LIST_HEAD(&vm->relocated); INIT_LIST_HEAD(&vm->moved); INIT_LIST_HEAD(&vm->idle); + spin_lock_init(&vm->invalidated_lock); INIT_LIST_HEAD(&vm->invalidated); - spin_lock_init(&vm->status_lock); INIT_LIST_HEAD(&vm->freed); INIT_LIST_HEAD(&vm->done); INIT_KFIFO(vm->faults); + spin_lock_init(&vm->stats_lock); r = amdgpu_vm_init_entities(adev, vm); if (r) @@ -3053,7 +3043,8 @@ void amdgpu_debugfs_vm_bo_info(struct amdgpu_vm *vm, struct seq_file *m) unsigned int total_done_objs = 0; unsigned int id = 0; - spin_lock(&vm->status_lock); + amdgpu_vm_assert_locked(vm); + seq_puts(m, "\tIdle BOs:\n"); list_for_each_entry_safe(bo_va, tmp, &vm->idle, base.vm_status) { if (!bo_va->base.bo) @@ -3091,11 +3082,13 @@ void amdgpu_debugfs_vm_bo_info(struct amdgpu_vm *vm, struct seq_file *m) id = 0; seq_puts(m, "\tInvalidated BOs:\n"); + spin_lock(&vm->invalidated_lock); list_for_each_entry_safe(bo_va, tmp, &vm->invalidated, base.vm_status) { if (!bo_va->base.bo) continue; total_invalidated += amdgpu_bo_print_info(id++, bo_va->base.bo, m); } + spin_unlock(&vm->invalidated_lock); total_invalidated_objs = id; id = 0; @@ -3105,7 +3098,6 @@ void amdgpu_debugfs_vm_bo_info(struct amdgpu_vm *vm, struct seq_file *m) continue; total_done += amdgpu_bo_print_info(id++, bo_va->base.bo, m); } - spin_unlock(&vm->status_lock); total_done_objs = id; seq_printf(m, "\tTotal idle size: %12lld\tobjs:\t%d\n", total_idle, |
