summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
diff options
context:
space:
mode:
authorChristian König <christian.koenig@amd.com>2025-08-27 12:45:45 +0300
committerAlex Deucher <alexander.deucher@amd.com>2025-09-18 23:59:14 +0300
commit59e4405e9ee2b318342d252422a82dd863b89ef4 (patch)
tree674721daa682adb6c53df107cb326d120cc46a46 /drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
parent0aa09d8a6cab0f7d284e61eff28ac3be4d324c20 (diff)
downloadlinux-59e4405e9ee2b318342d252422a82dd863b89ef4.tar.xz
drm/amdgpu: revert to old status lock handling v3
It turned out that protecting the status of each bo_va with a spinlock was just hiding problems instead of solving them. Revert the whole approach, add a separate stats_lock and lockdep assertions that the correct reservation lock is held all over the place. This not only allows for better checks if a state transition is properly protected by a lock, but also switching back to using list macros to iterate over the state of lists protected by the dma_resv lock of the root PD. v2: re-add missing check v3: split into two patches Signed-off-by: Christian König <christian.koenig@amd.com> Acked-by: Sunil Khatri <sunil.khatri@amd.com> Reviewed-by: Alex Deucher <alexander.deucher@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c168
1 files changed, 80 insertions, 88 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 1ccd348bf48b..1f8b43253eea 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -128,6 +128,17 @@ struct amdgpu_vm_tlb_seq_struct {
};
/**
+ * amdgpu_vm_assert_locked - check if VM is correctly locked
+ * @vm: the VM which schould be tested
+ *
+ * Asserts that the VM root PD is locked.
+ */
+static void amdgpu_vm_assert_locked(struct amdgpu_vm *vm)
+{
+ dma_resv_assert_held(vm->root.bo->tbo.base.resv);
+}
+
+/**
* amdgpu_vm_set_pasid - manage pasid and vm ptr mapping
*
* @adev: amdgpu_device pointer
@@ -143,6 +154,8 @@ int amdgpu_vm_set_pasid(struct amdgpu_device *adev, struct amdgpu_vm *vm,
{
int r;
+ amdgpu_vm_assert_locked(vm);
+
if (vm->pasid == pasid)
return 0;
@@ -181,12 +194,11 @@ static void amdgpu_vm_bo_evicted(struct amdgpu_vm_bo_base *vm_bo)
struct amdgpu_bo *bo = vm_bo->bo;
vm_bo->moved = true;
- spin_lock(&vm_bo->vm->status_lock);
+ amdgpu_vm_assert_locked(vm);
if (bo->tbo.type == ttm_bo_type_kernel)
list_move(&vm_bo->vm_status, &vm->evicted);
else
list_move_tail(&vm_bo->vm_status, &vm->evicted);
- spin_unlock(&vm_bo->vm->status_lock);
}
/**
* amdgpu_vm_bo_moved - vm_bo is moved
@@ -198,9 +210,8 @@ static void amdgpu_vm_bo_evicted(struct amdgpu_vm_bo_base *vm_bo)
*/
static void amdgpu_vm_bo_moved(struct amdgpu_vm_bo_base *vm_bo)
{
- spin_lock(&vm_bo->vm->status_lock);
+ amdgpu_vm_assert_locked(vm_bo->vm);
list_move(&vm_bo->vm_status, &vm_bo->vm->moved);
- spin_unlock(&vm_bo->vm->status_lock);
}
/**
@@ -213,9 +224,8 @@ static void amdgpu_vm_bo_moved(struct amdgpu_vm_bo_base *vm_bo)
*/
static void amdgpu_vm_bo_idle(struct amdgpu_vm_bo_base *vm_bo)
{
- spin_lock(&vm_bo->vm->status_lock);
+ amdgpu_vm_assert_locked(vm_bo->vm);
list_move(&vm_bo->vm_status, &vm_bo->vm->idle);
- spin_unlock(&vm_bo->vm->status_lock);
vm_bo->moved = false;
}
@@ -229,9 +239,9 @@ static void amdgpu_vm_bo_idle(struct amdgpu_vm_bo_base *vm_bo)
*/
static void amdgpu_vm_bo_invalidated(struct amdgpu_vm_bo_base *vm_bo)
{
- spin_lock(&vm_bo->vm->status_lock);
+ spin_lock(&vm_bo->vm->invalidated_lock);
list_move(&vm_bo->vm_status, &vm_bo->vm->invalidated);
- spin_unlock(&vm_bo->vm->status_lock);
+ spin_unlock(&vm_bo->vm->invalidated_lock);
}
/**
@@ -244,10 +254,9 @@ static void amdgpu_vm_bo_invalidated(struct amdgpu_vm_bo_base *vm_bo)
*/
static void amdgpu_vm_bo_evicted_user(struct amdgpu_vm_bo_base *vm_bo)
{
+ amdgpu_vm_assert_locked(vm_bo->vm);
vm_bo->moved = true;
- spin_lock(&vm_bo->vm->status_lock);
list_move(&vm_bo->vm_status, &vm_bo->vm->evicted_user);
- spin_unlock(&vm_bo->vm->status_lock);
}
/**
@@ -260,13 +269,11 @@ static void amdgpu_vm_bo_evicted_user(struct amdgpu_vm_bo_base *vm_bo)
*/
static void amdgpu_vm_bo_relocated(struct amdgpu_vm_bo_base *vm_bo)
{
- if (vm_bo->bo->parent) {
- spin_lock(&vm_bo->vm->status_lock);
+ amdgpu_vm_assert_locked(vm_bo->vm);
+ if (vm_bo->bo->parent)
list_move(&vm_bo->vm_status, &vm_bo->vm->relocated);
- spin_unlock(&vm_bo->vm->status_lock);
- } else {
+ else
amdgpu_vm_bo_idle(vm_bo);
- }
}
/**
@@ -279,9 +286,8 @@ static void amdgpu_vm_bo_relocated(struct amdgpu_vm_bo_base *vm_bo)
*/
static void amdgpu_vm_bo_done(struct amdgpu_vm_bo_base *vm_bo)
{
- spin_lock(&vm_bo->vm->status_lock);
+ amdgpu_vm_assert_locked(vm_bo->vm);
list_move(&vm_bo->vm_status, &vm_bo->vm->done);
- spin_unlock(&vm_bo->vm->status_lock);
}
/**
@@ -295,10 +301,13 @@ static void amdgpu_vm_bo_reset_state_machine(struct amdgpu_vm *vm)
{
struct amdgpu_vm_bo_base *vm_bo, *tmp;
- spin_lock(&vm->status_lock);
+ spin_lock(&vm->invalidated_lock);
list_splice_init(&vm->done, &vm->invalidated);
list_for_each_entry(vm_bo, &vm->invalidated, vm_status)
vm_bo->moved = true;
+ spin_unlock(&vm->invalidated_lock);
+
+ amdgpu_vm_assert_locked(vm_bo->vm);
list_for_each_entry_safe(vm_bo, tmp, &vm->idle, vm_status) {
struct amdgpu_bo *bo = vm_bo->bo;
@@ -308,14 +317,13 @@ static void amdgpu_vm_bo_reset_state_machine(struct amdgpu_vm *vm)
else if (bo->parent)
list_move(&vm_bo->vm_status, &vm_bo->vm->relocated);
}
- spin_unlock(&vm->status_lock);
}
/**
* amdgpu_vm_update_shared - helper to update shared memory stat
* @base: base structure for tracking BO usage in a VM
*
- * Takes the vm status_lock and updates the shared memory stat. If the basic
+ * Takes the vm stats_lock and updates the shared memory stat. If the basic
* stat changed (e.g. buffer was moved) amdgpu_vm_update_stats need to be called
* as well.
*/
@@ -327,7 +335,8 @@ static void amdgpu_vm_update_shared(struct amdgpu_vm_bo_base *base)
uint32_t bo_memtype = amdgpu_bo_mem_stats_placement(bo);
bool shared;
- spin_lock(&vm->status_lock);
+ dma_resv_assert_held(bo->tbo.base.resv);
+ spin_lock(&vm->stats_lock);
shared = drm_gem_object_is_shared_for_memory_stats(&bo->tbo.base);
if (base->shared != shared) {
base->shared = shared;
@@ -339,7 +348,7 @@ static void amdgpu_vm_update_shared(struct amdgpu_vm_bo_base *base)
vm->stats[bo_memtype].drm.private += size;
}
}
- spin_unlock(&vm->status_lock);
+ spin_unlock(&vm->stats_lock);
}
/**
@@ -364,11 +373,11 @@ void amdgpu_vm_bo_update_shared(struct amdgpu_bo *bo)
* be bo->tbo.resource
* @sign: if we should add (+1) or subtract (-1) from the stat
*
- * Caller need to have the vm status_lock held. Useful for when multiple update
+ * Caller need to have the vm stats_lock held. Useful for when multiple update
* need to happen at the same time.
*/
static void amdgpu_vm_update_stats_locked(struct amdgpu_vm_bo_base *base,
- struct ttm_resource *res, int sign)
+ struct ttm_resource *res, int sign)
{
struct amdgpu_vm *vm = base->vm;
struct amdgpu_bo *bo = base->bo;
@@ -392,7 +401,8 @@ static void amdgpu_vm_update_stats_locked(struct amdgpu_vm_bo_base *base,
*/
if (bo->flags & AMDGPU_GEM_CREATE_DISCARDABLE)
vm->stats[res_memtype].drm.purgeable += size;
- if (!(bo->preferred_domains & amdgpu_mem_type_to_domain(res_memtype)))
+ if (!(bo->preferred_domains &
+ amdgpu_mem_type_to_domain(res_memtype)))
vm->stats[bo_memtype].evicted += size;
}
}
@@ -411,9 +421,9 @@ void amdgpu_vm_update_stats(struct amdgpu_vm_bo_base *base,
{
struct amdgpu_vm *vm = base->vm;
- spin_lock(&vm->status_lock);
+ spin_lock(&vm->stats_lock);
amdgpu_vm_update_stats_locked(base, res, sign);
- spin_unlock(&vm->status_lock);
+ spin_unlock(&vm->stats_lock);
}
/**
@@ -439,10 +449,10 @@ void amdgpu_vm_bo_base_init(struct amdgpu_vm_bo_base *base,
base->next = bo->vm_bo;
bo->vm_bo = base;
- spin_lock(&vm->status_lock);
+ spin_lock(&vm->stats_lock);
base->shared = drm_gem_object_is_shared_for_memory_stats(&bo->tbo.base);
amdgpu_vm_update_stats_locked(base, bo->tbo.resource, +1);
- spin_unlock(&vm->status_lock);
+ spin_unlock(&vm->stats_lock);
if (!amdgpu_vm_is_bo_always_valid(vm, bo))
return;
@@ -501,10 +511,10 @@ int amdgpu_vm_lock_done_list(struct amdgpu_vm *vm, struct drm_exec *exec,
int ret;
/* We can only trust prev->next while holding the lock */
- spin_lock(&vm->status_lock);
+ spin_lock(&vm->invalidated_lock);
while (!list_is_head(prev->next, &vm->done)) {
bo_va = list_entry(prev->next, typeof(*bo_va), base.vm_status);
- spin_unlock(&vm->status_lock);
+ spin_unlock(&vm->invalidated_lock);
bo = bo_va->base.bo;
if (bo) {
@@ -512,10 +522,10 @@ int amdgpu_vm_lock_done_list(struct amdgpu_vm *vm, struct drm_exec *exec,
if (unlikely(ret))
return ret;
}
- spin_lock(&vm->status_lock);
+ spin_lock(&vm->invalidated_lock);
prev = prev->next;
}
- spin_unlock(&vm->status_lock);
+ spin_unlock(&vm->invalidated_lock);
return 0;
}
@@ -611,7 +621,7 @@ int amdgpu_vm_validate(struct amdgpu_device *adev, struct amdgpu_vm *vm,
void *param)
{
uint64_t new_vm_generation = amdgpu_vm_generation(adev, vm);
- struct amdgpu_vm_bo_base *bo_base;
+ struct amdgpu_vm_bo_base *bo_base, *tmp;
struct amdgpu_bo *bo;
int r;
@@ -624,13 +634,7 @@ int amdgpu_vm_validate(struct amdgpu_device *adev, struct amdgpu_vm *vm,
return r;
}
- spin_lock(&vm->status_lock);
- while (!list_empty(&vm->evicted)) {
- bo_base = list_first_entry(&vm->evicted,
- struct amdgpu_vm_bo_base,
- vm_status);
- spin_unlock(&vm->status_lock);
-
+ list_for_each_entry_safe(bo_base, tmp, &vm->evicted, vm_status) {
bo = bo_base->bo;
r = validate(param, bo);
@@ -643,26 +647,21 @@ int amdgpu_vm_validate(struct amdgpu_device *adev, struct amdgpu_vm *vm,
vm->update_funcs->map_table(to_amdgpu_bo_vm(bo));
amdgpu_vm_bo_relocated(bo_base);
}
- spin_lock(&vm->status_lock);
}
- while (ticket && !list_empty(&vm->evicted_user)) {
- bo_base = list_first_entry(&vm->evicted_user,
- struct amdgpu_vm_bo_base,
- vm_status);
- spin_unlock(&vm->status_lock);
- bo = bo_base->bo;
- dma_resv_assert_held(bo->tbo.base.resv);
-
- r = validate(param, bo);
- if (r)
- return r;
+ if (ticket) {
+ list_for_each_entry_safe(bo_base, tmp, &vm->evicted_user,
+ vm_status) {
+ bo = bo_base->bo;
+ dma_resv_assert_held(bo->tbo.base.resv);
- amdgpu_vm_bo_invalidated(bo_base);
+ r = validate(param, bo);
+ if (r)
+ return r;
- spin_lock(&vm->status_lock);
+ amdgpu_vm_bo_invalidated(bo_base);
+ }
}
- spin_unlock(&vm->status_lock);
amdgpu_vm_eviction_lock(vm);
vm->evicting = false;
@@ -685,13 +684,13 @@ bool amdgpu_vm_ready(struct amdgpu_vm *vm)
{
bool ret;
+ amdgpu_vm_assert_locked(vm);
+
amdgpu_vm_eviction_lock(vm);
ret = !vm->evicting;
amdgpu_vm_eviction_unlock(vm);
- spin_lock(&vm->status_lock);
ret &= list_empty(&vm->evicted);
- spin_unlock(&vm->status_lock);
spin_lock(&vm->immediate.lock);
ret &= !vm->immediate.stopped;
@@ -982,16 +981,13 @@ int amdgpu_vm_update_pdes(struct amdgpu_device *adev,
struct amdgpu_vm *vm, bool immediate)
{
struct amdgpu_vm_update_params params;
- struct amdgpu_vm_bo_base *entry;
+ struct amdgpu_vm_bo_base *entry, *tmp;
bool flush_tlb_needed = false;
- LIST_HEAD(relocated);
int r, idx;
- spin_lock(&vm->status_lock);
- list_splice_init(&vm->relocated, &relocated);
- spin_unlock(&vm->status_lock);
+ amdgpu_vm_assert_locked(vm);
- if (list_empty(&relocated))
+ if (list_empty(&vm->relocated))
return 0;
if (!drm_dev_enter(adev_to_drm(adev), &idx))
@@ -1006,7 +1002,7 @@ int amdgpu_vm_update_pdes(struct amdgpu_device *adev,
if (r)
goto error;
- list_for_each_entry(entry, &relocated, vm_status) {
+ list_for_each_entry(entry, &vm->relocated, vm_status) {
/* vm_flush_needed after updating moved PDEs */
flush_tlb_needed |= entry->moved;
@@ -1022,9 +1018,7 @@ int amdgpu_vm_update_pdes(struct amdgpu_device *adev,
if (flush_tlb_needed)
atomic64_inc(&vm->tlb_seq);
- while (!list_empty(&relocated)) {
- entry = list_first_entry(&relocated, struct amdgpu_vm_bo_base,
- vm_status);
+ list_for_each_entry_safe(entry, tmp, &vm->relocated, vm_status) {
amdgpu_vm_bo_idle(entry);
}
@@ -1250,9 +1244,9 @@ error_free:
void amdgpu_vm_get_memory(struct amdgpu_vm *vm,
struct amdgpu_mem_stats stats[__AMDGPU_PL_NUM])
{
- spin_lock(&vm->status_lock);
+ spin_lock(&vm->stats_lock);
memcpy(stats, vm->stats, sizeof(*stats) * __AMDGPU_PL_NUM);
- spin_unlock(&vm->status_lock);
+ spin_unlock(&vm->stats_lock);
}
/**
@@ -1619,29 +1613,24 @@ int amdgpu_vm_handle_moved(struct amdgpu_device *adev,
struct amdgpu_vm *vm,
struct ww_acquire_ctx *ticket)
{
- struct amdgpu_bo_va *bo_va;
+ struct amdgpu_bo_va *bo_va, *tmp;
struct dma_resv *resv;
bool clear, unlock;
int r;
- spin_lock(&vm->status_lock);
- while (!list_empty(&vm->moved)) {
- bo_va = list_first_entry(&vm->moved, struct amdgpu_bo_va,
- base.vm_status);
- spin_unlock(&vm->status_lock);
-
+ list_for_each_entry_safe(bo_va, tmp, &vm->moved, base.vm_status) {
/* Per VM BOs never need to bo cleared in the page tables */
r = amdgpu_vm_bo_update(adev, bo_va, false);
if (r)
return r;
- spin_lock(&vm->status_lock);
}
+ spin_lock(&vm->invalidated_lock);
while (!list_empty(&vm->invalidated)) {
bo_va = list_first_entry(&vm->invalidated, struct amdgpu_bo_va,
base.vm_status);
resv = bo_va->base.bo->tbo.base.resv;
- spin_unlock(&vm->status_lock);
+ spin_unlock(&vm->invalidated_lock);
/* Try to reserve the BO to avoid clearing its ptes */
if (!adev->debug_vm && dma_resv_trylock(resv)) {
@@ -1673,9 +1662,9 @@ int amdgpu_vm_handle_moved(struct amdgpu_device *adev,
bo_va->base.bo->tbo.resource->mem_type == TTM_PL_SYSTEM))
amdgpu_vm_bo_evicted_user(&bo_va->base);
- spin_lock(&vm->status_lock);
+ spin_lock(&vm->invalidated_lock);
}
- spin_unlock(&vm->status_lock);
+ spin_unlock(&vm->invalidated_lock);
return 0;
}
@@ -2204,9 +2193,9 @@ void amdgpu_vm_bo_del(struct amdgpu_device *adev,
}
}
- spin_lock(&vm->status_lock);
+ spin_lock(&vm->invalidated_lock);
list_del(&bo_va->base.vm_status);
- spin_unlock(&vm->status_lock);
+ spin_unlock(&vm->invalidated_lock);
list_for_each_entry_safe(mapping, next, &bo_va->valids, list) {
list_del(&mapping->list);
@@ -2314,10 +2303,10 @@ void amdgpu_vm_bo_move(struct amdgpu_bo *bo, struct ttm_resource *new_mem,
for (bo_base = bo->vm_bo; bo_base; bo_base = bo_base->next) {
struct amdgpu_vm *vm = bo_base->vm;
- spin_lock(&vm->status_lock);
+ spin_lock(&vm->stats_lock);
amdgpu_vm_update_stats_locked(bo_base, bo->tbo.resource, -1);
amdgpu_vm_update_stats_locked(bo_base, new_mem, +1);
- spin_unlock(&vm->status_lock);
+ spin_unlock(&vm->stats_lock);
}
amdgpu_vm_bo_invalidate(bo, evicted);
@@ -2584,11 +2573,12 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
INIT_LIST_HEAD(&vm->relocated);
INIT_LIST_HEAD(&vm->moved);
INIT_LIST_HEAD(&vm->idle);
+ spin_lock_init(&vm->invalidated_lock);
INIT_LIST_HEAD(&vm->invalidated);
- spin_lock_init(&vm->status_lock);
INIT_LIST_HEAD(&vm->freed);
INIT_LIST_HEAD(&vm->done);
INIT_KFIFO(vm->faults);
+ spin_lock_init(&vm->stats_lock);
r = amdgpu_vm_init_entities(adev, vm);
if (r)
@@ -3053,7 +3043,8 @@ void amdgpu_debugfs_vm_bo_info(struct amdgpu_vm *vm, struct seq_file *m)
unsigned int total_done_objs = 0;
unsigned int id = 0;
- spin_lock(&vm->status_lock);
+ amdgpu_vm_assert_locked(vm);
+
seq_puts(m, "\tIdle BOs:\n");
list_for_each_entry_safe(bo_va, tmp, &vm->idle, base.vm_status) {
if (!bo_va->base.bo)
@@ -3091,11 +3082,13 @@ void amdgpu_debugfs_vm_bo_info(struct amdgpu_vm *vm, struct seq_file *m)
id = 0;
seq_puts(m, "\tInvalidated BOs:\n");
+ spin_lock(&vm->invalidated_lock);
list_for_each_entry_safe(bo_va, tmp, &vm->invalidated, base.vm_status) {
if (!bo_va->base.bo)
continue;
total_invalidated += amdgpu_bo_print_info(id++, bo_va->base.bo, m);
}
+ spin_unlock(&vm->invalidated_lock);
total_invalidated_objs = id;
id = 0;
@@ -3105,7 +3098,6 @@ void amdgpu_debugfs_vm_bo_info(struct amdgpu_vm *vm, struct seq_file *m)
continue;
total_done += amdgpu_bo_print_info(id++, bo_va->base.bo, m);
}
- spin_unlock(&vm->status_lock);
total_done_objs = id;
seq_printf(m, "\tTotal idle size: %12lld\tobjs:\t%d\n", total_idle,