From 6716a823d18d422c356c2cd0c087f46d84c9e713 Mon Sep 17 00:00:00 2001 From: Christian König Date: Fri, 6 Jun 2025 14:13:37 +0200 Subject: drm/amdgpu: rework how PTE flags are generated v3 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously we tried to keep the HW specific PTE flags in each mapping, but for CRIU that isn't sufficient any more since the original value is needed for the checkpoint procedure. So rework the whole handling, nuke the early mapping function, keep the UAPI flags in each mapping instead of the HW flags and translate them to the HW flags while filling in the PTEs. Only tested on Navi 23 for now, so it probably needs quite a bit more work. v2: fix KFD and SVM handling v3: one more SVM fix pointed out by Felix v4: squash in gfx12 fix from David Signed-off-by: Christian König Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 5cacf5717016..39b4250ede0f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -1332,13 +1332,14 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev, struct amdgpu_bo_va *bo_va, /* normally,bo_va->flags only contians READABLE and WIRTEABLE bit go here * but in case of something, we filter the flags in first place */ - if (!(mapping->flags & AMDGPU_PTE_READABLE)) + if (!(mapping->flags & AMDGPU_VM_PAGE_READABLE)) update_flags &= ~AMDGPU_PTE_READABLE; - if (!(mapping->flags & AMDGPU_PTE_WRITEABLE)) + if (!(mapping->flags & AMDGPU_VM_PAGE_WRITEABLE)) update_flags &= ~AMDGPU_PTE_WRITEABLE; /* Apply ASIC specific mapping flags */ - amdgpu_gmc_get_vm_pte(adev, mapping, &update_flags); + amdgpu_gmc_get_vm_pte(adev, vm, bo, mapping->flags, + &update_flags); trace_amdgpu_vm_bo_update(mapping); @@ -1479,7 +1480,7 @@ static void amdgpu_vm_free_mapping(struct amdgpu_device *adev, struct amdgpu_bo_va_mapping *mapping, struct dma_fence *fence) { - if (mapping->flags & AMDGPU_PTE_PRT_FLAG(adev)) + if (mapping->flags & AMDGPU_VM_PAGE_PRT) amdgpu_vm_add_prt_cb(adev, fence); kfree(mapping); } @@ -1758,7 +1759,7 @@ static void amdgpu_vm_bo_insert_map(struct amdgpu_device *adev, list_add(&mapping->list, &bo_va->invalids); amdgpu_vm_it_insert(mapping, &vm->va); - if (mapping->flags & AMDGPU_PTE_PRT_FLAG(adev)) + if (mapping->flags & AMDGPU_VM_PAGE_PRT) amdgpu_vm_prt_get(adev); if (amdgpu_vm_is_bo_always_valid(vm, bo) && !bo_va->base.moved) @@ -1818,7 +1819,7 @@ static int amdgpu_vm_verify_parameters(struct amdgpu_device *adev, int amdgpu_vm_bo_map(struct amdgpu_device *adev, struct amdgpu_bo_va *bo_va, uint64_t saddr, uint64_t offset, - uint64_t size, uint64_t flags) + uint64_t size, uint32_t flags) { struct amdgpu_bo_va_mapping *mapping, *tmp; struct amdgpu_bo *bo = bo_va->base.bo; @@ -1877,7 +1878,7 @@ int amdgpu_vm_bo_map(struct amdgpu_device *adev, int amdgpu_vm_bo_replace_map(struct amdgpu_device *adev, struct amdgpu_bo_va *bo_va, uint64_t saddr, uint64_t offset, - uint64_t size, uint64_t flags) + uint64_t size, uint32_t flags) { struct amdgpu_bo_va_mapping *mapping; struct amdgpu_bo *bo = bo_va->base.bo; @@ -2734,7 +2735,7 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm) dma_fence_put(vm->last_tlb_flush); list_for_each_entry_safe(mapping, tmp, &vm->freed, list) { - if
(mapping->flags & AMDGPU_PTE_PRT_FLAG(adev) && prt_fini_needed) { + if (mapping->flags & AMDGPU_VM_PAGE_PRT && prt_fini_needed) { amdgpu_vm_prt_fini(adev, vm); prt_fini_needed = false; } -- cgit v1.2.3 From f101c13a8720c73e67f8f9d511fbbeda95bcedb1 Mon Sep 17 00:00:00 2001 From: Liu01 Tong Date: Mon, 11 Aug 2025 14:52:37 +0800 Subject: drm/amdgpu: fix task hang from failed job submission during process kill MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit During process kill, drm_sched_entity_flush() will kill the vm entities. The following job submissions of this process will fail, and the resources of these jobs have not been released, nor have the fences been signalled, causing tasks to hang and time out. Fix this by checking the entity status in amdgpu_vm_ready() and avoiding job submission to a stopped entity. v2: add amdgpu_vm_ready() check before amdgpu_vm_clear_freed() in function amdgpu_cs_vm_handling(). Signed-off-by: Liu01 Tong Signed-off-by: Lin.Cao Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 3 +++ drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 15 +++++++++++---- 2 files changed, 14 insertions(+), 4 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c index 27f8f316f6c2..2ac9729e4c86 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c @@ -1139,6 +1139,9 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p) } } + if (!amdgpu_vm_ready(vm)) + return -EINVAL; + r = amdgpu_vm_clear_freed(adev, vm, NULL); if (r) return r; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 39b4250ede0f..bd12d8ff15a4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -654,11 +654,10 @@ int amdgpu_vm_validate(struct amdgpu_device *adev, struct amdgpu_vm *vm, * Check if all VM PDs/PTs are ready for updates * * Returns: - * True if VM is not evicting. + * True if VM is not evicting and all VM entities are not stopped */ bool amdgpu_vm_ready(struct amdgpu_vm *vm) { - bool empty; bool ret; amdgpu_vm_eviction_lock(vm); @@ -666,10 +665,18 @@ bool amdgpu_vm_ready(struct amdgpu_vm *vm) amdgpu_vm_eviction_unlock(vm); spin_lock(&vm->status_lock); - empty = list_empty(&vm->evicted); + ret &= list_empty(&vm->evicted); spin_unlock(&vm->status_lock); - return ret && empty; + spin_lock(&vm->immediate.lock); + ret &= !vm->immediate.stopped; + spin_unlock(&vm->immediate.lock); + + spin_lock(&vm->delayed.lock); + ret &= !vm->delayed.stopped; + spin_unlock(&vm->delayed.lock); + + return ret; } /** -- cgit v1.2.3 From 3eb61d7cb74cea2ea697363669fa256937164758 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Tue, 15 Jul 2025 10:26:22 +0200 Subject: Revert "drm/amdgpu: Use dma_buf from GEM object instance" This reverts commit 515986100d176663d0a03219a3056e4252f729e6. The dma_buf field in struct drm_gem_object is not stable over the object instance's lifetime. The field becomes NULL when user space releases the final GEM handle on the buffer object. This resulted in a NULL-pointer deref. Workarounds in commit 5307dce878d4 ("drm/gem: Acquire references on GEM handles for framebuffers") and commit f6bfc9afc751 ("drm/framebuffer: Acquire internal references on GEM handles") only solved the problem partially. They especially don't work for buffer objects without a DRM framebuffer associated.
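As a rough illustration (not part of the patch), the access pattern being restored looks like this; the attachment, and with it import_attach->dmabuf, exists for as long as the object stays imported, which drm_gem_is_imported() checks:

	if (drm_gem_is_imported(obj)) {
		/* Unlike obj->dma_buf, which is cleared when user space
		 * drops the last GEM handle, the attachment's dma_buf
		 * pointer stays valid for the lifetime of the import.
		 */
		struct dma_buf *dma_buf = obj->import_attach->dmabuf;

		/* ... use dma_buf ... */
	}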
Hence, this reverts back to using .import_attach->dmabuf. Signed-off-by: Thomas Zimmermann Reviewed-by: Simona Vetter Acked-by: Alex Deucher Link: https://lore.kernel.org/r/20250715082635.34974-1-tzimmermann@suse.de --- drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 3 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c index ff98c87b2e0b..5743ebb2f1b7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c @@ -514,7 +514,7 @@ bool amdgpu_dmabuf_is_xgmi_accessible(struct amdgpu_device *adev, return false; if (drm_gem_is_imported(obj)) { - struct dma_buf *dma_buf = obj->dma_buf; + struct dma_buf *dma_buf = obj->import_attach->dmabuf; if (dma_buf->ops != &amdgpu_dmabuf_ops) /* No XGMI with non AMD GPUs */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c index 6626a6e64ff5..d1ccbfcf21fa 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c @@ -317,7 +317,8 @@ static int amdgpu_gem_object_open(struct drm_gem_object *obj, */ if (!vm->is_compute_context || !vm->process_info) return 0; - if (!drm_gem_is_imported(obj) || !dma_buf_is_dynamic(obj->dma_buf)) + if (!drm_gem_is_imported(obj) || + !dma_buf_is_dynamic(obj->import_attach->dmabuf)) return 0; mutex_lock_nested(&vm->process_info->lock, 1); if (!WARN_ON(!vm->process_info->eviction_fence)) { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 5cacf5717016..8777ed8facd6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -1276,7 +1276,7 @@ int amdgpu_vm_bo_update(struct amdgpu_device *adev, struct amdgpu_bo_va *bo_va, struct drm_gem_object *obj = &bo->tbo.base; if (drm_gem_is_imported(obj) && bo_va->is_xgmi) { - struct dma_buf *dma_buf = obj->dma_buf; + struct dma_buf *dma_buf = obj->import_attach->dmabuf; struct drm_gem_object *gobj = dma_buf->priv; struct amdgpu_bo *abo = gem_to_amdgpu_bo(gobj); -- cgit v1.2.3 From 256576ed6895b81290690ae3e2b9f62eb7c642fc Mon Sep 17 00:00:00 2001 From: Pierre-Eric Pelloux-Prayer Date: Wed, 4 Jun 2025 14:28:23 +0200 Subject: drm/amdgpu: give each kernel job a unique id Userspace jobs have drm_file.client_id as a unique identifier of the job's owner. For kernel jobs, we can allocate arbitrary values - the risk of overlap with userspace ids is small (given that it's a u64 value). In the unlikely case the overlap happens, it'll only impact trace events. Since this ID is traced in the gpu_scheduler trace events, this allows determining the source of each job sent to the hardware. To make grepping easier, the IDs are defined as they will appear in the trace output.
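As an illustrative sketch (mirroring the amdgpu_ttm.c hunk below), a kernel-internal allocation site now passes one of these fixed IDs along with the other job parameters:

	/* The constant ID shows up verbatim in the gpu_scheduler trace
	 * events, so the job can be attributed to its in-kernel source
	 * and grepped for directly.
	 */
	r = amdgpu_job_alloc_with_ib(adev, &adev->mman.high_pr,
				     AMDGPU_FENCE_OWNER_UNDEFINED,
				     num_dw * 4 + num_bytes,
				     AMDGPU_IB_POOL_DELAYED, &job,
				     AMDGPU_KERNEL_JOB_ID_TTM_MAP_BUFFER);
	if (r)
		return r;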
Signed-off-by: Pierre-Eric Pelloux-Prayer Acked-by: Alex Deucher Signed-off-by: Arunpravin Paneer Selvam Link: https://lore.kernel.org/r/20250604122827.2191-1-pierre-eric.pelloux-prayer@amd.com --- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 3 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 5 +++-- drivers/gpu/drm/amd/amdgpu/amdgpu_job.h | 19 ++++++++++++++++++- drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c | 3 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 3 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 28 +++++++++++++++++----------- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h | 3 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c | 3 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c | 5 +++-- drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 8 ++++---- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 6 ++++-- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c | 4 +++- drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 4 +++- drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c | 12 +++++++----- drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c | 6 ++++-- drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c | 6 ++++-- drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 3 ++- 19 files changed, 84 insertions(+), 41 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index c80c8f543532..98aa99b314c9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -1474,7 +1474,8 @@ static int amdgpu_gfx_run_cleaner_shader_job(struct amdgpu_ring *ring) owner = (void *)(unsigned long)atomic_inc_return(&counter); r = amdgpu_job_alloc_with_ib(ring->adev, &entity, owner, - 64, 0, &job); + 64, 0, &job, + AMDGPU_KERNEL_JOB_ID_CLEANER_SHADER); if (r) goto err; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c index 97b562a79ea8..9dcf51991b5b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c @@ -690,7 +690,7 @@ void amdgpu_gmc_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid, r = amdgpu_job_alloc_with_ib(ring->adev, &adev->mman.high_pr, AMDGPU_FENCE_OWNER_UNDEFINED, 16 * 4, AMDGPU_IB_POOL_IMMEDIATE, - &job); + &job, AMDGPU_KERNEL_JOB_ID_FLUSH_GPU_TLB); if (r) goto error_alloc; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index 9b1c55115921..d020a890a0ea 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -209,11 +209,12 @@ int amdgpu_job_alloc(struct amdgpu_device *adev, struct amdgpu_vm *vm, int amdgpu_job_alloc_with_ib(struct amdgpu_device *adev, struct drm_sched_entity *entity, void *owner, size_t size, enum amdgpu_ib_pool_type pool_type, - struct amdgpu_job **job) + struct amdgpu_job **job, u64 k_job_id) { int r; - r = amdgpu_job_alloc(adev, NULL, entity, owner, 1, job, 0); + r = amdgpu_job_alloc(adev, NULL, entity, owner, 1, job, + k_job_id); if (r) return r; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h index 2f302266662b..4a6487eb6cb5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h @@ -44,6 +44,22 @@ struct amdgpu_fence; enum amdgpu_ib_pool_type; +/* Internal kernel job ids. (decreasing values, starting from U64_MAX). 
*/ +#define AMDGPU_KERNEL_JOB_ID_VM_UPDATE (18446744073709551615ULL) +#define AMDGPU_KERNEL_JOB_ID_VM_UPDATE_PDES (18446744073709551614ULL) +#define AMDGPU_KERNEL_JOB_ID_VM_UPDATE_RANGE (18446744073709551613ULL) +#define AMDGPU_KERNEL_JOB_ID_VM_PT_CLEAR (18446744073709551612ULL) +#define AMDGPU_KERNEL_JOB_ID_TTM_MAP_BUFFER (18446744073709551611ULL) +#define AMDGPU_KERNEL_JOB_ID_TTM_ACCESS_MEMORY_SDMA (18446744073709551610ULL) +#define AMDGPU_KERNEL_JOB_ID_TTM_COPY_BUFFER (18446744073709551609ULL) +#define AMDGPU_KERNEL_JOB_ID_CLEAR_ON_RELEASE (18446744073709551608ULL) +#define AMDGPU_KERNEL_JOB_ID_MOVE_BLIT (18446744073709551607ULL) +#define AMDGPU_KERNEL_JOB_ID_TTM_CLEAR_BUFFER (18446744073709551606ULL) +#define AMDGPU_KERNEL_JOB_ID_CLEANER_SHADER (18446744073709551605ULL) +#define AMDGPU_KERNEL_JOB_ID_FLUSH_GPU_TLB (18446744073709551604ULL) +#define AMDGPU_KERNEL_JOB_ID_KFD_GART_MAP (18446744073709551603ULL) +#define AMDGPU_KERNEL_JOB_ID_VCN_RING_TEST (18446744073709551602ULL) + struct amdgpu_job { struct drm_sched_job base; struct amdgpu_vm *vm; @@ -96,7 +112,8 @@ int amdgpu_job_alloc(struct amdgpu_device *adev, struct amdgpu_vm *vm, int amdgpu_job_alloc_with_ib(struct amdgpu_device *adev, struct drm_sched_entity *entity, void *owner, size_t size, enum amdgpu_ib_pool_type pool_type, - struct amdgpu_job **job); + struct amdgpu_job **job, + u64 k_job_id); void amdgpu_job_set_resources(struct amdgpu_job *job, struct amdgpu_bo *gds, struct amdgpu_bo *gws, struct amdgpu_bo *oa); void amdgpu_job_free_resources(struct amdgpu_job *job); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c index 82d58ac7afb0..4980595bcddd 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_jpeg.c @@ -194,7 +194,8 @@ static int amdgpu_jpeg_dec_set_reg(struct amdgpu_ring *ring, uint32_t handle, int i, r; r = amdgpu_job_alloc_with_ib(ring->adev, NULL, NULL, ib_size_dw * 4, - AMDGPU_IB_POOL_DIRECT, &job); + AMDGPU_IB_POOL_DIRECT, &job, + AMDGPU_KERNEL_JOB_ID_VCN_RING_TEST); if (r) return r; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c index 122a88294883..d18bade9c98f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c @@ -1313,7 +1313,8 @@ void amdgpu_bo_release_notify(struct ttm_buffer_object *bo) if (r) goto out; - r = amdgpu_fill_buffer(abo, 0, &bo->base._resv, &fence, true); + r = amdgpu_fill_buffer(abo, 0, &bo->base._resv, &fence, true, + AMDGPU_KERNEL_JOB_ID_CLEAR_ON_RELEASE); if (WARN_ON(r)) goto out; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index 27ab4e754b2a..428265046815 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -226,7 +226,8 @@ static int amdgpu_ttm_map_buffer(struct ttm_buffer_object *bo, r = amdgpu_job_alloc_with_ib(adev, &adev->mman.high_pr, AMDGPU_FENCE_OWNER_UNDEFINED, num_dw * 4 + num_bytes, - AMDGPU_IB_POOL_DELAYED, &job); + AMDGPU_IB_POOL_DELAYED, &job, + AMDGPU_KERNEL_JOB_ID_TTM_MAP_BUFFER); if (r) return r; @@ -406,7 +407,7 @@ static int amdgpu_move_blit(struct ttm_buffer_object *bo, struct dma_fence *wipe_fence = NULL; r = amdgpu_fill_buffer(abo, 0, NULL, &wipe_fence, - false); + false, AMDGPU_KERNEL_JOB_ID_MOVE_BLIT); if (r) { goto error; } else if (wipe_fence) { @@ -1510,7 +1511,8 @@ static int amdgpu_ttm_access_memory_sdma(struct ttm_buffer_object *bo, r = amdgpu_job_alloc_with_ib(adev, 
&adev->mman.high_pr, AMDGPU_FENCE_OWNER_UNDEFINED, num_dw * 4, AMDGPU_IB_POOL_DELAYED, - &job); + &job, + AMDGPU_KERNEL_JOB_ID_TTM_ACCESS_MEMORY_SDMA); if (r) goto out; @@ -2167,7 +2169,7 @@ static int amdgpu_ttm_prepare_job(struct amdgpu_device *adev, struct dma_resv *resv, bool vm_needs_flush, struct amdgpu_job **job, - bool delayed) + bool delayed, u64 k_job_id) { enum amdgpu_ib_pool_type pool = direct_submit ? AMDGPU_IB_POOL_DIRECT : @@ -2177,7 +2179,7 @@ static int amdgpu_ttm_prepare_job(struct amdgpu_device *adev, &adev->mman.high_pr; r = amdgpu_job_alloc_with_ib(adev, entity, AMDGPU_FENCE_OWNER_UNDEFINED, - num_dw * 4, pool, job); + num_dw * 4, pool, job, k_job_id); if (r) return r; @@ -2217,7 +2219,8 @@ int amdgpu_copy_buffer(struct amdgpu_ring *ring, uint64_t src_offset, num_loops = DIV_ROUND_UP(byte_count, max_bytes); num_dw = ALIGN(num_loops * adev->mman.buffer_funcs->copy_num_dw, 8); r = amdgpu_ttm_prepare_job(adev, direct_submit, num_dw, - resv, vm_needs_flush, &job, false); + resv, vm_needs_flush, &job, false, + AMDGPU_KERNEL_JOB_ID_TTM_COPY_BUFFER); if (r) return r; @@ -2252,7 +2255,8 @@ static int amdgpu_ttm_fill_mem(struct amdgpu_ring *ring, uint32_t src_data, uint64_t dst_addr, uint32_t byte_count, struct dma_resv *resv, struct dma_fence **fence, - bool vm_needs_flush, bool delayed) + bool vm_needs_flush, bool delayed, + u64 k_job_id) { struct amdgpu_device *adev = ring->adev; unsigned int num_loops, num_dw; @@ -2265,7 +2269,7 @@ static int amdgpu_ttm_fill_mem(struct amdgpu_ring *ring, uint32_t src_data, num_loops = DIV_ROUND_UP_ULL(byte_count, max_bytes); num_dw = ALIGN(num_loops * adev->mman.buffer_funcs->fill_num_dw, 8); r = amdgpu_ttm_prepare_job(adev, false, num_dw, resv, vm_needs_flush, - &job, delayed); + &job, delayed, k_job_id); if (r) return r; @@ -2335,7 +2339,8 @@ int amdgpu_ttm_clear_buffer(struct amdgpu_bo *bo, goto err; r = amdgpu_ttm_fill_mem(ring, 0, addr, size, resv, - &next, true, true); + &next, true, true, + AMDGPU_KERNEL_JOB_ID_TTM_CLEAR_BUFFER); if (r) goto err; @@ -2354,7 +2359,8 @@ int amdgpu_fill_buffer(struct amdgpu_bo *bo, uint32_t src_data, struct dma_resv *resv, struct dma_fence **f, - bool delayed) + bool delayed, + u64 k_job_id) { struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev); struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring; @@ -2384,7 +2390,7 @@ int amdgpu_fill_buffer(struct amdgpu_bo *bo, goto error; r = amdgpu_ttm_fill_mem(ring, src_data, to, cur_size, resv, - &next, true, delayed); + &next, true, delayed, k_job_id); if (r) goto error; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h index 2309df3f68a9..d82d107fdcc6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h @@ -182,7 +182,8 @@ int amdgpu_fill_buffer(struct amdgpu_bo *bo, uint32_t src_data, struct dma_resv *resv, struct dma_fence **fence, - bool delayed); + bool delayed, + u64 k_job_id); int amdgpu_ttm_alloc_gart(struct ttm_buffer_object *bo); void amdgpu_ttm_recover_gart(struct ttm_buffer_object *tbo); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c index 74758b5ffc6c..5c38f0d30c87 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_uvd.c @@ -1136,7 +1136,8 @@ static int amdgpu_uvd_send_msg(struct amdgpu_ring *ring, struct amdgpu_bo *bo, r = amdgpu_job_alloc_with_ib(ring->adev, &adev->uvd.entity, AMDGPU_FENCE_OWNER_UNDEFINED, 64, direct ? 
AMDGPU_IB_POOL_DIRECT : - AMDGPU_IB_POOL_DELAYED, &job); + AMDGPU_IB_POOL_DELAYED, &job, + AMDGPU_KERNEL_JOB_ID_VCN_RING_TEST); if (r) return r; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c index b9060bcd4806..ce318f5de047 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vce.c @@ -449,7 +449,7 @@ static int amdgpu_vce_get_create_msg(struct amdgpu_ring *ring, uint32_t handle, r = amdgpu_job_alloc_with_ib(ring->adev, &ring->adev->vce.entity, AMDGPU_FENCE_OWNER_UNDEFINED, ib_size_dw * 4, AMDGPU_IB_POOL_DIRECT, - &job); + &job, AMDGPU_KERNEL_JOB_ID_VCN_RING_TEST); if (r) return r; @@ -540,7 +540,8 @@ static int amdgpu_vce_get_destroy_msg(struct amdgpu_ring *ring, uint32_t handle, AMDGPU_FENCE_OWNER_UNDEFINED, ib_size_dw * 4, direct ? AMDGPU_IB_POOL_DIRECT : - AMDGPU_IB_POOL_DELAYED, &job); + AMDGPU_IB_POOL_DELAYED, &job, + AMDGPU_KERNEL_JOB_ID_VCN_RING_TEST); if (r) return r; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c index f1f67521c29c..d287c44ddbc1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c @@ -601,7 +601,7 @@ static int amdgpu_vcn_dec_send_msg(struct amdgpu_ring *ring, r = amdgpu_job_alloc_with_ib(ring->adev, NULL, NULL, 64, AMDGPU_IB_POOL_DIRECT, - &job); + &job, AMDGPU_KERNEL_JOB_ID_VCN_RING_TEST); if (r) goto err; @@ -781,7 +781,7 @@ static int amdgpu_vcn_dec_sw_send_msg(struct amdgpu_ring *ring, r = amdgpu_job_alloc_with_ib(ring->adev, NULL, NULL, ib_size_dw * 4, AMDGPU_IB_POOL_DIRECT, - &job); + &job, AMDGPU_KERNEL_JOB_ID_VCN_RING_TEST); if (r) goto err; @@ -911,7 +911,7 @@ static int amdgpu_vcn_enc_get_create_msg(struct amdgpu_ring *ring, uint32_t hand r = amdgpu_job_alloc_with_ib(ring->adev, NULL, NULL, ib_size_dw * 4, AMDGPU_IB_POOL_DIRECT, - &job); + &job, AMDGPU_KERNEL_JOB_ID_VCN_RING_TEST); if (r) return r; @@ -978,7 +978,7 @@ static int amdgpu_vcn_enc_get_destroy_msg(struct amdgpu_ring *ring, uint32_t han r = amdgpu_job_alloc_with_ib(ring->adev, NULL, NULL, ib_size_dw * 4, AMDGPU_IB_POOL_DIRECT, - &job); + &job, AMDGPU_KERNEL_JOB_ID_VCN_RING_TEST); if (r) return r; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 0b87798daebd..314f11e6c78b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -977,7 +977,8 @@ int amdgpu_vm_update_pdes(struct amdgpu_device *adev, params.vm = vm; params.immediate = immediate; - r = vm->update_funcs->prepare(¶ms, NULL); + r = vm->update_funcs->prepare(¶ms, NULL, + AMDGPU_KERNEL_JOB_ID_VM_UPDATE_PDES); if (r) goto error; @@ -1146,7 +1147,8 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm, dma_fence_put(tmp); } - r = vm->update_funcs->prepare(¶ms, sync); + r = vm->update_funcs->prepare(¶ms, sync, + AMDGPU_KERNEL_JOB_ID_VM_UPDATE_RANGE); if (r) goto error_free; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index fd086efd8457..ff2c8c6f4d7c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -308,7 +308,7 @@ struct amdgpu_vm_update_params { struct amdgpu_vm_update_funcs { int (*map_table)(struct amdgpu_bo_vm *bo); int (*prepare)(struct amdgpu_vm_update_params *p, - struct amdgpu_sync *sync); + struct amdgpu_sync *sync, u64 k_job_id); int (*update)(struct amdgpu_vm_update_params *p, struct amdgpu_bo_vm *bo, uint64_t pe, uint64_t addr, 
unsigned count, uint32_t incr, uint64_t flags); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c index 0c1ef5850a5e..22e2e5b47341 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_cpu.c @@ -40,12 +40,14 @@ static int amdgpu_vm_cpu_map_table(struct amdgpu_bo_vm *table) * * @p: see amdgpu_vm_update_params definition * @sync: sync obj with fences to wait on + * @k_job_id: the id for tracing/debug purposes * * Returns: * Negativ errno, 0 for success. */ static int amdgpu_vm_cpu_prepare(struct amdgpu_vm_update_params *p, - struct amdgpu_sync *sync) + struct amdgpu_sync *sync, + u64 k_job_id) { if (!sync) return 0; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c index 30022123b0bf..f794fb1cc06e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c @@ -26,6 +26,7 @@ #include "amdgpu.h" #include "amdgpu_trace.h" #include "amdgpu_vm.h" +#include "amdgpu_job.h" /* * amdgpu_vm_pt_cursor - state for for_each_amdgpu_vm_pt @@ -395,7 +396,8 @@ int amdgpu_vm_pt_clear(struct amdgpu_device *adev, struct amdgpu_vm *vm, params.vm = vm; params.immediate = immediate; - r = vm->update_funcs->prepare(¶ms, NULL); + r = vm->update_funcs->prepare(¶ms, NULL, + AMDGPU_KERNEL_JOB_ID_VM_PT_CLEAR); if (r) goto exit; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c index 46d9fb433ab2..36805dcfa159 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_sdma.c @@ -40,7 +40,7 @@ static int amdgpu_vm_sdma_map_table(struct amdgpu_bo_vm *table) /* Allocate a new job for @count PTE updates */ static int amdgpu_vm_sdma_alloc_job(struct amdgpu_vm_update_params *p, - unsigned int count) + unsigned int count, u64 k_job_id) { enum amdgpu_ib_pool_type pool = p->immediate ? AMDGPU_IB_POOL_IMMEDIATE : AMDGPU_IB_POOL_DELAYED; @@ -56,7 +56,7 @@ static int amdgpu_vm_sdma_alloc_job(struct amdgpu_vm_update_params *p, ndw = min(ndw, AMDGPU_VM_SDMA_MAX_NUM_DW); r = amdgpu_job_alloc_with_ib(p->adev, entity, AMDGPU_FENCE_OWNER_VM, - ndw * 4, pool, &p->job); + ndw * 4, pool, &p->job, k_job_id); if (r) return r; @@ -69,16 +69,17 @@ static int amdgpu_vm_sdma_alloc_job(struct amdgpu_vm_update_params *p, * * @p: see amdgpu_vm_update_params definition * @sync: amdgpu_sync object with fences to wait for + * @k_job_id: identifier of the job, for tracing purpose * * Returns: * Negativ errno, 0 for success. 
*/ static int amdgpu_vm_sdma_prepare(struct amdgpu_vm_update_params *p, - struct amdgpu_sync *sync) + struct amdgpu_sync *sync, u64 k_job_id) { int r; - r = amdgpu_vm_sdma_alloc_job(p, 0); + r = amdgpu_vm_sdma_alloc_job(p, 0, k_job_id); if (r) return r; @@ -249,7 +250,8 @@ static int amdgpu_vm_sdma_update(struct amdgpu_vm_update_params *p, if (r) return r; - r = amdgpu_vm_sdma_alloc_job(p, count); + r = amdgpu_vm_sdma_alloc_job(p, count, + AMDGPU_KERNEL_JOB_ID_VM_UPDATE); if (r) return r; } diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c b/drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c index 1c07b701d0e4..ceb94bbb03a4 100644 --- a/drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c +++ b/drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c @@ -217,7 +217,8 @@ static int uvd_v6_0_enc_get_create_msg(struct amdgpu_ring *ring, uint32_t handle int i, r; r = amdgpu_job_alloc_with_ib(ring->adev, NULL, NULL, ib_size_dw * 4, - AMDGPU_IB_POOL_DIRECT, &job); + AMDGPU_IB_POOL_DIRECT, &job, + AMDGPU_KERNEL_JOB_ID_VCN_RING_TEST); if (r) return r; @@ -281,7 +282,8 @@ static int uvd_v6_0_enc_get_destroy_msg(struct amdgpu_ring *ring, int i, r; r = amdgpu_job_alloc_with_ib(ring->adev, NULL, NULL, ib_size_dw * 4, - AMDGPU_IB_POOL_DIRECT, &job); + AMDGPU_IB_POOL_DIRECT, &job, + AMDGPU_KERNEL_JOB_ID_VCN_RING_TEST); if (r) return r; diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c b/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c index 9d237b5937fb..1f8866f3f63c 100644 --- a/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c +++ b/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c @@ -225,7 +225,8 @@ static int uvd_v7_0_enc_get_create_msg(struct amdgpu_ring *ring, u32 handle, int i, r; r = amdgpu_job_alloc_with_ib(ring->adev, NULL, NULL, ib_size_dw * 4, - AMDGPU_IB_POOL_DIRECT, &job); + AMDGPU_IB_POOL_DIRECT, &job, + AMDGPU_KERNEL_JOB_ID_VCN_RING_TEST); if (r) return r; @@ -288,7 +289,8 @@ static int uvd_v7_0_enc_get_destroy_msg(struct amdgpu_ring *ring, u32 handle, int i, r; r = amdgpu_job_alloc_with_ib(ring->adev, NULL, NULL, ib_size_dw * 4, - AMDGPU_IB_POOL_DIRECT, &job); + AMDGPU_IB_POOL_DIRECT, &job, + AMDGPU_KERNEL_JOB_ID_VCN_RING_TEST); if (r) return r; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c index 79251f22b702..683ff02c45af 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c @@ -68,7 +68,8 @@ svm_migrate_gart_map(struct amdgpu_ring *ring, uint64_t npages, AMDGPU_FENCE_OWNER_UNDEFINED, num_dw * 4 + num_bytes, AMDGPU_IB_POOL_DELAYED, - &job); + &job, + AMDGPU_KERNEL_JOB_ID_KFD_GART_MAP); if (r) return r; -- cgit v1.2.3 From 39203f5e6dcf2b0529ae526004e7a9c8ef453ae2 Mon Sep 17 00:00:00 2001 From: Christian König Date: Wed, 27 Aug 2025 09:28:40 +0200 Subject: drm/amdgpu: fix userq VM validation v4 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit That was actually complete nonsense and not validating the BOs at all. The code just cleared all VM areas where it couldn't grab the lock for a BO. Try to fix this. Only compile tested at the moment. v2: fix fence slot reservation as well, as pointed out by Sunil. also validate PDs, PTs, per VM BOs and update PDEs v3: grab the status_lock while working with the done list. v4: rename functions, add some comments, fix waiting for updates to complete.
v4: rename amdgpu_vm_lock_done_list(), add some more comments Signed-off-by: Christian König Reviewed-by: Sunil Khatri Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 148 +++++++++++++++--------------- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 35 +++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 2 + 3 files changed, 110 insertions(+), 75 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c index d2e59aecb3eb..3dfe7f5f154b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c @@ -706,108 +706,106 @@ amdgpu_userq_restore_all(struct amdgpu_userq_mgr *uq_mgr) return ret; } +static int amdgpu_userq_validate_vm(void *param, struct amdgpu_bo *bo) +{ + struct ttm_operation_ctx ctx = { false, false }; + + amdgpu_bo_placement_from_domain(bo, bo->allowed_domains); + return ttm_bo_validate(&bo->tbo, &bo->placement, &ctx); +} + +/* Handle all BOs on the invalidated list, validate them and update the PTs */ static int -amdgpu_userq_validate_vm_bo(void *_unused, struct amdgpu_bo *bo) +amdgpu_userq_bo_validate(struct amdgpu_device *adev, struct drm_exec *exec, + struct amdgpu_vm *vm) { struct ttm_operation_ctx ctx = { false, false }; + struct amdgpu_bo_va *bo_va; + struct amdgpu_bo *bo; int ret; - amdgpu_bo_placement_from_domain(bo, bo->allowed_domains); + spin_lock(&vm->status_lock); + while (!list_empty(&vm->invalidated)) { + bo_va = list_first_entry(&vm->invalidated, + struct amdgpu_bo_va, + base.vm_status); + spin_unlock(&vm->status_lock); - ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx); - if (ret) - DRM_ERROR("Fail to validate\n"); + bo = bo_va->base.bo; + ret = drm_exec_prepare_obj(exec, &bo->tbo.base, 2); + if (unlikely(ret)) + return ret; - return ret; + amdgpu_bo_placement_from_domain(bo, bo->allowed_domains); + ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx); + if (ret) + return ret; + + /* This moves the bo_va to the done list */ + ret = amdgpu_vm_bo_update(adev, bo_va, false); + if (ret) + return ret; + + spin_lock(&vm->status_lock); + } + spin_unlock(&vm->status_lock); + + return 0; } +/* Make sure the whole VM is ready to be used */ static int -amdgpu_userq_validate_bos(struct amdgpu_userq_mgr *uq_mgr) +amdgpu_userq_vm_validate(struct amdgpu_userq_mgr *uq_mgr) { struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr); - struct amdgpu_vm *vm = &fpriv->vm; struct amdgpu_device *adev = uq_mgr->adev; + struct amdgpu_vm *vm = &fpriv->vm; struct amdgpu_bo_va *bo_va; - struct ww_acquire_ctx *ticket; struct drm_exec exec; - struct amdgpu_bo *bo; - struct dma_resv *resv; - bool clear, unlock; - int ret = 0; + int ret; drm_exec_init(&exec, DRM_EXEC_IGNORE_DUPLICATES, 0); drm_exec_until_all_locked(&exec) { - ret = amdgpu_vm_lock_pd(vm, &exec, 2); + ret = amdgpu_vm_lock_pd(vm, &exec, 1); drm_exec_retry_on_contention(&exec); - if (unlikely(ret)) { - drm_file_err(uq_mgr->file, "Failed to lock PD\n"); + if (unlikely(ret)) goto unlock_all; - } - - /* Lock the done list */ - list_for_each_entry(bo_va, &vm->done, base.vm_status) { - bo = bo_va->base.bo; - if (!bo) - continue; - ret = drm_exec_lock_obj(&exec, &bo->tbo.base); - drm_exec_retry_on_contention(&exec); - if (unlikely(ret)) - goto unlock_all; - } - } - - spin_lock(&vm->status_lock); - while (!list_empty(&vm->moved)) { - bo_va = list_first_entry(&vm->moved, struct amdgpu_bo_va, - base.vm_status); - spin_unlock(&vm->status_lock); - 
- /* Per VM BOs never need to bo cleared in the page tables */ - ret = amdgpu_vm_bo_update(adev, bo_va, false); - if (ret) + ret = amdgpu_vm_lock_done_list(vm, &exec, 1); + drm_exec_retry_on_contention(&exec); + if (unlikely(ret)) goto unlock_all; - spin_lock(&vm->status_lock); - } - - ticket = &exec.ticket; - while (!list_empty(&vm->invalidated)) { - bo_va = list_first_entry(&vm->invalidated, struct amdgpu_bo_va, - base.vm_status); - resv = bo_va->base.bo->tbo.base.resv; - spin_unlock(&vm->status_lock); - bo = bo_va->base.bo; - ret = amdgpu_userq_validate_vm_bo(NULL, bo); - if (ret) { - drm_file_err(uq_mgr->file, "Failed to validate BO\n"); + /* This validates PDs, PTs and per VM BOs */ + ret = amdgpu_vm_validate(adev, vm, NULL, + amdgpu_userq_validate_vm, + NULL); + if (unlikely(ret)) goto unlock_all; - } - /* Try to reserve the BO to avoid clearing its ptes */ - if (!adev->debug_vm && dma_resv_trylock(resv)) { - clear = false; - unlock = true; - /* The caller is already holding the reservation lock */ - } else if (dma_resv_locking_ctx(resv) == ticket) { - clear = false; - unlock = false; - /* Somebody else is using the BO right now */ - } else { - clear = true; - unlock = false; - } + /* This locks and validates the remaining evicted BOs */ + ret = amdgpu_userq_bo_validate(adev, &exec, vm); + drm_exec_retry_on_contention(&exec); + if (unlikely(ret)) + goto unlock_all; + } - ret = amdgpu_vm_bo_update(adev, bo_va, clear); + ret = amdgpu_vm_handle_moved(adev, vm, NULL); + if (ret) + goto unlock_all; - if (unlock) - dma_resv_unlock(resv); - if (ret) - goto unlock_all; + ret = amdgpu_vm_update_pdes(adev, vm, false); + if (ret) + goto unlock_all; - spin_lock(&vm->status_lock); - } - spin_unlock(&vm->status_lock); + /* + * We need to wait for all VM updates to finish before restarting the + * queues. Using the done list like that is now ok since everything is + * locked in place. + */ + list_for_each_entry(bo_va, &vm->done, base.vm_status) + dma_fence_wait(bo_va->last_pt_update, false); + dma_fence_wait(vm->last_update, false); ret = amdgpu_eviction_fence_replace_fence(&fpriv->evf_mgr, &exec); if (ret) @@ -828,7 +826,7 @@ static void amdgpu_userq_restore_worker(struct work_struct *work) mutex_lock(&uq_mgr->userq_mutex); - ret = amdgpu_userq_validate_bos(uq_mgr); + ret = amdgpu_userq_vm_validate(uq_mgr); if (ret) { drm_file_err(uq_mgr->file, "Failed to validate BOs to restore\n"); goto unlock; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index bd12d8ff15a4..9980c0cded94 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -484,6 +484,41 @@ int amdgpu_vm_lock_pd(struct amdgpu_vm *vm, struct drm_exec *exec, 2 + num_fences); } +/** + * amdgpu_vm_lock_done_list - lock all BOs on the done list + * @exec: drm execution context + * @num_fences: number of extra fences to reserve + * + * Lock the BOs on the done list in the DRM execution context. 
+ */ +int amdgpu_vm_lock_done_list(struct amdgpu_vm *vm, struct drm_exec *exec, + unsigned int num_fences) +{ + struct list_head *prev = &vm->done; + struct amdgpu_bo_va *bo_va; + struct amdgpu_bo *bo; + int ret; + + /* We can only trust prev->next while holding the lock */ + spin_lock(&vm->status_lock); + while (!list_is_head(prev->next, &vm->done)) { + bo_va = list_entry(prev->next, typeof(*bo_va), base.vm_status); + spin_unlock(&vm->status_lock); + + bo = bo_va->base.bo; + if (bo) { + ret = drm_exec_prepare_obj(exec, &bo->tbo.base, 1); + if (unlikely(ret)) + return ret; + } + spin_lock(&vm->status_lock); + prev = prev->next; + } + spin_unlock(&vm->status_lock); + + return 0; +} + /** * amdgpu_vm_move_to_lru_tail - move all BOs to the end of LRU * diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index e045c1590d78..3409904b5c63 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -491,6 +491,8 @@ int amdgpu_vm_make_compute(struct amdgpu_device *adev, struct amdgpu_vm *vm); void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm); int amdgpu_vm_lock_pd(struct amdgpu_vm *vm, struct drm_exec *exec, unsigned int num_fences); +int amdgpu_vm_lock_done_list(struct amdgpu_vm *vm, struct drm_exec *exec, + unsigned int num_fences); bool amdgpu_vm_ready(struct amdgpu_vm *vm); uint64_t amdgpu_vm_generation(struct amdgpu_device *adev, struct amdgpu_vm *vm); int amdgpu_vm_validate(struct amdgpu_device *adev, struct amdgpu_vm *vm, -- cgit v1.2.3 From 930595df251c24e44642651a1386f7ed53cffd10 Mon Sep 17 00:00:00 2001 From: Christian König Date: Wed, 27 Aug 2025 10:17:48 +0200 Subject: drm/amdgpu: remove check for BO reservation add assert instead MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We should leave such checks to lockdep and not implement something manually. Signed-off-by: Christian König Acked-by: Sunil Khatri Reviewed-by: Prike Liang Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 9980c0cded94..d0c95fb0ef81 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -651,18 +651,7 @@ int amdgpu_vm_validate(struct amdgpu_device *adev, struct amdgpu_vm *vm, spin_unlock(&vm->status_lock); bo = bo_base->bo; - - if (dma_resv_locking_ctx(bo->tbo.base.resv) != ticket) { - struct amdgpu_task_info *ti = amdgpu_vm_get_task_info_vm(vm); - - pr_warn_ratelimited("Evicted user BO is not reserved\n"); - if (ti) { - pr_warn_ratelimited("pid %d\n", ti->task.pid); - amdgpu_vm_put_task_info(ti); - } - - return -EINVAL; - } + dma_resv_assert_held(bo->tbo.base.resv); r = validate(param, bo); if (r) -- cgit v1.2.3 From 0aa09d8a6cab0f7d284e61eff28ac3be4d324c20 Mon Sep 17 00:00:00 2001 From: Sunil Khatri Date: Thu, 18 Sep 2025 09:33:51 +0530 Subject: drm/amdgpu: add missing comment for the new argument MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In function 'amdgpu_vm_lock_done_list' update the comment for the new argument 'vm'. 
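With this fixup applied on top of the previous patch, the complete kernel-doc header is expected to read:

	/**
	 * amdgpu_vm_lock_done_list - lock all BOs on the done list
	 * @vm: vm providing the BOs
	 * @exec: drm execution context
	 * @num_fences: number of extra fences to reserve
	 *
	 * Lock the BOs on the done list in the DRM execution context.
	 */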
Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202509180211.UAqME0zj-lkp@intel.com/ Signed-off-by: Sunil Khatri Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index d0c95fb0ef81..1ccd348bf48b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -486,6 +486,7 @@ int amdgpu_vm_lock_pd(struct amdgpu_vm *vm, struct drm_exec *exec, /** * amdgpu_vm_lock_done_list - lock all BOs on the done list + * @vm: vm providing the BOs * @exec: drm execution context * @num_fences: number of extra fences to reserve * -- cgit v1.2.3 From 59e4405e9ee2b318342d252422a82dd863b89ef4 Mon Sep 17 00:00:00 2001 From: Christian König Date: Wed, 27 Aug 2025 11:45:45 +0200 Subject: drm/amdgpu: revert to old status lock handling v3 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It turned out that protecting the status of each bo_va with a spinlock was just hiding problems instead of solving them. Revert the whole approach, add a separate stats_lock and lockdep assertions that the correct reservation lock is held all over the place. This not only allows for better checks that a state transition is properly protected by a lock, but also allows switching back to using list macros to iterate over the state lists protected by the dma_resv lock of the root PD. v2: re-add missing check v3: split into two patches Signed-off-by: Christian König Acked-by: Sunil Khatri Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 8 +- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 168 ++++++++++++++---------------- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 15 +-- drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 4 - 4 files changed, 93 insertions(+), 102 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c index 22a5827b0772..a22e6025de61 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c @@ -724,12 +724,12 @@ amdgpu_userq_bo_validate(struct amdgpu_device *adev, struct drm_exec *exec, struct amdgpu_bo *bo; int ret; - spin_lock(&vm->status_lock); + spin_lock(&vm->invalidated_lock); while (!list_empty(&vm->invalidated)) { bo_va = list_first_entry(&vm->invalidated, struct amdgpu_bo_va, base.vm_status); - spin_unlock(&vm->status_lock); + spin_unlock(&vm->invalidated_lock); bo = bo_va->base.bo; ret = drm_exec_prepare_obj(exec, &bo->tbo.base, 2); @@ -746,9 +746,9 @@ amdgpu_userq_bo_validate(struct amdgpu_device *adev, struct drm_exec *exec, if (ret) return ret; - spin_lock(&vm->status_lock); + spin_lock(&vm->invalidated_lock); } - spin_unlock(&vm->status_lock); + spin_unlock(&vm->invalidated_lock); return 0; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 1ccd348bf48b..1f8b43253eea 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -127,6 +127,17 @@ struct amdgpu_vm_tlb_seq_struct { struct dma_fence_cb cb; }; +/** + * amdgpu_vm_assert_locked - check if VM is correctly locked + * @vm: the VM which schould be tested + * + * Asserts that the VM root PD is locked.
+ */ +static void amdgpu_vm_assert_locked(struct amdgpu_vm *vm) +{ + dma_resv_assert_held(vm->root.bo->tbo.base.resv); +} + /** * amdgpu_vm_set_pasid - manage pasid and vm ptr mapping * @@ -143,6 +154,8 @@ int amdgpu_vm_set_pasid(struct amdgpu_device *adev, struct amdgpu_vm *vm, { int r; + amdgpu_vm_assert_locked(vm); + if (vm->pasid == pasid) return 0; @@ -181,12 +194,11 @@ static void amdgpu_vm_bo_evicted(struct amdgpu_vm_bo_base *vm_bo) struct amdgpu_bo *bo = vm_bo->bo; vm_bo->moved = true; - spin_lock(&vm_bo->vm->status_lock); + amdgpu_vm_assert_locked(vm); if (bo->tbo.type == ttm_bo_type_kernel) list_move(&vm_bo->vm_status, &vm->evicted); else list_move_tail(&vm_bo->vm_status, &vm->evicted); - spin_unlock(&vm_bo->vm->status_lock); } /** * amdgpu_vm_bo_moved - vm_bo is moved @@ -198,9 +210,8 @@ static void amdgpu_vm_bo_evicted(struct amdgpu_vm_bo_base *vm_bo) */ static void amdgpu_vm_bo_moved(struct amdgpu_vm_bo_base *vm_bo) { - spin_lock(&vm_bo->vm->status_lock); + amdgpu_vm_assert_locked(vm_bo->vm); list_move(&vm_bo->vm_status, &vm_bo->vm->moved); - spin_unlock(&vm_bo->vm->status_lock); } /** @@ -213,9 +224,8 @@ static void amdgpu_vm_bo_moved(struct amdgpu_vm_bo_base *vm_bo) */ static void amdgpu_vm_bo_idle(struct amdgpu_vm_bo_base *vm_bo) { - spin_lock(&vm_bo->vm->status_lock); + amdgpu_vm_assert_locked(vm_bo->vm); list_move(&vm_bo->vm_status, &vm_bo->vm->idle); - spin_unlock(&vm_bo->vm->status_lock); vm_bo->moved = false; } @@ -229,9 +239,9 @@ static void amdgpu_vm_bo_idle(struct amdgpu_vm_bo_base *vm_bo) */ static void amdgpu_vm_bo_invalidated(struct amdgpu_vm_bo_base *vm_bo) { - spin_lock(&vm_bo->vm->status_lock); + spin_lock(&vm_bo->vm->invalidated_lock); list_move(&vm_bo->vm_status, &vm_bo->vm->invalidated); - spin_unlock(&vm_bo->vm->status_lock); + spin_unlock(&vm_bo->vm->invalidated_lock); } /** @@ -244,10 +254,9 @@ static void amdgpu_vm_bo_invalidated(struct amdgpu_vm_bo_base *vm_bo) */ static void amdgpu_vm_bo_evicted_user(struct amdgpu_vm_bo_base *vm_bo) { + amdgpu_vm_assert_locked(vm_bo->vm); vm_bo->moved = true; - spin_lock(&vm_bo->vm->status_lock); list_move(&vm_bo->vm_status, &vm_bo->vm->evicted_user); - spin_unlock(&vm_bo->vm->status_lock); } /** @@ -260,13 +269,11 @@ static void amdgpu_vm_bo_evicted_user(struct amdgpu_vm_bo_base *vm_bo) */ static void amdgpu_vm_bo_relocated(struct amdgpu_vm_bo_base *vm_bo) { - if (vm_bo->bo->parent) { - spin_lock(&vm_bo->vm->status_lock); + amdgpu_vm_assert_locked(vm_bo->vm); + if (vm_bo->bo->parent) list_move(&vm_bo->vm_status, &vm_bo->vm->relocated); - spin_unlock(&vm_bo->vm->status_lock); - } else { + else amdgpu_vm_bo_idle(vm_bo); - } } /** @@ -279,9 +286,8 @@ static void amdgpu_vm_bo_relocated(struct amdgpu_vm_bo_base *vm_bo) */ static void amdgpu_vm_bo_done(struct amdgpu_vm_bo_base *vm_bo) { - spin_lock(&vm_bo->vm->status_lock); + amdgpu_vm_assert_locked(vm_bo->vm); list_move(&vm_bo->vm_status, &vm_bo->vm->done); - spin_unlock(&vm_bo->vm->status_lock); } /** @@ -295,10 +301,13 @@ static void amdgpu_vm_bo_reset_state_machine(struct amdgpu_vm *vm) { struct amdgpu_vm_bo_base *vm_bo, *tmp; - spin_lock(&vm->status_lock); + spin_lock(&vm->invalidated_lock); list_splice_init(&vm->done, &vm->invalidated); list_for_each_entry(vm_bo, &vm->invalidated, vm_status) vm_bo->moved = true; + spin_unlock(&vm->invalidated_lock); + + amdgpu_vm_assert_locked(vm_bo->vm); list_for_each_entry_safe(vm_bo, tmp, &vm->idle, vm_status) { struct amdgpu_bo *bo = vm_bo->bo; @@ -308,14 +317,13 @@ static void amdgpu_vm_bo_reset_state_machine(struct amdgpu_vm 
*vm) else if (bo->parent) list_move(&vm_bo->vm_status, &vm_bo->vm->relocated); } - spin_unlock(&vm->status_lock); } /** * amdgpu_vm_update_shared - helper to update shared memory stat * @base: base structure for tracking BO usage in a VM * - * Takes the vm status_lock and updates the shared memory stat. If the basic + * Takes the vm stats_lock and updates the shared memory stat. If the basic * stat changed (e.g. buffer was moved) amdgpu_vm_update_stats need to be called * as well. */ @@ -327,7 +335,8 @@ static void amdgpu_vm_update_shared(struct amdgpu_vm_bo_base *base) uint32_t bo_memtype = amdgpu_bo_mem_stats_placement(bo); bool shared; - spin_lock(&vm->status_lock); + dma_resv_assert_held(bo->tbo.base.resv); + spin_lock(&vm->stats_lock); shared = drm_gem_object_is_shared_for_memory_stats(&bo->tbo.base); if (base->shared != shared) { base->shared = shared; @@ -339,7 +348,7 @@ static void amdgpu_vm_update_shared(struct amdgpu_vm_bo_base *base) vm->stats[bo_memtype].drm.private += size; } } - spin_unlock(&vm->status_lock); + spin_unlock(&vm->stats_lock); } /** @@ -364,11 +373,11 @@ void amdgpu_vm_bo_update_shared(struct amdgpu_bo *bo) * be bo->tbo.resource * @sign: if we should add (+1) or subtract (-1) from the stat * - * Caller need to have the vm status_lock held. Useful for when multiple update + * Caller need to have the vm stats_lock held. Useful for when multiple update * need to happen at the same time. */ static void amdgpu_vm_update_stats_locked(struct amdgpu_vm_bo_base *base, - struct ttm_resource *res, int sign) + struct ttm_resource *res, int sign) { struct amdgpu_vm *vm = base->vm; struct amdgpu_bo *bo = base->bo; @@ -392,7 +401,8 @@ static void amdgpu_vm_update_stats_locked(struct amdgpu_vm_bo_base *base, */ if (bo->flags & AMDGPU_GEM_CREATE_DISCARDABLE) vm->stats[res_memtype].drm.purgeable += size; - if (!(bo->preferred_domains & amdgpu_mem_type_to_domain(res_memtype))) + if (!(bo->preferred_domains & + amdgpu_mem_type_to_domain(res_memtype))) vm->stats[bo_memtype].evicted += size; } } @@ -411,9 +421,9 @@ void amdgpu_vm_update_stats(struct amdgpu_vm_bo_base *base, { struct amdgpu_vm *vm = base->vm; - spin_lock(&vm->status_lock); + spin_lock(&vm->stats_lock); amdgpu_vm_update_stats_locked(base, res, sign); - spin_unlock(&vm->status_lock); + spin_unlock(&vm->stats_lock); } /** @@ -439,10 +449,10 @@ void amdgpu_vm_bo_base_init(struct amdgpu_vm_bo_base *base, base->next = bo->vm_bo; bo->vm_bo = base; - spin_lock(&vm->status_lock); + spin_lock(&vm->stats_lock); base->shared = drm_gem_object_is_shared_for_memory_stats(&bo->tbo.base); amdgpu_vm_update_stats_locked(base, bo->tbo.resource, +1); - spin_unlock(&vm->status_lock); + spin_unlock(&vm->stats_lock); if (!amdgpu_vm_is_bo_always_valid(vm, bo)) return; @@ -501,10 +511,10 @@ int amdgpu_vm_lock_done_list(struct amdgpu_vm *vm, struct drm_exec *exec, int ret; /* We can only trust prev->next while holding the lock */ - spin_lock(&vm->status_lock); + spin_lock(&vm->invalidated_lock); while (!list_is_head(prev->next, &vm->done)) { bo_va = list_entry(prev->next, typeof(*bo_va), base.vm_status); - spin_unlock(&vm->status_lock); + spin_unlock(&vm->invalidated_lock); bo = bo_va->base.bo; if (bo) { @@ -512,10 +522,10 @@ int amdgpu_vm_lock_done_list(struct amdgpu_vm *vm, struct drm_exec *exec, if (unlikely(ret)) return ret; } - spin_lock(&vm->status_lock); + spin_lock(&vm->invalidated_lock); prev = prev->next; } - spin_unlock(&vm->status_lock); + spin_unlock(&vm->invalidated_lock); return 0; } @@ -611,7 +621,7 @@ int 
amdgpu_vm_validate(struct amdgpu_device *adev, struct amdgpu_vm *vm, void *param) { uint64_t new_vm_generation = amdgpu_vm_generation(adev, vm); - struct amdgpu_vm_bo_base *bo_base; + struct amdgpu_vm_bo_base *bo_base, *tmp; struct amdgpu_bo *bo; int r; @@ -624,13 +634,7 @@ int amdgpu_vm_validate(struct amdgpu_device *adev, struct amdgpu_vm *vm, return r; } - spin_lock(&vm->status_lock); - while (!list_empty(&vm->evicted)) { - bo_base = list_first_entry(&vm->evicted, - struct amdgpu_vm_bo_base, - vm_status); - spin_unlock(&vm->status_lock); - + list_for_each_entry_safe(bo_base, tmp, &vm->evicted, vm_status) { bo = bo_base->bo; r = validate(param, bo); @@ -643,26 +647,21 @@ int amdgpu_vm_validate(struct amdgpu_device *adev, struct amdgpu_vm *vm, vm->update_funcs->map_table(to_amdgpu_bo_vm(bo)); amdgpu_vm_bo_relocated(bo_base); } - spin_lock(&vm->status_lock); } - while (ticket && !list_empty(&vm->evicted_user)) { - bo_base = list_first_entry(&vm->evicted_user, - struct amdgpu_vm_bo_base, - vm_status); - spin_unlock(&vm->status_lock); - bo = bo_base->bo; - dma_resv_assert_held(bo->tbo.base.resv); - - r = validate(param, bo); - if (r) - return r; + if (ticket) { + list_for_each_entry_safe(bo_base, tmp, &vm->evicted_user, + vm_status) { + bo = bo_base->bo; + dma_resv_assert_held(bo->tbo.base.resv); - amdgpu_vm_bo_invalidated(bo_base); + r = validate(param, bo); + if (r) + return r; - spin_lock(&vm->status_lock); + amdgpu_vm_bo_invalidated(bo_base); + } } - spin_unlock(&vm->status_lock); amdgpu_vm_eviction_lock(vm); vm->evicting = false; @@ -685,13 +684,13 @@ bool amdgpu_vm_ready(struct amdgpu_vm *vm) { bool ret; + amdgpu_vm_assert_locked(vm); + amdgpu_vm_eviction_lock(vm); ret = !vm->evicting; amdgpu_vm_eviction_unlock(vm); - spin_lock(&vm->status_lock); ret &= list_empty(&vm->evicted); - spin_unlock(&vm->status_lock); spin_lock(&vm->immediate.lock); ret &= !vm->immediate.stopped; @@ -982,16 +981,13 @@ int amdgpu_vm_update_pdes(struct amdgpu_device *adev, struct amdgpu_vm *vm, bool immediate) { struct amdgpu_vm_update_params params; - struct amdgpu_vm_bo_base *entry; + struct amdgpu_vm_bo_base *entry, *tmp; bool flush_tlb_needed = false; - LIST_HEAD(relocated); int r, idx; - spin_lock(&vm->status_lock); - list_splice_init(&vm->relocated, &relocated); - spin_unlock(&vm->status_lock); + amdgpu_vm_assert_locked(vm); - if (list_empty(&relocated)) + if (list_empty(&vm->relocated)) return 0; if (!drm_dev_enter(adev_to_drm(adev), &idx)) @@ -1006,7 +1002,7 @@ int amdgpu_vm_update_pdes(struct amdgpu_device *adev, if (r) goto error; - list_for_each_entry(entry, &relocated, vm_status) { + list_for_each_entry(entry, &vm->relocated, vm_status) { /* vm_flush_needed after updating moved PDEs */ flush_tlb_needed |= entry->moved; @@ -1022,9 +1018,7 @@ int amdgpu_vm_update_pdes(struct amdgpu_device *adev, if (flush_tlb_needed) atomic64_inc(&vm->tlb_seq); - while (!list_empty(&relocated)) { - entry = list_first_entry(&relocated, struct amdgpu_vm_bo_base, - vm_status); + list_for_each_entry_safe(entry, tmp, &vm->relocated, vm_status) { amdgpu_vm_bo_idle(entry); } @@ -1250,9 +1244,9 @@ error_free: void amdgpu_vm_get_memory(struct amdgpu_vm *vm, struct amdgpu_mem_stats stats[__AMDGPU_PL_NUM]) { - spin_lock(&vm->status_lock); + spin_lock(&vm->stats_lock); memcpy(stats, vm->stats, sizeof(*stats) * __AMDGPU_PL_NUM); - spin_unlock(&vm->status_lock); + spin_unlock(&vm->stats_lock); } /** @@ -1619,29 +1613,24 @@ int amdgpu_vm_handle_moved(struct amdgpu_device *adev, struct amdgpu_vm *vm, struct ww_acquire_ctx *ticket) { 
- struct amdgpu_bo_va *bo_va; + struct amdgpu_bo_va *bo_va, *tmp; struct dma_resv *resv; bool clear, unlock; int r; - spin_lock(&vm->status_lock); - while (!list_empty(&vm->moved)) { - bo_va = list_first_entry(&vm->moved, struct amdgpu_bo_va, - base.vm_status); - spin_unlock(&vm->status_lock); - + list_for_each_entry_safe(bo_va, tmp, &vm->moved, base.vm_status) { /* Per VM BOs never need to bo cleared in the page tables */ r = amdgpu_vm_bo_update(adev, bo_va, false); if (r) return r; - spin_lock(&vm->status_lock); } + spin_lock(&vm->invalidated_lock); while (!list_empty(&vm->invalidated)) { bo_va = list_first_entry(&vm->invalidated, struct amdgpu_bo_va, base.vm_status); resv = bo_va->base.bo->tbo.base.resv; - spin_unlock(&vm->status_lock); + spin_unlock(&vm->invalidated_lock); /* Try to reserve the BO to avoid clearing its ptes */ if (!adev->debug_vm && dma_resv_trylock(resv)) { @@ -1673,9 +1662,9 @@ int amdgpu_vm_handle_moved(struct amdgpu_device *adev, bo_va->base.bo->tbo.resource->mem_type == TTM_PL_SYSTEM)) amdgpu_vm_bo_evicted_user(&bo_va->base); - spin_lock(&vm->status_lock); + spin_lock(&vm->invalidated_lock); } - spin_unlock(&vm->status_lock); + spin_unlock(&vm->invalidated_lock); return 0; } @@ -2204,9 +2193,9 @@ void amdgpu_vm_bo_del(struct amdgpu_device *adev, } } - spin_lock(&vm->status_lock); + spin_lock(&vm->invalidated_lock); list_del(&bo_va->base.vm_status); - spin_unlock(&vm->status_lock); + spin_unlock(&vm->invalidated_lock); list_for_each_entry_safe(mapping, next, &bo_va->valids, list) { list_del(&mapping->list); @@ -2314,10 +2303,10 @@ void amdgpu_vm_bo_move(struct amdgpu_bo *bo, struct ttm_resource *new_mem, for (bo_base = bo->vm_bo; bo_base; bo_base = bo_base->next) { struct amdgpu_vm *vm = bo_base->vm; - spin_lock(&vm->status_lock); + spin_lock(&vm->stats_lock); amdgpu_vm_update_stats_locked(bo_base, bo->tbo.resource, -1); amdgpu_vm_update_stats_locked(bo_base, new_mem, +1); - spin_unlock(&vm->status_lock); + spin_unlock(&vm->stats_lock); } amdgpu_vm_bo_invalidate(bo, evicted); @@ -2584,11 +2573,12 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm, INIT_LIST_HEAD(&vm->relocated); INIT_LIST_HEAD(&vm->moved); INIT_LIST_HEAD(&vm->idle); + spin_lock_init(&vm->invalidated_lock); INIT_LIST_HEAD(&vm->invalidated); - spin_lock_init(&vm->status_lock); INIT_LIST_HEAD(&vm->freed); INIT_LIST_HEAD(&vm->done); INIT_KFIFO(vm->faults); + spin_lock_init(&vm->stats_lock); r = amdgpu_vm_init_entities(adev, vm); if (r) @@ -3053,7 +3043,8 @@ void amdgpu_debugfs_vm_bo_info(struct amdgpu_vm *vm, struct seq_file *m) unsigned int total_done_objs = 0; unsigned int id = 0; - spin_lock(&vm->status_lock); + amdgpu_vm_assert_locked(vm); + seq_puts(m, "\tIdle BOs:\n"); list_for_each_entry_safe(bo_va, tmp, &vm->idle, base.vm_status) { if (!bo_va->base.bo) @@ -3091,11 +3082,13 @@ void amdgpu_debugfs_vm_bo_info(struct amdgpu_vm *vm, struct seq_file *m) id = 0; seq_puts(m, "\tInvalidated BOs:\n"); + spin_lock(&vm->invalidated_lock); list_for_each_entry_safe(bo_va, tmp, &vm->invalidated, base.vm_status) { if (!bo_va->base.bo) continue; total_invalidated += amdgpu_bo_print_info(id++, bo_va->base.bo, m); } + spin_unlock(&vm->invalidated_lock); total_invalidated_objs = id; id = 0; @@ -3105,7 +3098,6 @@ void amdgpu_debugfs_vm_bo_info(struct amdgpu_vm *vm, struct seq_file *m) continue; total_done += amdgpu_bo_print_info(id++, bo_va->base.bo, m); } - spin_unlock(&vm->status_lock); total_done_objs = id; seq_printf(m, "\tTotal idle size: %12lld\tobjs:\t%d\n", total_idle, diff --git 
a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index 74e61e45778e..829b400cb8c0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -203,11 +203,11 @@ struct amdgpu_vm_bo_base { /* protected by bo being reserved */ struct amdgpu_vm_bo_base *next; - /* protected by vm status_lock */ + /* protected by vm reservation and invalidated_lock */ struct list_head vm_status; /* if the bo is counted as shared in mem stats - * protected by vm status_lock */ + * protected by vm BO being reserved */ bool shared; /* protected by the BO being reserved */ @@ -343,10 +343,8 @@ struct amdgpu_vm { bool evicting; unsigned int saved_flags; - /* Lock to protect vm_bo add/del/move on all lists of vm */ - spinlock_t status_lock; - - /* Memory statistics for this vm, protected by status_lock */ + /* Memory statistics for this vm, protected by stats_lock */ + spinlock_t stats_lock; struct amdgpu_mem_stats stats[__AMDGPU_PL_NUM]; /* @@ -354,6 +352,8 @@ struct amdgpu_vm { * PDs, PTs or per VM BOs. The state transits are: * * evicted -> relocated (PDs, PTs) or moved (per VM BOs) -> idle + * + * Lists are protected by the root PD dma_resv lock. */ /* Per-VM and PT BOs who needs a validation */ struct list_head evicted; @@ -374,7 +374,10 @@ struct amdgpu_vm { * state transits are: * * evicted_user or invalidated -> done + * + * Lists are protected by the invalidated_lock. */ + spinlock_t invalidated_lock; /* BOs for user mode queues that need a validation */ struct list_head evicted_user; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c index 30022123b0bf..f57c48b74274 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c @@ -541,9 +541,7 @@ static void amdgpu_vm_pt_free(struct amdgpu_vm_bo_base *entry) entry->bo->vm_bo = NULL; ttm_bo_set_bulk_move(&entry->bo->tbo, NULL); - spin_lock(&entry->vm->status_lock); list_del(&entry->vm_status); - spin_unlock(&entry->vm->status_lock); amdgpu_bo_unref(&entry->bo); } @@ -587,7 +585,6 @@ static void amdgpu_vm_pt_add_list(struct amdgpu_vm_update_params *params, struct amdgpu_vm_pt_cursor seek; struct amdgpu_vm_bo_base *entry; - spin_lock(&params->vm->status_lock); for_each_amdgpu_vm_pt_dfs_safe(params->adev, params->vm, cursor, seek, entry) { if (entry && entry->bo) list_move(&entry->vm_status, &params->tlb_flush_waitlist); @@ -595,7 +592,6 @@ static void amdgpu_vm_pt_add_list(struct amdgpu_vm_update_params *params, /* enter start node now */ list_move(&cursor->entry->vm_status, &params->tlb_flush_waitlist); - spin_unlock(&params->vm->status_lock); } /** -- cgit v1.2.3 From 90e09ea4cfd4aaaf07ababa6d8c880035587e7e9 Mon Sep 17 00:00:00 2001 From: Christian König Date: Fri, 19 Sep 2025 09:27:03 +0200 Subject: drm/amdgpu: revert "rework reserved VMID handling" v2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit e44a0fe630c58b0a87d8281f5c1077a3479e5fce. Initially we used VMID reservation to enforce isolation between processes. That has now been replaced by proper fence handling. OpenGL, RADV and ROCm developers all requested a way to reserve a VMID for SPM, so restore that approach by reverting to only allowing a single process to use the reserved VMID. Only compile tested for now.
v2: use -ENOENT instead of -EINVAL if VMID is not available Signed-off-by: Christian König Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c | 61 +++++++++++++++++++++------------ drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h | 11 +++--- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 17 +++------ drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 2 +- 4 files changed, 50 insertions(+), 41 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c index cbdf108612d2..3ef5bc95642c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c @@ -275,13 +275,12 @@ static int amdgpu_vmid_grab_reserved(struct amdgpu_vm *vm, { struct amdgpu_device *adev = ring->adev; unsigned vmhub = ring->vm_hub; - struct amdgpu_vmid_mgr *id_mgr = &adev->vm_manager.id_mgr[vmhub]; uint64_t fence_context = adev->fence_context + ring->idx; bool needs_flush = vm->use_cpu_for_update; uint64_t updates = amdgpu_vm_tlb_seq(vm); int r; - *id = id_mgr->reserved; + *id = vm->reserved_vmid[vmhub]; if ((*id)->owner != vm->immediate.fence_context || !amdgpu_vmid_compatible(*id, job) || (*id)->flushed_updates < updates || @@ -474,40 +473,61 @@ bool amdgpu_vmid_uses_reserved(struct amdgpu_vm *vm, unsigned int vmhub) return vm->reserved_vmid[vmhub]; } -int amdgpu_vmid_alloc_reserved(struct amdgpu_device *adev, +/* + * amdgpu_vmid_alloc_reserved - reserve a specific VMID for this vm + * @adev: amdgpu device structure + * @vm: the VM to reserve an ID for + * @vmhub: the VMHUB which should be used + * + * Mostly used to have a reserved VMID for debugging and SPM. + * + * Returns: 0 for success, -ENOENT if an ID is already reserved. 
+ */ +int amdgpu_vmid_alloc_reserved(struct amdgpu_device *adev, struct amdgpu_vm *vm, unsigned vmhub) { struct amdgpu_vmid_mgr *id_mgr = &adev->vm_manager.id_mgr[vmhub]; + struct amdgpu_vmid *id; + int r = 0; mutex_lock(&id_mgr->lock); - - ++id_mgr->reserved_use_count; - if (!id_mgr->reserved) { - struct amdgpu_vmid *id; - - id = list_first_entry(&id_mgr->ids_lru, struct amdgpu_vmid, - list); - /* Remove from normal round robin handling */ - list_del_init(&id->list); - id_mgr->reserved = id; + if (vm->reserved_vmid[vmhub]) + goto unlock; + if (id_mgr->reserved_vmid) { + r = -ENOENT; + goto unlock; } - + /* Remove from normal round robin handling */ + id = list_first_entry(&id_mgr->ids_lru, struct amdgpu_vmid, list); + list_del_init(&id->list); + vm->reserved_vmid[vmhub] = id; + id_mgr->reserved_vmid = true; mutex_unlock(&id_mgr->lock); + return 0; +unlock: + mutex_unlock(&id_mgr->lock); + return r; } -void amdgpu_vmid_free_reserved(struct amdgpu_device *adev, +/* + * amdgpu_vmid_free_reserved - free up a reserved VMID again + * @adev: amdgpu device structure + * @vm: the VM with the reserved ID + * @vmhub: the VMHUB which should be used + */ +void amdgpu_vmid_free_reserved(struct amdgpu_device *adev, struct amdgpu_vm *vm, unsigned vmhub) { struct amdgpu_vmid_mgr *id_mgr = &adev->vm_manager.id_mgr[vmhub]; mutex_lock(&id_mgr->lock); - if (!--id_mgr->reserved_use_count) { - /* give the reserved ID back to normal round robin */ - list_add(&id_mgr->reserved->list, &id_mgr->ids_lru); - id_mgr->reserved = NULL; + if (vm->reserved_vmid[vmhub]) { + list_add(&vm->reserved_vmid[vmhub]->list, + &id_mgr->ids_lru); + vm->reserved_vmid[vmhub] = NULL; + id_mgr->reserved_vmid = false; } - mutex_unlock(&id_mgr->lock); } @@ -574,7 +594,6 @@ void amdgpu_vmid_mgr_init(struct amdgpu_device *adev) mutex_init(&id_mgr->lock); INIT_LIST_HEAD(&id_mgr->ids_lru); - id_mgr->reserved_use_count = 0; /* for GC <10, SDMA uses MMHUB so use first_kfd_vmid for both GC and MM */ if (amdgpu_ip_version(adev, GC_HWIP, 0) < IP_VERSION(10, 0, 0)) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h index 240fa6751260..b3649cd3af56 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.h @@ -67,8 +67,7 @@ struct amdgpu_vmid_mgr { unsigned num_ids; struct list_head ids_lru; struct amdgpu_vmid ids[AMDGPU_NUM_VMID]; - struct amdgpu_vmid *reserved; - unsigned int reserved_use_count; + bool reserved_vmid; }; int amdgpu_pasid_alloc(unsigned int bits); @@ -79,10 +78,10 @@ void amdgpu_pasid_free_delayed(struct dma_resv *resv, bool amdgpu_vmid_had_gpu_reset(struct amdgpu_device *adev, struct amdgpu_vmid *id); bool amdgpu_vmid_uses_reserved(struct amdgpu_vm *vm, unsigned int vmhub); -int amdgpu_vmid_alloc_reserved(struct amdgpu_device *adev, - unsigned vmhub); -void amdgpu_vmid_free_reserved(struct amdgpu_device *adev, - unsigned vmhub); +int amdgpu_vmid_alloc_reserved(struct amdgpu_device *adev, struct amdgpu_vm *vm, + unsigned vmhub); +void amdgpu_vmid_free_reserved(struct amdgpu_device *adev, struct amdgpu_vm *vm, + unsigned vmhub); int amdgpu_vmid_grab(struct amdgpu_vm *vm, struct amdgpu_ring *ring, struct amdgpu_job *job, struct dma_fence **fence); void amdgpu_vmid_reset(struct amdgpu_device *adev, unsigned vmhub, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 112ce584a5ad..8c28e8923f02 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -2790,10 +2790,7 
@@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm) dma_fence_put(vm->last_update); for (i = 0; i < AMDGPU_MAX_VMHUBS; i++) { - if (vm->reserved_vmid[i]) { - amdgpu_vmid_free_reserved(adev, i); - vm->reserved_vmid[i] = false; - } + amdgpu_vmid_free_reserved(adev, vm, i); } ttm_lru_bulk_move_fini(&adev->mman.bdev, &vm->lru_bulk_move); @@ -2889,6 +2886,7 @@ int amdgpu_vm_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) union drm_amdgpu_vm *args = data; struct amdgpu_device *adev = drm_to_adev(dev); struct amdgpu_fpriv *fpriv = filp->driver_priv; + struct amdgpu_vm *vm = &fpriv->vm; /* No valid flags defined yet */ if (args->in.flags) @@ -2897,17 +2895,10 @@ int amdgpu_vm_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) switch (args->in.op) { case AMDGPU_VM_OP_RESERVE_VMID: /* We only have requirement to reserve vmid from gfxhub */ - if (!fpriv->vm.reserved_vmid[AMDGPU_GFXHUB(0)]) { - amdgpu_vmid_alloc_reserved(adev, AMDGPU_GFXHUB(0)); - fpriv->vm.reserved_vmid[AMDGPU_GFXHUB(0)] = true; - } - + amdgpu_vmid_alloc_reserved(adev, vm, AMDGPU_GFXHUB(0)); break; case AMDGPU_VM_OP_UNRESERVE_VMID: - if (fpriv->vm.reserved_vmid[AMDGPU_GFXHUB(0)]) { - amdgpu_vmid_free_reserved(adev, AMDGPU_GFXHUB(0)); - fpriv->vm.reserved_vmid[AMDGPU_GFXHUB(0)] = false; - } + amdgpu_vmid_free_reserved(adev, vm, AMDGPU_GFXHUB(0)); break; default: return -EINVAL; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index 988e970d9e96..adc5c9161fa8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -415,7 +415,7 @@ struct amdgpu_vm { struct dma_fence *last_unlocked; unsigned int pasid; - bool reserved_vmid[AMDGPU_MAX_VMHUBS]; + struct amdgpu_vmid *reserved_vmid[AMDGPU_MAX_VMHUBS]; /* Flag to indicate if VM tables are updated by CPU or GPU (SDMA) */ bool use_cpu_for_update; -- cgit v1.2.3 From b809ca91a5b7eccba065460200512f83cc87987a Mon Sep 17 00:00:00 2001 From: "Jesse.Zhang" Date: Thu, 25 Sep 2025 18:02:18 +0800 Subject: drm/amdgpu: Merge amdgpu_vm_set_pasid into amdgpu_vm_init MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As KFD no longer uses a separate PASID, the global amdgpu_vm_set_pasid() function is no longer necessary. Merge its functionality directly into amdgpu_vm_init() to simplify code flow and eliminate redundant locking.
v2: remove superfluous check, adjust amdgpu_vm_fini and remove amdgpu_vm_set_pasid (Christian) v3: drop amdgpu_vm_assert_locked (Christian) Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/4614 Fixes: 59e4405e9ee2 ("drm/amdgpu: revert to old status lock handling v3") Reviewed-by: Christian König Suggested-by: Christian König Signed-off-by: Jesse Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 10 +---- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 65 +++++++++++---------------------- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 5 +-- 3 files changed, 24 insertions(+), 56 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c index 8676400834fc..a9327472c651 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c @@ -1421,14 +1421,10 @@ int amdgpu_driver_open_kms(struct drm_device *dev, struct drm_file *file_priv) amdgpu_debugfs_vm_init(file_priv); - r = amdgpu_vm_init(adev, &fpriv->vm, fpriv->xcp_id); + r = amdgpu_vm_init(adev, &fpriv->vm, fpriv->xcp_id, pasid); if (r) goto error_pasid; - r = amdgpu_vm_set_pasid(adev, &fpriv->vm, pasid); - if (r) - goto error_vm; - fpriv->prt_va = amdgpu_vm_bo_add(adev, &fpriv->vm, NULL); if (!fpriv->prt_va) { r = -ENOMEM; @@ -1468,10 +1464,8 @@ error_vm: amdgpu_vm_fini(adev, &fpriv->vm); error_pasid: - if (pasid) { + if (pasid) amdgpu_pasid_free(pasid); - amdgpu_vm_set_pasid(adev, &fpriv->vm, 0); - } kfree(fpriv); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 8c28e8923f02..643d2be9065b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -138,48 +138,6 @@ static void amdgpu_vm_assert_locked(struct amdgpu_vm *vm) dma_resv_assert_held(vm->root.bo->tbo.base.resv); } -/** - * amdgpu_vm_set_pasid - manage pasid and vm ptr mapping - * - * @adev: amdgpu_device pointer - * @vm: amdgpu_vm pointer - * @pasid: the pasid the VM is using on this GPU - * - * Set the pasid this VM is using on this GPU, can also be used to remove the - * pasid by passing in zero. - * - */ -int amdgpu_vm_set_pasid(struct amdgpu_device *adev, struct amdgpu_vm *vm, - u32 pasid) -{ - int r; - - amdgpu_vm_assert_locked(vm); - - if (vm->pasid == pasid) - return 0; - - if (vm->pasid) { - r = xa_err(xa_erase_irq(&adev->vm_manager.pasids, vm->pasid)); - if (r < 0) - return r; - - vm->pasid = 0; - } - - if (pasid) { - r = xa_err(xa_store_irq(&adev->vm_manager.pasids, pasid, vm, - GFP_KERNEL)); - if (r < 0) - return r; - - vm->pasid = pasid; - } - - - return 0; -} - /** * amdgpu_vm_bo_evicted - vm_bo is evicted * @@ -2554,6 +2512,7 @@ void amdgpu_vm_set_task_info(struct amdgpu_vm *vm) * @adev: amdgpu_device pointer * @vm: requested vm * @xcp_id: GPU partition selection id + * @pasid: the pasid the VM is using on this GPU * * Init @vm fields. * * Returns: * 0 for success, error for failure.
*/ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm, - int32_t xcp_id) + int32_t xcp_id, uint32_t pasid) { struct amdgpu_bo *root_bo; struct amdgpu_bo_vm *root; @@ -2638,12 +2597,26 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm, if (r) dev_dbg(adev->dev, "Failed to create task info for VM\n"); + /* Store new PASID in XArray (if non-zero) */ + if (pasid != 0) { + r = xa_err(xa_store_irq(&adev->vm_manager.pasids, pasid, vm, GFP_KERNEL)); + if (r < 0) + goto error_free_root; + + vm->pasid = pasid; + } + amdgpu_bo_unreserve(vm->root.bo); amdgpu_bo_unref(&root_bo); return 0; error_free_root: + /* If PASID was partially set, erase it from XArray before failing */ + if (vm->pasid != 0) { + xa_erase_irq(&adev->vm_manager.pasids, vm->pasid); + vm->pasid = 0; + } amdgpu_vm_pt_free_root(adev, vm); amdgpu_bo_unreserve(vm->root.bo); amdgpu_bo_unref(&root_bo); @@ -2749,7 +2722,11 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm) root = amdgpu_bo_ref(vm->root.bo); amdgpu_bo_reserve(root, true); - amdgpu_vm_set_pasid(adev, vm, 0); + /* Remove PASID mapping before destroying VM */ + if (vm->pasid != 0) { + xa_erase_irq(&adev->vm_manager.pasids, vm->pasid); + vm->pasid = 0; + } dma_fence_wait(vm->last_unlocked, false); dma_fence_put(vm->last_unlocked); dma_fence_wait(vm->last_tlb_flush, false); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index adc5c9161fa8..f15cef8fa58a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -503,11 +503,8 @@ extern const struct amdgpu_vm_update_funcs amdgpu_vm_sdma_funcs; void amdgpu_vm_manager_init(struct amdgpu_device *adev); void amdgpu_vm_manager_fini(struct amdgpu_device *adev); -int amdgpu_vm_set_pasid(struct amdgpu_device *adev, struct amdgpu_vm *vm, - u32 pasid); - long amdgpu_vm_wait_idle(struct amdgpu_vm *vm, long timeout); -int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm, int32_t xcp_id); +int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm, int32_t xcp_id, uint32_t pasid); int amdgpu_vm_make_compute(struct amdgpu_device *adev, struct amdgpu_vm *vm); void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm); int amdgpu_vm_lock_pd(struct amdgpu_vm *vm, struct drm_exec *exec, -- cgit v1.2.3 From 8d557eab3a396c5c0068d7b4ad920f2bf261e484 Mon Sep 17 00:00:00 2001 From: "Jesse.Zhang" Date: Mon, 29 Sep 2025 16:44:28 +0800 Subject: drm/amdgpu: Fix general protection fault in amdgpu_vm_bo_reset_state_machine MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After GPU reset with VRAM loss, a general protection fault occurs during user queue restoration when accessing vm_bo->vm after spinlock release in amdgpu_vm_bo_reset_state_machine. The root cause is that vm_bo points to the last entry from the list_for_each_entry loop, but this becomes invalid after the spinlock is released. Accessing vm_bo->vm at this point leads to memory corruption. Crash log shows: [ 326.981811] Oops: general protection fault, probably for non-canonical address 0x4156415741e58ac8: 0000 [#1] SMP NOPTI [ 326.981820] CPU: 13 UID: 0 PID: 1035 Comm: kworker/13:3 Tainted: G E 6.16.0+ #25 PREEMPT(voluntary) [ 326.981826] Tainted: [E]=UNSIGNED_MODULE [ 326.981827] Hardware name: Gigabyte Technology Co., Ltd. 
X870E AORUS PRO ICE/X870E AORUS PRO ICE, BIOS F3i 12/19/2024 [ 326.981831] Workqueue: events amdgpu_userq_restore_worker [amdgpu] [ 326.981999] RIP: 0010:amdgpu_vm_assert_locked+0x16/0x70 [amdgpu] [ 326.982094] Code: 00 00 00 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 0f 1f 44 00 00 48 85 ff 74 45 48 8b 87 80 03 00 00 48 85 c0 74 40 <48> 8b b8 80 01 00 00 48 85 ff 74 3b 8b 05 0c b7 0e f0 85 c0 75 05 [ 326.982098] RSP: 0018:ffffaa91c2a6bc20 EFLAGS: 00010206 [ 326.982100] RAX: 4156415741e58948 RBX: ffff9e8f013e8330 RCX: 0000000000000000 [ 326.982102] RDX: 0000000000000005 RSI: 000000001d254e88 RDI: ffffffffc144814a [ 326.982104] RBP: ffffaa91c2a6bc68 R08: 0000004c21a25674 R09: 0000000000000001 [ 326.982106] R10: 0000000000000001 R11: dccaf3f2f82863fc R12: ffff9e8f013e8000 [ 326.982108] R13: ffff9e8f013e8000 R14: 0000000000000000 R15: ffff9e8f09980000 [ 326.982110] FS: 0000000000000000(0000) GS:ffff9e9e79995000(0000) knlGS:0000000000000000 [ 326.982112] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 326.982114] CR2: 000055ed6c9caa80 CR3: 0000000797060000 CR4: 0000000000750ef0 [ 326.982116] PKRU: 55555554 Reviewed-by: Christian König Signed-off-by: Jesse Zhang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 643d2be9065b..2823208d8624 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -265,7 +265,7 @@ static void amdgpu_vm_bo_reset_state_machine(struct amdgpu_vm *vm) vm_bo->moved = true; spin_unlock(&vm->invalidated_lock); - amdgpu_vm_assert_locked(vm_bo->vm); + amdgpu_vm_assert_locked(vm); list_for_each_entry_safe(vm_bo, tmp, &vm->idle, vm_status) { struct amdgpu_bo *bo = vm_bo->bo; -- cgit v1.2.3 From a107aeb6a2150dd552673caefc771e2222d584de Mon Sep 17 00:00:00 2001 From: Christian König Date: Mon, 6 Oct 2025 12:45:25 +0200 Subject: drm/amdgpu: partially revert "revert to old status lock handling v3" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The CI systems are pointing out list corruptions, so we still need to fix something here. Keep the asserts, but revert the lock changes for now. 
Fixes: 59e4405e9ee2 ("drm/amdgpu: revert to old status lock handling v3") Signed-off-by: Christian König Acked-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 8 +- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 146 +++++++++++++++++++----------- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 15 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 4 + 4 files changed, 105 insertions(+), 68 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c index 48e0932f5b62..1add21160d21 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c @@ -726,12 +726,12 @@ amdgpu_userq_bo_validate(struct amdgpu_device *adev, struct drm_exec *exec, struct amdgpu_bo *bo; int ret; - spin_lock(&vm->invalidated_lock); + spin_lock(&vm->status_lock); while (!list_empty(&vm->invalidated)) { bo_va = list_first_entry(&vm->invalidated, struct amdgpu_bo_va, base.vm_status); - spin_unlock(&vm->invalidated_lock); + spin_unlock(&vm->status_lock); bo = bo_va->base.bo; ret = drm_exec_prepare_obj(exec, &bo->tbo.base, 2); @@ -748,9 +748,9 @@ amdgpu_userq_bo_validate(struct amdgpu_device *adev, struct drm_exec *exec, if (ret) return ret; - spin_lock(&vm->invalidated_lock); + spin_lock(&vm->status_lock); } - spin_unlock(&vm->invalidated_lock); + spin_unlock(&vm->status_lock); return 0; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 2823208d8624..c1a801203949 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -153,10 +153,12 @@ static void amdgpu_vm_bo_evicted(struct amdgpu_vm_bo_base *vm_bo) vm_bo->moved = true; amdgpu_vm_assert_locked(vm); + spin_lock(&vm_bo->vm->status_lock); if (bo->tbo.type == ttm_bo_type_kernel) list_move(&vm_bo->vm_status, &vm->evicted); else list_move_tail(&vm_bo->vm_status, &vm->evicted); + spin_unlock(&vm_bo->vm->status_lock); } /** * amdgpu_vm_bo_moved - vm_bo is moved @@ -169,7 +171,9 @@ static void amdgpu_vm_bo_evicted(struct amdgpu_vm_bo_base *vm_bo) static void amdgpu_vm_bo_moved(struct amdgpu_vm_bo_base *vm_bo) { amdgpu_vm_assert_locked(vm_bo->vm); + spin_lock(&vm_bo->vm->status_lock); list_move(&vm_bo->vm_status, &vm_bo->vm->moved); + spin_unlock(&vm_bo->vm->status_lock); } /** @@ -183,7 +187,9 @@ static void amdgpu_vm_bo_moved(struct amdgpu_vm_bo_base *vm_bo) static void amdgpu_vm_bo_idle(struct amdgpu_vm_bo_base *vm_bo) { amdgpu_vm_assert_locked(vm_bo->vm); + spin_lock(&vm_bo->vm->status_lock); list_move(&vm_bo->vm_status, &vm_bo->vm->idle); + spin_unlock(&vm_bo->vm->status_lock); vm_bo->moved = false; } @@ -197,9 +203,9 @@ static void amdgpu_vm_bo_idle(struct amdgpu_vm_bo_base *vm_bo) */ static void amdgpu_vm_bo_invalidated(struct amdgpu_vm_bo_base *vm_bo) { - spin_lock(&vm_bo->vm->invalidated_lock); + spin_lock(&vm_bo->vm->status_lock); list_move(&vm_bo->vm_status, &vm_bo->vm->invalidated); - spin_unlock(&vm_bo->vm->invalidated_lock); + spin_unlock(&vm_bo->vm->status_lock); } /** @@ -212,9 +218,10 @@ static void amdgpu_vm_bo_invalidated(struct amdgpu_vm_bo_base *vm_bo) */ static void amdgpu_vm_bo_evicted_user(struct amdgpu_vm_bo_base *vm_bo) { - amdgpu_vm_assert_locked(vm_bo->vm); vm_bo->moved = true; + spin_lock(&vm_bo->vm->status_lock); list_move(&vm_bo->vm_status, &vm_bo->vm->evicted_user); + spin_unlock(&vm_bo->vm->status_lock); } /** @@ -228,10 +235,13 @@ static void amdgpu_vm_bo_evicted_user(struct 
amdgpu_vm_bo_base *vm_bo) static void amdgpu_vm_bo_relocated(struct amdgpu_vm_bo_base *vm_bo) { amdgpu_vm_assert_locked(vm_bo->vm); - if (vm_bo->bo->parent) + if (vm_bo->bo->parent) { + spin_lock(&vm_bo->vm->status_lock); list_move(&vm_bo->vm_status, &vm_bo->vm->relocated); - else + spin_unlock(&vm_bo->vm->status_lock); + } else { amdgpu_vm_bo_idle(vm_bo); + } } /** @@ -245,7 +255,9 @@ static void amdgpu_vm_bo_relocated(struct amdgpu_vm_bo_base *vm_bo) static void amdgpu_vm_bo_done(struct amdgpu_vm_bo_base *vm_bo) { amdgpu_vm_assert_locked(vm_bo->vm); + spin_lock(&vm_bo->vm->status_lock); list_move(&vm_bo->vm_status, &vm_bo->vm->done); + spin_unlock(&vm_bo->vm->status_lock); } /** @@ -259,13 +271,13 @@ static void amdgpu_vm_bo_reset_state_machine(struct amdgpu_vm *vm) { struct amdgpu_vm_bo_base *vm_bo, *tmp; - spin_lock(&vm->invalidated_lock); + amdgpu_vm_assert_locked(vm); + + spin_lock(&vm->status_lock); list_splice_init(&vm->done, &vm->invalidated); list_for_each_entry(vm_bo, &vm->invalidated, vm_status) vm_bo->moved = true; - spin_unlock(&vm->invalidated_lock); - amdgpu_vm_assert_locked(vm); list_for_each_entry_safe(vm_bo, tmp, &vm->idle, vm_status) { struct amdgpu_bo *bo = vm_bo->bo; @@ -275,13 +287,14 @@ static void amdgpu_vm_bo_reset_state_machine(struct amdgpu_vm *vm) else if (bo->parent) list_move(&vm_bo->vm_status, &vm_bo->vm->relocated); } + spin_unlock(&vm->status_lock); } /** * amdgpu_vm_update_shared - helper to update shared memory stat * @base: base structure for tracking BO usage in a VM * - * Takes the vm stats_lock and updates the shared memory stat. If the basic + * Takes the vm status_lock and updates the shared memory stat. If the basic * stat changed (e.g. buffer was moved) amdgpu_vm_update_stats need to be called * as well. */ @@ -294,7 +307,7 @@ static void amdgpu_vm_update_shared(struct amdgpu_vm_bo_base *base) bool shared; dma_resv_assert_held(bo->tbo.base.resv); - spin_lock(&vm->stats_lock); + spin_lock(&vm->status_lock); shared = drm_gem_object_is_shared_for_memory_stats(&bo->tbo.base); if (base->shared != shared) { base->shared = shared; @@ -306,7 +319,7 @@ static void amdgpu_vm_update_shared(struct amdgpu_vm_bo_base *base) vm->stats[bo_memtype].drm.private += size; } } - spin_unlock(&vm->stats_lock); + spin_unlock(&vm->status_lock); } /** @@ -331,11 +344,11 @@ void amdgpu_vm_bo_update_shared(struct amdgpu_bo *bo) * be bo->tbo.resource * @sign: if we should add (+1) or subtract (-1) from the stat * - * Caller need to have the vm stats_lock held. Useful for when multiple update + * Caller need to have the vm status_lock held. Useful for when multiple update * need to happen at the same time. 
*/ static void amdgpu_vm_update_stats_locked(struct amdgpu_vm_bo_base *base, - struct ttm_resource *res, int sign) + struct ttm_resource *res, int sign) { struct amdgpu_vm *vm = base->vm; struct amdgpu_bo *bo = base->bo; @@ -359,8 +372,7 @@ static void amdgpu_vm_update_stats_locked(struct amdgpu_vm_bo_base *base, */ if (bo->flags & AMDGPU_GEM_CREATE_DISCARDABLE) vm->stats[res_memtype].drm.purgeable += size; - if (!(bo->preferred_domains & - amdgpu_mem_type_to_domain(res_memtype))) + if (!(bo->preferred_domains & amdgpu_mem_type_to_domain(res_memtype))) vm->stats[bo_memtype].evicted += size; } } @@ -379,9 +391,9 @@ void amdgpu_vm_update_stats(struct amdgpu_vm_bo_base *base, { struct amdgpu_vm *vm = base->vm; - spin_lock(&vm->stats_lock); + spin_lock(&vm->status_lock); amdgpu_vm_update_stats_locked(base, res, sign); - spin_unlock(&vm->stats_lock); + spin_unlock(&vm->status_lock); } /** @@ -407,10 +419,10 @@ void amdgpu_vm_bo_base_init(struct amdgpu_vm_bo_base *base, base->next = bo->vm_bo; bo->vm_bo = base; - spin_lock(&vm->stats_lock); + spin_lock(&vm->status_lock); base->shared = drm_gem_object_is_shared_for_memory_stats(&bo->tbo.base); amdgpu_vm_update_stats_locked(base, bo->tbo.resource, +1); - spin_unlock(&vm->stats_lock); + spin_unlock(&vm->status_lock); if (!amdgpu_vm_is_bo_always_valid(vm, bo)) return; @@ -469,10 +481,10 @@ int amdgpu_vm_lock_done_list(struct amdgpu_vm *vm, struct drm_exec *exec, int ret; /* We can only trust prev->next while holding the lock */ - spin_lock(&vm->invalidated_lock); + spin_lock(&vm->status_lock); while (!list_is_head(prev->next, &vm->done)) { bo_va = list_entry(prev->next, typeof(*bo_va), base.vm_status); - spin_unlock(&vm->invalidated_lock); + spin_unlock(&vm->status_lock); bo = bo_va->base.bo; if (bo) { @@ -480,10 +492,10 @@ int amdgpu_vm_lock_done_list(struct amdgpu_vm *vm, struct drm_exec *exec, if (unlikely(ret)) return ret; } - spin_lock(&vm->invalidated_lock); + spin_lock(&vm->status_lock); prev = prev->next; } - spin_unlock(&vm->invalidated_lock); + spin_unlock(&vm->status_lock); return 0; } @@ -579,7 +591,7 @@ int amdgpu_vm_validate(struct amdgpu_device *adev, struct amdgpu_vm *vm, void *param) { uint64_t new_vm_generation = amdgpu_vm_generation(adev, vm); - struct amdgpu_vm_bo_base *bo_base, *tmp; + struct amdgpu_vm_bo_base *bo_base; struct amdgpu_bo *bo; int r; @@ -592,7 +604,13 @@ int amdgpu_vm_validate(struct amdgpu_device *adev, struct amdgpu_vm *vm, return r; } - list_for_each_entry_safe(bo_base, tmp, &vm->evicted, vm_status) { + spin_lock(&vm->status_lock); + while (!list_empty(&vm->evicted)) { + bo_base = list_first_entry(&vm->evicted, + struct amdgpu_vm_bo_base, + vm_status); + spin_unlock(&vm->status_lock); + bo = bo_base->bo; r = validate(param, bo); @@ -605,21 +623,26 @@ int amdgpu_vm_validate(struct amdgpu_device *adev, struct amdgpu_vm *vm, vm->update_funcs->map_table(to_amdgpu_bo_vm(bo)); amdgpu_vm_bo_relocated(bo_base); } + spin_lock(&vm->status_lock); } + while (ticket && !list_empty(&vm->evicted_user)) { + bo_base = list_first_entry(&vm->evicted_user, + struct amdgpu_vm_bo_base, + vm_status); + spin_unlock(&vm->status_lock); - if (ticket) { - list_for_each_entry_safe(bo_base, tmp, &vm->evicted_user, - vm_status) { - bo = bo_base->bo; - dma_resv_assert_held(bo->tbo.base.resv); + bo = bo_base->bo; + dma_resv_assert_held(bo->tbo.base.resv); - r = validate(param, bo); - if (r) - return r; + r = validate(param, bo); + if (r) + return r; - amdgpu_vm_bo_invalidated(bo_base); - } + amdgpu_vm_bo_invalidated(bo_base); + + 
spin_lock(&vm->status_lock); } + spin_unlock(&vm->status_lock); amdgpu_vm_eviction_lock(vm); vm->evicting = false; @@ -648,7 +671,9 @@ bool amdgpu_vm_ready(struct amdgpu_vm *vm) ret = !vm->evicting; amdgpu_vm_eviction_unlock(vm); + spin_lock(&vm->status_lock); ret &= list_empty(&vm->evicted); + spin_unlock(&vm->status_lock); spin_lock(&vm->immediate.lock); ret &= !vm->immediate.stopped; @@ -939,13 +964,18 @@ int amdgpu_vm_update_pdes(struct amdgpu_device *adev, struct amdgpu_vm *vm, bool immediate) { struct amdgpu_vm_update_params params; - struct amdgpu_vm_bo_base *entry, *tmp; + struct amdgpu_vm_bo_base *entry; bool flush_tlb_needed = false; + LIST_HEAD(relocated); int r, idx; amdgpu_vm_assert_locked(vm); - if (list_empty(&vm->relocated)) + spin_lock(&vm->status_lock); + list_splice_init(&vm->relocated, &relocated); + spin_unlock(&vm->status_lock); + + if (list_empty(&relocated)) return 0; if (!drm_dev_enter(adev_to_drm(adev), &idx)) @@ -961,7 +991,7 @@ int amdgpu_vm_update_pdes(struct amdgpu_device *adev, if (r) goto error; - list_for_each_entry(entry, &vm->relocated, vm_status) { + list_for_each_entry(entry, &relocated, vm_status) { /* vm_flush_needed after updating moved PDEs */ flush_tlb_needed |= entry->moved; @@ -977,7 +1007,9 @@ int amdgpu_vm_update_pdes(struct amdgpu_device *adev, if (flush_tlb_needed) atomic64_inc(&vm->tlb_seq); - list_for_each_entry_safe(entry, tmp, &vm->relocated, vm_status) { + while (!list_empty(&relocated)) { + entry = list_first_entry(&relocated, struct amdgpu_vm_bo_base, + vm_status); amdgpu_vm_bo_idle(entry); } @@ -1204,9 +1236,9 @@ error_free: void amdgpu_vm_get_memory(struct amdgpu_vm *vm, struct amdgpu_mem_stats stats[__AMDGPU_PL_NUM]) { - spin_lock(&vm->stats_lock); + spin_lock(&vm->status_lock); memcpy(stats, vm->stats, sizeof(*stats) * __AMDGPU_PL_NUM); - spin_unlock(&vm->stats_lock); + spin_unlock(&vm->status_lock); } /** @@ -1573,24 +1605,29 @@ int amdgpu_vm_handle_moved(struct amdgpu_device *adev, struct amdgpu_vm *vm, struct ww_acquire_ctx *ticket) { - struct amdgpu_bo_va *bo_va, *tmp; + struct amdgpu_bo_va *bo_va; struct dma_resv *resv; bool clear, unlock; int r; - list_for_each_entry_safe(bo_va, tmp, &vm->moved, base.vm_status) { + spin_lock(&vm->status_lock); + while (!list_empty(&vm->moved)) { + bo_va = list_first_entry(&vm->moved, struct amdgpu_bo_va, + base.vm_status); + spin_unlock(&vm->status_lock); + /* Per VM BOs never need to bo cleared in the page tables */ r = amdgpu_vm_bo_update(adev, bo_va, false); if (r) return r; + spin_lock(&vm->status_lock); } - spin_lock(&vm->invalidated_lock); while (!list_empty(&vm->invalidated)) { bo_va = list_first_entry(&vm->invalidated, struct amdgpu_bo_va, base.vm_status); resv = bo_va->base.bo->tbo.base.resv; - spin_unlock(&vm->invalidated_lock); + spin_unlock(&vm->status_lock); /* Try to reserve the BO to avoid clearing its ptes */ if (!adev->debug_vm && dma_resv_trylock(resv)) { @@ -1622,9 +1659,9 @@ int amdgpu_vm_handle_moved(struct amdgpu_device *adev, bo_va->base.bo->tbo.resource->mem_type == TTM_PL_SYSTEM)) amdgpu_vm_bo_evicted_user(&bo_va->base); - spin_lock(&vm->invalidated_lock); + spin_lock(&vm->status_lock); } - spin_unlock(&vm->invalidated_lock); + spin_unlock(&vm->status_lock); return 0; } @@ -2153,9 +2190,9 @@ void amdgpu_vm_bo_del(struct amdgpu_device *adev, } } - spin_lock(&vm->invalidated_lock); + spin_lock(&vm->status_lock); list_del(&bo_va->base.vm_status); - spin_unlock(&vm->invalidated_lock); + spin_unlock(&vm->status_lock); list_for_each_entry_safe(mapping, next, &bo_va->valids, 
list) { list_del(&mapping->list); @@ -2263,10 +2300,10 @@ void amdgpu_vm_bo_move(struct amdgpu_bo *bo, struct ttm_resource *new_mem, for (bo_base = bo->vm_bo; bo_base; bo_base = bo_base->next) { struct amdgpu_vm *vm = bo_base->vm; - spin_lock(&vm->stats_lock); + spin_lock(&vm->status_lock); amdgpu_vm_update_stats_locked(bo_base, bo->tbo.resource, -1); amdgpu_vm_update_stats_locked(bo_base, new_mem, +1); - spin_unlock(&vm->stats_lock); + spin_unlock(&vm->status_lock); } amdgpu_vm_bo_invalidate(bo, evicted); @@ -2534,12 +2571,11 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm, INIT_LIST_HEAD(&vm->relocated); INIT_LIST_HEAD(&vm->moved); INIT_LIST_HEAD(&vm->idle); - spin_lock_init(&vm->invalidated_lock); INIT_LIST_HEAD(&vm->invalidated); + spin_lock_init(&vm->status_lock); INIT_LIST_HEAD(&vm->freed); INIT_LIST_HEAD(&vm->done); INIT_KFIFO(vm->faults); - spin_lock_init(&vm->stats_lock); r = amdgpu_vm_init_entities(adev, vm); if (r) @@ -3015,6 +3051,7 @@ void amdgpu_debugfs_vm_bo_info(struct amdgpu_vm *vm, struct seq_file *m) amdgpu_vm_assert_locked(vm); + spin_lock(&vm->status_lock); seq_puts(m, "\tIdle BOs:\n"); list_for_each_entry_safe(bo_va, tmp, &vm->idle, base.vm_status) { if (!bo_va->base.bo) @@ -3052,13 +3089,11 @@ void amdgpu_debugfs_vm_bo_info(struct amdgpu_vm *vm, struct seq_file *m) id = 0; seq_puts(m, "\tInvalidated BOs:\n"); - spin_lock(&vm->invalidated_lock); list_for_each_entry_safe(bo_va, tmp, &vm->invalidated, base.vm_status) { if (!bo_va->base.bo) continue; total_invalidated += amdgpu_bo_print_info(id++, bo_va->base.bo, m); } - spin_unlock(&vm->invalidated_lock); total_invalidated_objs = id; id = 0; @@ -3068,6 +3103,7 @@ void amdgpu_debugfs_vm_bo_info(struct amdgpu_vm *vm, struct seq_file *m) continue; total_done += amdgpu_bo_print_info(id++, bo_va->base.bo, m); } + spin_unlock(&vm->status_lock); total_done_objs = id; seq_printf(m, "\tTotal idle size: %12lld\tobjs:\t%d\n", total_idle, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index f15cef8fa58a..cf0ec94e8a07 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -203,11 +203,11 @@ struct amdgpu_vm_bo_base { /* protected by bo being reserved */ struct amdgpu_vm_bo_base *next; - /* protected by vm reservation and invalidated_lock */ + /* protected by vm status_lock */ struct list_head vm_status; /* if the bo is counted as shared in mem stats - * protected by vm BO being reserved */ + * protected by vm status_lock */ bool shared; /* protected by the BO being reserved */ @@ -343,8 +343,10 @@ struct amdgpu_vm { bool evicting; unsigned int saved_flags; - /* Memory statistics for this vm, protected by stats_lock */ - spinlock_t stats_lock; + /* Lock to protect vm_bo add/del/move on all lists of vm */ + spinlock_t status_lock; + + /* Memory statistics for this vm, protected by status_lock */ struct amdgpu_mem_stats stats[__AMDGPU_PL_NUM]; /* @@ -352,8 +354,6 @@ struct amdgpu_vm { * PDs, PTs or per VM BOs. The state transits are: * * evicted -> relocated (PDs, PTs) or moved (per VM BOs) -> idle - * - * Lists are protected by the root PD dma_resv lock. */ /* Per-VM and PT BOs who needs a validation */ @@ -374,10 +374,7 @@ struct amdgpu_vm { * state transits are: * * evicted_user or invalidated -> done - * - * Lists are protected by the invalidated_lock. 
*/ - spinlock_t invalidated_lock; /* BOs for user mode queues that need a validation */ struct list_head evicted_user; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c index 7a4c12ff9b18..f794fb1cc06e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c @@ -543,7 +543,9 @@ static void amdgpu_vm_pt_free(struct amdgpu_vm_bo_base *entry) entry->bo->vm_bo = NULL; ttm_bo_set_bulk_move(&entry->bo->tbo, NULL); + spin_lock(&entry->vm->status_lock); list_del(&entry->vm_status); + spin_unlock(&entry->vm->status_lock); amdgpu_bo_unref(&entry->bo); } @@ -587,6 +589,7 @@ static void amdgpu_vm_pt_add_list(struct amdgpu_vm_update_params *params, struct amdgpu_vm_pt_cursor seek; struct amdgpu_vm_bo_base *entry; + spin_lock(&params->vm->status_lock); for_each_amdgpu_vm_pt_dfs_safe(params->adev, params->vm, cursor, seek, entry) { if (entry && entry->bo) list_move(&entry->vm_status, &params->tlb_flush_waitlist); @@ -594,6 +597,7 @@ static void amdgpu_vm_pt_add_list(struct amdgpu_vm_update_params *params, /* enter start node now */ list_move(&cursor->entry->vm_status, &params->tlb_flush_waitlist); + spin_unlock(&params->vm->status_lock); } /** -- cgit v1.2.3
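
The reserved VMID revert above re-establishes a simple single-slot contract per VMHUB: the first VM to reserve gets the ID, a reservation attempt while another VM holds it fails with -ENOENT, and a repeated reservation by the owning VM is a no-op. A caller-side sketch of that contract; the function names are as in the patch, the surrounding error handling is illustrative only:

	int r;

	r = amdgpu_vmid_alloc_reserved(adev, vm, AMDGPU_GFXHUB(0));
	if (r == -ENOENT)
		return r;	/* another process already holds the reserved VMID */

	/* ... run with the reserved VMID, e.g. for SPM ... */

	amdgpu_vmid_free_reserved(adev, vm, AMDGPU_GFXHUB(0));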
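
The PASID handling that the merge patch folds into amdgpu_vm_init() is, in essence, a reverse map from PASID to VM kept in an XArray, so that an event carrying only a PASID (a GPU page fault, for instance) can be routed back to its amdgpu_vm. A condensed sketch of the lifecycle; the lookup step is not part of the patch and is shown only as an assumed usage:

	/* init: publish the pasid -> vm mapping */
	r = xa_err(xa_store_irq(&adev->vm_manager.pasids, pasid, vm, GFP_KERNEL));
	if (r < 0)
		goto error_free_root;
	vm->pasid = pasid;

	/* later: translate a PASID back to its VM (illustrative only) */
	vm = xa_load(&adev->vm_manager.pasids, pasid);

	/* fini: unpublish the mapping before tearing the VM down */
	xa_erase_irq(&adev->vm_manager.pasids, vm->pasid);
	vm->pasid = 0;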
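
Finally, the partial revert restores a locking idiom that is easy to misread in diff form: vm->status_lock protects only list membership, while the per-entry work (validation, page table updates) may sleep, so the lock is dropped around each entry and the loop restarts from the list head afterwards. A self-contained sketch of the pattern, with hypothetical my_vm/my_entry/process_entry names standing in for the amdgpu types:

	#include <linux/list.h>
	#include <linux/spinlock.h>

	struct my_vm {
		spinlock_t status_lock;		/* protects list membership only */
		struct list_head moved;
	};

	struct my_entry {
		struct list_head vm_status;	/* on exactly one vm list at a time */
	};

	/* Hypothetical per-entry work; may sleep and must move the entry
	 * off the list, or the loop below would never terminate. */
	int process_entry(struct my_vm *vm, struct my_entry *entry);

	int drain_moved(struct my_vm *vm)
	{
		struct my_entry *entry;
		int r;

		spin_lock(&vm->status_lock);
		while (!list_empty(&vm->moved)) {
			entry = list_first_entry(&vm->moved, struct my_entry,
						 vm_status);
			/* Drop the lock, process_entry() may sleep. */
			spin_unlock(&vm->status_lock);

			r = process_entry(vm, entry);
			if (r)
				return r;

			/* Re-take the lock and re-read the list head instead
			 * of trusting a cached next pointer that a concurrent
			 * list_move() may have invalidated. */
			spin_lock(&vm->status_lock);
		}
		spin_unlock(&vm->status_lock);

		return 0;
	}

This is also why the list_for_each_entry_safe() loops had to go again: the _safe variant only tolerates the loop body removing the current entry, not other threads moving entries between lists, which matches the list corruption the commit message reports from CI.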