From 7a66ad6c087ee3863cc9a8d696ac2191d1c2e904 Mon Sep 17 00:00:00 2001 From: ZhenGuo Yin Date: Tue, 9 May 2023 17:42:11 +0800 Subject: drm/amdgpu: set finished fence error if job timedout MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Set finished fence to ETIME error if job timedout. Signed-off-by: ZhenGuo Yin Acked-by: Alex Deucher Reviewed-by: Christian König Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_job.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index c3d9d75143f4..aca3a2bfe8d2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -65,6 +65,8 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) DRM_ERROR("Process information: process %s pid %d thread %s pid %d\n", ti.process_name, ti.tgid, ti.task_name, ti.pid); + dma_fence_set_error(&s_job->s_fence->finished, -ETIME); + if (amdgpu_device_should_recover_gpu(ring->adev)) { struct amdgpu_reset_context reset_context; memset(&reset_context, 0, sizeof(reset_context)); -- cgit v1.2.3 From e84e697d92d9d84ca13b4440cea36abe9a2fe079 Mon Sep 17 00:00:00 2001 From: Christian König Date: Mon, 17 Apr 2023 18:15:15 +0200 Subject: drm/amdgpu: abort submissions during prepare on error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Forward errors from previous submissions to this one. Signed-off-by: Christian König Reviewed-by: Luben Tuikov Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_job.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index aca3a2bfe8d2..9e6f2fa8e2fe 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -258,16 +258,27 @@ amdgpu_job_prepare_job(struct drm_sched_job *sched_job, struct dma_fence *fence = NULL; int r; + /* Ignore soft recovered fences here */ + r = drm_sched_entity_error(s_entity); + if (r && r != -ENODATA) + goto error; + if (!fence && job->gang_submit) fence = amdgpu_device_switch_gang(ring->adev, job->gang_submit); while (!fence && job->vm && !job->vmid) { r = amdgpu_vmid_grab(job->vm, ring, job, &fence); - if (r) + if (r) { DRM_ERROR("Error getting VM ID (%d)\n", r); + goto error; + } } return fence; + +error: + dma_fence_set_error(&job->base.s_fence->finished, r); + return NULL; } static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job) -- cgit v1.2.3 From f88e295e9094deee93066f32a4380307e8cb3dd9 Mon Sep 17 00:00:00 2001 From: Christian König Date: Wed, 19 Apr 2023 15:17:57 +0200 Subject: drm/amdgpu: add VM generation token MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of using the VRAM lost counter add a 64bit token which indicates if a context or job is still valid to use. Should the VRAM be lost or the page tables need re-creation the token will change indicating that userspace needs to act and re-create the contexts and re-submit the work. Signed-off-by: Christian König Reviewed-by: Luben Tuikov Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 4 ++-- drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 4 ++-- drivers/gpu/drm/amd/amdgpu/amdgpu_job.h | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 26 ++++++++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 4 ++++ 7 files changed, 37 insertions(+), 7 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_job.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c index 6e1d331af01f..d9503882ea97 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c @@ -309,7 +309,7 @@ static int amdgpu_cs_pass1(struct amdgpu_cs_parser *p, } p->gang_leader = p->jobs[p->gang_leader_idx]; - if (p->ctx->vram_lost_counter != p->gang_leader->vram_lost_counter) { + if (p->ctx->generation != p->gang_leader->generation) { ret = -ECANCELED; goto free_all_kdata; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c index 410acdd4554c..3ccd709ae76a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c @@ -333,7 +333,7 @@ static int amdgpu_ctx_init(struct amdgpu_ctx_mgr *mgr, int32_t priority, ctx->reset_counter = atomic_read(&mgr->adev->gpu_reset_counter); ctx->reset_counter_query = ctx->reset_counter; - ctx->vram_lost_counter = atomic_read(&mgr->adev->vram_lost_counter); + ctx->generation = amdgpu_vm_generation(mgr->adev, &fpriv->vm); ctx->init_priority = priority; ctx->override_priority = AMDGPU_CTX_PRIORITY_UNSET; @@ -586,7 +586,7 @@ static int amdgpu_ctx_query2(struct amdgpu_device *adev, if (ctx->reset_counter != atomic_read(&adev->gpu_reset_counter)) out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RESET; - if (ctx->vram_lost_counter != atomic_read(&adev->vram_lost_counter)) + if (ctx->generation != amdgpu_vm_generation(adev, &fpriv->vm)) out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_VRAMLOST; if (atomic_read(&ctx->guilty)) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h index f1e27b6e16f4..85376baaa92f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h @@ -47,7 +47,7 @@ struct amdgpu_ctx { struct amdgpu_ctx_mgr *mgr; unsigned reset_counter; unsigned reset_counter_query; - uint32_t vram_lost_counter; + uint64_t generation; spinlock_t ring_lock; struct amdgpu_ctx_entity *entities[AMDGPU_HW_IP_NUM][AMDGPU_MAX_ENTITY_NUM]; bool preamble_presented; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index 9e6f2fa8e2fe..78476bc75b4e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -109,7 +109,7 @@ int amdgpu_job_alloc(struct amdgpu_device *adev, struct amdgpu_vm *vm, (*job)->vm = vm; amdgpu_sync_create(&(*job)->explicit_sync); - (*job)->vram_lost_counter = atomic_read(&adev->vram_lost_counter); + (*job)->generation = amdgpu_vm_generation(adev, vm); (*job)->vm_pd_addr = AMDGPU_BO_INVALID_OFFSET; if (!entity) @@ -295,7 +295,7 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job) trace_amdgpu_sched_run_job(job); /* Skip job if VRAM is lost and never resubmit gangs */ - if (job->vram_lost_counter != atomic_read(&adev->vram_lost_counter) || + if (job->generation != amdgpu_vm_generation(adev, job->vm) || (job->job_run_counter && job->gang_submit)) dma_fence_set_error(finished, -ECANCELED); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h index 3f9804f956c9..a963a25ddd62 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h @@ -61,7 +61,7 @@ struct amdgpu_job { uint32_t gds_base, gds_size; uint32_t gws_base, gws_size; uint32_t oa_base, oa_size; - uint32_t vram_lost_counter; + uint64_t generation; /* user fence handling */ uint64_t uf_addr; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 1045be4b547c..143d11afe0e5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -405,6 +405,30 @@ static void amdgpu_vm_fini_entities(struct amdgpu_vm *vm) drm_sched_entity_destroy(&vm->delayed); } +/** + * amdgpu_vm_generation - return the page table re-generation counter + * @adev: the amdgpu_device + * @vm: optional VM to check, might be NULL + * + * Returns a page table re-generation token to allow checking if submissions + * are still valid to use this VM. The VM parameter might be NULL in which case + * just the VRAM lost counter will be used. + */ +uint64_t amdgpu_vm_generation(struct amdgpu_device *adev, struct amdgpu_vm *vm) +{ + uint64_t result = (u64)atomic_read(&adev->vram_lost_counter) << 32; + + if (!vm) + return result; + + result += vm->generation; + /* Add one if the page tables will be re-generated on next CS */ + if (drm_sched_entity_error(&vm->delayed)) + ++result; + + return result; +} + /** * amdgpu_vm_validate_pt_bos - validate the page table BOs * @@ -428,6 +452,7 @@ int amdgpu_vm_validate_pt_bos(struct amdgpu_device *adev, struct amdgpu_vm *vm, int r; if (drm_sched_entity_error(&vm->delayed)) { + ++vm->generation; amdgpu_vm_bo_reset_state_machine(vm); amdgpu_vm_fini_entities(vm); r = amdgpu_vm_init_entities(adev, vm); @@ -2134,6 +2159,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm) vm->last_update = dma_fence_get_stub(); vm->last_unlocked = dma_fence_get_stub(); vm->last_tlb_flush = dma_fence_get_stub(); + vm->generation = 0; mutex_init(&vm->eviction_lock); vm->evicting = false; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index 14f9a2bf3acb..9c85d494f2a2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -295,6 +295,9 @@ struct amdgpu_vm { atomic64_t tlb_seq; struct dma_fence *last_tlb_flush; + /* How many times we had to re-generate the page tables */ + uint64_t generation; + /* Last unlocked submission to the scheduler entities */ struct dma_fence *last_unlocked; @@ -397,6 +400,7 @@ void amdgpu_vm_get_pd_bo(struct amdgpu_vm *vm, struct list_head *validated, struct amdgpu_bo_list_entry *entry); bool amdgpu_vm_ready(struct amdgpu_vm *vm); +uint64_t amdgpu_vm_generation(struct amdgpu_device *adev, struct amdgpu_vm *vm); int amdgpu_vm_validate_pt_bos(struct amdgpu_device *adev, struct amdgpu_vm *vm, int (*callback)(void *p, struct amdgpu_bo *bo), void *param); -- cgit v1.2.3