Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c')
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 220
1 file changed, 128 insertions(+), 92 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index 5ca905b4a0fb..0311d799a010 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -28,6 +28,7 @@
 #include <linux/file.h>
 #include <linux/pagemap.h>
 #include <linux/sync_file.h>
+#include <linux/dma-buf.h>
 
 #include <drm/amdgpu_drm.h>
 #include <drm/drm_syncobj.h>
@@ -56,7 +57,7 @@ static int amdgpu_cs_user_fence_chunk(struct amdgpu_cs_parser *p,
 	/* One for TTM and one for the CS job */
 	p->uf_entry.tv.num_shared = 2;
 
-	drm_gem_object_put_unlocked(gobj);
+	drm_gem_object_put(gobj);
 
 	size = amdgpu_bo_size(bo);
 	if (size != PAGE_SIZE || (data->offset + 8) > size) {
@@ -97,8 +98,7 @@ static int amdgpu_cs_bo_handles_chunk(struct amdgpu_cs_parser *p,
 	return 0;
 
 error_free:
-	if (info)
-		kvfree(info);
+	kvfree(info);
 	return r;
 }
 
@@ -117,7 +117,7 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, union drm_amdgpu_cs
 	if (cs->in.num_chunks == 0)
 		return 0;
 
-	chunk_array = kmalloc_array(cs->in.num_chunks, sizeof(uint64_t), GFP_KERNEL);
+	chunk_array = kvmalloc_array(cs->in.num_chunks, sizeof(uint64_t), GFP_KERNEL);
 	if (!chunk_array)
 		return -ENOMEM;
 
@@ -144,7 +144,7 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, union drm_amdgpu_cs
 	}
 
 	p->nchunks = cs->in.num_chunks;
-	p->chunks = kmalloc_array(p->nchunks, sizeof(struct amdgpu_cs_chunk),
+	p->chunks = kvmalloc_array(p->nchunks, sizeof(struct amdgpu_cs_chunk),
 			    GFP_KERNEL);
 	if (!p->chunks) {
 		ret = -ENOMEM;
@@ -238,7 +238,7 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, union drm_amdgpu_cs
 	if (p->uf_entry.tv.bo)
 		p->job->uf_addr = uf_offset;
-	kfree(chunk_array);
+	kvfree(chunk_array);
 
 	/* Use this opportunity to fill in task info for the vm */
 	amdgpu_vm_set_task_info(vm);
 
@@ -250,11 +250,11 @@ free_all_kdata:
 free_partial_kdata:
 	for (; i >= 0; i--)
 		kvfree(p->chunks[i].kdata);
-	kfree(p->chunks);
+	kvfree(p->chunks);
 	p->chunks = NULL;
 	p->nchunks = 0;
 free_chunk:
-	kfree(chunk_array);
+	kvfree(chunk_array);
 
 	return ret;
 }
@@ -298,7 +298,7 @@ static void amdgpu_cs_get_threshold_for_moves(struct amdgpu_device *adev,
 {
 	s64 time_us, increment_us;
 	u64 free_vram, total_vram, used_vram;
-
+	struct ttm_resource_manager *vram_man = ttm_manager_type(&adev->mman.bdev, TTM_PL_VRAM);
 	/* Allow a maximum of 200 accumulated ms. This is basically per-IB
 	 * throttling.
 	 *
@@ -315,7 +315,7 @@ static void amdgpu_cs_get_threshold_for_moves(struct amdgpu_device *adev,
 	}
 
 	total_vram = adev->gmc.real_vram_size - atomic64_read(&adev->vram_pin_size);
-	used_vram = amdgpu_vram_mgr_usage(&adev->mman.bdev.man[TTM_PL_VRAM]);
+	used_vram = amdgpu_vram_mgr_usage(vram_man);
 	free_vram = used_vram >= total_vram ? 0 : total_vram - used_vram;
 
 	spin_lock(&adev->mm_stats.lock);
@@ -325,7 +325,7 @@ static void amdgpu_cs_get_threshold_for_moves(struct amdgpu_device *adev,
 	increment_us = time_us - adev->mm_stats.last_update_us;
 	adev->mm_stats.last_update_us = time_us;
 	adev->mm_stats.accum_us = min(adev->mm_stats.accum_us + increment_us,
-				  us_upper_bound);
+				      us_upper_bound);
 
 	/* This prevents the short period of low performance when the VRAM
 	 * usage is low and the driver is in debt or doesn't have enough
@@ -362,7 +362,7 @@ static void amdgpu_cs_get_threshold_for_moves(struct amdgpu_device *adev,
 	if (!amdgpu_gmc_vram_full_visible(&adev->gmc)) {
 		u64 total_vis_vram = adev->gmc.visible_vram_size;
 		u64 used_vis_vram =
-			amdgpu_vram_mgr_vis_usage(&adev->mman.bdev.man[TTM_PL_VRAM]);
+			amdgpu_vram_mgr_vis_usage(vram_man);
 
 		if (used_vis_vram < total_vis_vram) {
 			u64 free_vis_vram = total_vis_vram - used_vis_vram;
@@ -396,26 +396,27 @@ void amdgpu_cs_report_moved_bytes(struct amdgpu_device *adev, u64 num_bytes,
 	spin_unlock(&adev->mm_stats.lock);
 }
 
-static int amdgpu_cs_bo_validate(struct amdgpu_cs_parser *p,
-				 struct amdgpu_bo *bo)
+static int amdgpu_cs_bo_validate(void *param, struct amdgpu_bo *bo)
 {
 	struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
+	struct amdgpu_cs_parser *p = param;
 	struct ttm_operation_ctx ctx = {
 		.interruptible = true,
 		.no_wait_gpu = false,
-		.resv = bo->tbo.base.resv,
-		.flags = 0
+		.resv = bo->tbo.base.resv
 	};
 	uint32_t domain;
 	int r;
 
-	if (bo->pin_count)
+	if (bo->tbo.pin_count)
 		return 0;
 
 	/* Don't move this buffer if we have depleted our allowance
 	 * to move it. Don't move anything if the threshold is zero.
 	 */
-	if (p->bytes_moved < p->bytes_moved_threshold) {
+	if (p->bytes_moved < p->bytes_moved_threshold &&
+	    (!bo->tbo.base.dma_buf ||
+	     list_empty(&bo->tbo.base.dma_buf->attachments))) {
 		if (!amdgpu_gmc_vram_full_visible(&adev->gmc) &&
 		    (bo->flags & AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED)) {
 			/* And don't move a CPU_ACCESS_REQUIRED BO to limited
@@ -450,21 +451,6 @@ retry:
 	return r;
 }
 
-static int amdgpu_cs_validate(void *param, struct amdgpu_bo *bo)
-{
-	struct amdgpu_cs_parser *p = param;
-	int r;
-
-	r = amdgpu_cs_bo_validate(p, bo);
-	if (r)
-		return r;
-
-	if (bo->shadow)
-		r = amdgpu_cs_bo_validate(p, bo->shadow);
-
-	return r;
-}
-
 static int amdgpu_cs_list_validate(struct amdgpu_cs_parser *p,
 			    struct list_head *validated)
 {
@@ -492,7 +478,7 @@ static int amdgpu_cs_list_validate(struct amdgpu_cs_parser *p,
 					     lobj->user_pages);
 		}
 
-		r = amdgpu_cs_validate(p, bo);
+		r = amdgpu_cs_bo_validate(p, bo);
 		if (r)
 			return r;
 
@@ -558,7 +544,7 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
 					       sizeof(struct page *),
 					       GFP_KERNEL | __GFP_ZERO);
 		if (!e->user_pages) {
-			DRM_ERROR("calloc failure\n");
+			DRM_ERROR("kvmalloc_array failure\n");
 			return -ENOMEM;
 		}
 
@@ -586,13 +572,27 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
 		goto out;
 	}
 
+	amdgpu_bo_list_for_each_entry(e, p->bo_list) {
+		struct amdgpu_bo *bo = ttm_to_amdgpu_bo(e->tv.bo);
+
+		e->bo_va = amdgpu_vm_bo_find(vm, bo);
+
+		if (bo->tbo.base.dma_buf && !amdgpu_bo_explicit_sync(bo)) {
+			e->chain = dma_fence_chain_alloc();
+			if (!e->chain) {
+				r = -ENOMEM;
+				goto error_validate;
+			}
+		}
+	}
+
 	amdgpu_cs_get_threshold_for_moves(p->adev, &p->bytes_moved_threshold,
 					  &p->bytes_moved_vis_threshold);
 	p->bytes_moved = 0;
 	p->bytes_moved_vis = 0;
 
 	r = amdgpu_vm_validate_pt_bos(p->adev, &fpriv->vm,
-				      amdgpu_cs_validate, p);
+				      amdgpu_cs_bo_validate, p);
 	if (r) {
 		DRM_ERROR("amdgpu_vm_validate_pt_bos() failed.\n");
 		goto error_validate;
@@ -613,15 +613,6 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
 	gws = p->bo_list->gws_obj;
 	oa = p->bo_list->oa_obj;
 
-	amdgpu_bo_list_for_each_entry(e, p->bo_list) {
-		struct amdgpu_bo *bo = ttm_to_amdgpu_bo(e->tv.bo);
-
-		/* Make sure we use the exclusive slot for shared BOs */
-		if (bo->prime_shared_count)
-			e->tv.num_shared = 0;
-		e->bo_va = amdgpu_vm_bo_find(vm, bo);
-	}
-
 	if (gds) {
 		p->job->gds_base = amdgpu_bo_gpu_offset(gds) >> PAGE_SHIFT;
 		p->job->gds_size = amdgpu_bo_size(gds) >> PAGE_SHIFT;
@@ -643,24 +634,32 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
 	}
 
 error_validate:
-	if (r)
+	if (r) {
+		amdgpu_bo_list_for_each_entry(e, p->bo_list) {
+			dma_fence_chain_free(e->chain);
+			e->chain = NULL;
+		}
 		ttm_eu_backoff_reservation(&p->ticket, &p->validated);
+	}
 out:
 	return r;
 }
 
 static int amdgpu_cs_sync_rings(struct amdgpu_cs_parser *p)
 {
+	struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
 	struct amdgpu_bo_list_entry *e;
 	int r;
 
 	list_for_each_entry(e, &p->validated, tv.head) {
 		struct amdgpu_bo *bo = ttm_to_amdgpu_bo(e->tv.bo);
 		struct dma_resv *resv = bo->tbo.base.resv;
+		enum amdgpu_sync_mode sync_mode;
 
-		r = amdgpu_sync_resv(p->adev, &p->job->sync, resv, p->filp,
-				     amdgpu_bo_explicit_sync(bo));
-
+		sync_mode = amdgpu_bo_explicit_sync(bo) ?
+			AMDGPU_SYNC_EXPLICIT : AMDGPU_SYNC_NE_OWNER;
+		r = amdgpu_sync_resv(p->adev, &p->job->sync, resv, sync_mode,
+				     &fpriv->vm);
 		if (r)
 			return r;
 	}
@@ -668,11 +667,12 @@ static int amdgpu_cs_sync_rings(struct amdgpu_cs_parser *p)
 }
 
 /**
- * cs_parser_fini() - clean parser states
+ * amdgpu_cs_parser_fini() - clean parser states
 * @parser:	parser structure holding parsing context.
 * @error:	error number
+ * @backoff:	indicator to backoff the reservation
 *
- * If error is set than unvalidate buffer, otherwise just free memory
+ * If error is set then unvalidate buffer, otherwise just free memory
 * used by parsing context.
 **/
 static void amdgpu_cs_parser_fini(struct amdgpu_cs_parser *parser, int error,
@@ -680,9 +680,17 @@ static void amdgpu_cs_parser_fini(struct amdgpu_cs_parser *parser, int error,
 {
 	unsigned i;
 
-	if (error && backoff)
+	if (error && backoff) {
+		struct amdgpu_bo_list_entry *e;
+
+		amdgpu_bo_list_for_each_entry(e, parser->bo_list) {
+			dma_fence_chain_free(e->chain);
+			e->chain = NULL;
+		}
+
 		ttm_eu_backoff_reservation(&parser->ticket,
 					   &parser->validated);
+	}
 
 	for (i = 0; i < parser->num_post_deps; i++) {
 		drm_syncobj_put(parser->post_deps[i].syncobj);
@@ -701,7 +709,7 @@ static void amdgpu_cs_parser_fini(struct amdgpu_cs_parser *parser, int error,
 
 	for (i = 0; i < parser->nchunks; i++)
 		kvfree(parser->chunks[i].kdata);
-	kfree(parser->chunks);
+	kvfree(parser->chunks);
 	if (parser->job)
 		amdgpu_job_free(parser->job);
 	if (parser->uf_entry.tv.bo) {
@@ -791,33 +799,27 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
 	if (r)
 		return r;
 
-	r = amdgpu_vm_bo_update(adev, fpriv->prt_va, false);
+	r = amdgpu_vm_bo_update(adev, fpriv->prt_va, false, NULL);
 	if (r)
 		return r;
 
-	r = amdgpu_sync_fence(adev, &p->job->sync,
-			      fpriv->prt_va->last_pt_update, false);
+	r = amdgpu_sync_vm_fence(&p->job->sync, fpriv->prt_va->last_pt_update);
 	if (r)
 		return r;
 
 	if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) {
-		struct dma_fence *f;
-
 		bo_va = fpriv->csa_va;
 		BUG_ON(!bo_va);
-		r = amdgpu_vm_bo_update(adev, bo_va, false);
+		r = amdgpu_vm_bo_update(adev, bo_va, false, NULL);
 		if (r)
 			return r;
 
-		f = bo_va->last_pt_update;
-		r = amdgpu_sync_fence(adev, &p->job->sync, f, false);
+		r = amdgpu_sync_vm_fence(&p->job->sync, bo_va->last_pt_update);
 		if (r)
 			return r;
 	}
 
 	amdgpu_bo_list_for_each_entry(e, p->bo_list) {
-		struct dma_fence *f;
-
 		/* ignore duplicates */
 		bo = ttm_to_amdgpu_bo(e->tv.bo);
 		if (!bo)
@@ -827,12 +829,11 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
 		if (bo_va == NULL)
 			continue;
 
-		r = amdgpu_vm_bo_update(adev, bo_va, false);
+		r = amdgpu_vm_bo_update(adev, bo_va, false, NULL);
 		if (r)
 			return r;
 
-		f = bo_va->last_pt_update;
-		r = amdgpu_sync_fence(adev, &p->job->sync, f, false);
+		r = amdgpu_sync_vm_fence(&p->job->sync, bo_va->last_pt_update);
 		if (r)
 			return r;
 	}
@@ -845,11 +846,11 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
 	if (r)
 		return r;
 
-	r = amdgpu_sync_fence(adev, &p->job->sync, vm->last_update, false);
+	r = amdgpu_sync_vm_fence(&p->job->sync, vm->last_update);
 	if (r)
 		return r;
 
-	p->job->vm_pd_addr = amdgpu_gmc_pd_addr(vm->root.base.bo);
+	p->job->vm_pd_addr = amdgpu_gmc_pd_addr(vm->root.bo);
 
 	if (amdgpu_vm_debug) {
 		/* Invalidate all BOs to test for userspace bugs */
@@ -916,11 +917,17 @@ static int amdgpu_cs_ib_fill(struct amdgpu_device *adev,
 		if (parser->entity && parser->entity != entity)
 			return -EINVAL;
 
+		/* Return if there is no run queue associated with this entity.
+		 * Possibly because of disabled HW IP*/
+		if (entity->rq == NULL)
+			return -EINVAL;
+
 		parser->entity = entity;
 
 		ring = to_amdgpu_ring(entity->rq->sched);
 		r = amdgpu_ib_get(adev, vm, ring->funcs->parse_cs ?
-				  chunk_ib->ib_bytes : 0, ib);
+				  chunk_ib->ib_bytes : 0,
+				  AMDGPU_IB_POOL_DELAYED, ib);
 		if (r) {
 			DRM_ERROR("Failed to get ib !\n");
 			return r;
@@ -987,7 +994,7 @@ static int amdgpu_cs_process_fence_dep(struct amdgpu_cs_parser *p,
 			dma_fence_put(old);
 		}
 
-		r = amdgpu_sync_fence(p->adev, &p->job->sync, fence, true);
+		r = amdgpu_sync_fence(&p->job->sync, fence);
 		dma_fence_put(fence);
 		if (r)
 			return r;
@@ -1009,7 +1016,7 @@ static int amdgpu_syncobj_lookup_and_add_to_sync(struct amdgpu_cs_parser *p,
 		return r;
 	}
 
-	r = amdgpu_sync_fence(p->adev, &p->job->sync, fence, true);
+	r = amdgpu_sync_fence(&p->job->sync, fence);
 	dma_fence_put(fence);
 
 	return r;
@@ -1120,7 +1127,7 @@ static int amdgpu_cs_process_syncobj_timeline_out_dep(struct amdgpu_cs_parser *p
 		dep->chain = NULL;
 		if (syncobj_deps[i].point) {
-			dep->chain = kmalloc(sizeof(*dep->chain), GFP_KERNEL);
+			dep->chain = dma_fence_chain_alloc();
 			if (!dep->chain)
 				return -ENOMEM;
 		}
@@ -1128,7 +1135,7 @@ static int amdgpu_cs_process_syncobj_timeline_out_dep(struct amdgpu_cs_parser *p
 		dep->syncobj = drm_syncobj_find(p->filp,
 						syncobj_deps[i].handle);
 		if (!dep->syncobj) {
-			kfree(dep->chain);
+			dma_fence_chain_free(dep->chain);
 			return -EINVAL;
 		}
 		dep->point = syncobj_deps[i].point;
@@ -1203,8 +1210,6 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
 {
 	struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
 	struct drm_sched_entity *entity = p->entity;
-	enum drm_sched_priority priority;
-	struct amdgpu_ring *ring;
 	struct amdgpu_bo_list_entry *e;
 	struct amdgpu_job *job;
 	uint64_t seq;
@@ -1213,10 +1218,12 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
 	job = p->job;
 	p->job = NULL;
 
-	r = drm_sched_job_init(&job->base, entity, p->filp);
+	r = drm_sched_job_init(&job->base, entity, &fpriv->vm);
 	if (r)
 		goto error_unlock;
 
+	drm_sched_job_arm(&job->base);
+
 	/* No memory allocation is allowed while holding the notifier lock.
 	 * The lock is held until amdgpu_cs_submit is finished and fence is
 	 * added to BOs.
@@ -1236,7 +1243,6 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
 		goto error_abort;
 	}
 
-	job->owner = p->filp;
 	p->fence = dma_fence_get(&job->base.s_fence->finished);
 
 	amdgpu_ctx_add_fence(p->ctx, entity, p->fence, &seq);
@@ -1255,14 +1261,32 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
 	trace_amdgpu_cs_ioctl(job);
 	amdgpu_vm_bo_trace_cs(&fpriv->vm, &p->ticket);
-	priority = job->base.s_priority;
-	drm_sched_entity_push_job(&job->base, entity);
-
-	ring = to_amdgpu_ring(entity->rq->sched);
-	amdgpu_ring_priority_get(ring, priority);
+	drm_sched_entity_push_job(&job->base);
 
 	amdgpu_vm_move_to_lru_tail(p->adev, &fpriv->vm);
 
+	amdgpu_bo_list_for_each_entry(e, p->bo_list) {
+		struct dma_resv *resv = e->tv.bo->base.resv;
+		struct dma_fence_chain *chain = e->chain;
+
+		if (!chain)
+			continue;
+
+		/*
+		 * Work around dma_resv shortcommings by wrapping up the
+		 * submission in a dma_fence_chain and add it as exclusive
+		 * fence, but first add the submission as shared fence to make
+		 * sure that shared fences never signal before the exclusive
+		 * one.
+		 */
+		dma_fence_chain_init(chain, dma_resv_excl_fence(resv),
+				     dma_fence_get(p->fence), 1);
+
+		dma_resv_add_shared_fence(resv, p->fence);
+		rcu_assign_pointer(resv->fence_excl, &chain->base);
+		e->chain = NULL;
+	}
+
 	ttm_eu_fence_buffer_objects(&p->ticket, &p->validated, p->fence);
 	mutex_unlock(&p->adev->notifier_lock);
@@ -1277,13 +1301,24 @@ error_unlock:
 	return r;
 }
 
+static void trace_amdgpu_cs_ibs(struct amdgpu_cs_parser *parser)
+{
+	int i;
+
+	if (!trace_amdgpu_cs_enabled())
+		return;
+
+	for (i = 0; i < parser->job->num_ibs; i++)
+		trace_amdgpu_cs(parser, i);
+}
+
 int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
 {
-	struct amdgpu_device *adev = dev->dev_private;
+	struct amdgpu_device *adev = drm_to_adev(dev);
 	union drm_amdgpu_cs *cs = data;
 	struct amdgpu_cs_parser parser = {};
 	bool reserved_buffers = false;
-	int i, r;
+	int r;
 
 	if (amdgpu_ras_intr_triggered())
 		return -EHWPOISON;
@@ -1296,7 +1331,8 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
 
 	r = amdgpu_cs_parser_init(&parser, data);
 	if (r) {
-		DRM_ERROR("Failed to initialize parser %d!\n", r);
+		if (printk_ratelimit())
+			DRM_ERROR("Failed to initialize parser %d!\n", r);
 		goto out;
 	}
 
@@ -1321,8 +1357,7 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
 
 	reserved_buffers = true;
 
-	for (i = 0; i < parser.job->num_ibs; i++)
-		trace_amdgpu_cs(&parser, i);
+	trace_amdgpu_cs_ibs(&parser);
 
 	r = amdgpu_cs_vm_handling(&parser);
 	if (r)
@@ -1423,7 +1458,7 @@ static struct dma_fence *amdgpu_cs_get_fence(struct amdgpu_device *adev,
 int amdgpu_cs_fence_to_handle_ioctl(struct drm_device *dev, void *data,
 				    struct drm_file *filp)
 {
-	struct amdgpu_device *adev = dev->dev_private;
+	struct amdgpu_device *adev = drm_to_adev(dev);
 	union drm_amdgpu_fence_to_handle *info = data;
 	struct dma_fence *fence;
 	struct drm_syncobj *syncobj;
@@ -1452,7 +1487,7 @@ int amdgpu_cs_fence_to_handle_ioctl(struct drm_device *dev, void *data,
 		dma_fence_put(fence);
 		if (r)
 			return r;
-		r = drm_syncobj_get_fd(syncobj, (int*)&info->out.handle);
+		r = drm_syncobj_get_fd(syncobj, (int *)&info->out.handle);
 		drm_syncobj_put(syncobj);
 		return r;
 
@@ -1480,7 +1515,7 @@ int amdgpu_cs_fence_to_handle_ioctl(struct drm_device *dev, void *data,
 }
 
 /**
- * amdgpu_cs_wait_all_fence - wait on all fences to signal
+ * amdgpu_cs_wait_all_fences - wait on all fences to signal
 *
 * @adev: amdgpu device
 * @filp: file private
@@ -1599,7 +1634,7 @@ err_free_fence_array:
 int amdgpu_cs_wait_fences_ioctl(struct drm_device *dev, void *data,
 				struct drm_file *filp)
 {
-	struct amdgpu_device *adev = dev->dev_private;
+	struct amdgpu_device *adev = drm_to_adev(dev);
 	union drm_amdgpu_wait_fences *wait = data;
 	uint32_t fence_count = wait->in.fence_count;
 	struct drm_amdgpu_fence *fences_user;
@@ -1631,11 +1666,12 @@ err_free_fences:
 }
 
 /**
- * amdgpu_cs_find_bo_va - find bo_va for VM address
+ * amdgpu_cs_find_mapping - find bo_va for VM address
 *
 * @parser: command submission parser context
* @addr: VM address
* @bo: resulting BO of the mapping found
+ * @map: Placeholder to return found BO mapping
 *
 * Search the buffer objects in the command submission context for a certain
 * virtual memory address. Returns allocation structure when found, NULL
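Note: the dma_fence_chain handling this diff threads through amdgpu_cs_parser_bos(), amdgpu_cs_parser_fini() and amdgpu_cs_submit() follows one allocate/commit/abort lifecycle. The sketch below condenses that pattern into three hypothetical helpers (prepare_chain, commit_chain, abort_chain do not exist in the driver); it assumes the dma_resv API of this kernel generation (dma_resv_excl_fence(), dma_resv_add_shared_fence(), the raw fence_excl pointer, later replaced by dma_resv_add_fence() with usage flags) and omits the reservation-object locking and shared-slot bookkeeping the real code relies on.

#include <linux/dma-fence-chain.h>
#include <linux/dma-resv.h>
#include <drm/drm_gem.h>

/* Illustrative sketch only; not the driver code itself. */

/* 1. Prepare: allocate a chain node for every DMA-buf-shared BO before
 *    taking the notifier lock, since no memory allocation is allowed
 *    while holding it.  Returns NULL when no chain is needed.
 */
static struct dma_fence_chain *prepare_chain(struct drm_gem_object *obj)
{
	if (!obj->dma_buf)		/* BO not shared via DMA-buf */
		return NULL;
	return dma_fence_chain_alloc();	/* caller maps NULL to -ENOMEM */
}

/* 2. Commit: link the previous exclusive fence and the new submission
 *    fence into the chain.  The submission is published as a shared
 *    fence first, so shared fences never signal before the exclusive
 *    one; then the chain becomes the new exclusive fence.  Assumes a
 *    shared slot was reserved (dma_resv_reserve_shared()) and the
 *    reservation object is held locked.
 */
static void commit_chain(struct dma_resv *resv,
			 struct dma_fence_chain *chain,
			 struct dma_fence *submission)
{
	dma_fence_chain_init(chain, dma_resv_excl_fence(resv),
			     dma_fence_get(submission), 1);
	dma_resv_add_shared_fence(resv, submission);
	rcu_assign_pointer(resv->fence_excl, &chain->base);
}

/* 3. Abort: every error path releases unused nodes.  dma_fence_chain_free()
 *    accepts NULL, which is why the diff can call it unconditionally for
 *    every BO list entry.
 */
static void abort_chain(struct dma_fence_chain **chain)
{
	dma_fence_chain_free(*chain);
	*chain = NULL;
}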
