| author | Mark Brown <broonie@kernel.org> | 2016-11-04 21:16:38 +0300 | 
|---|---|---|
| committer | Mark Brown <broonie@kernel.org> | 2016-11-04 21:16:38 +0300 | 
| commit | cc9b94029e9ef51787af908e9856b1eed314bc00 (patch) | |
| tree | 9675310b89d0f6fb1f7bd9423f0638c4ee5226fd /drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | |
| parent | 13bed58ce8748d430a26e353a09b89f9d613a71f (diff) | |
| parent | 1b5b42216469b05ef4b5916cb40b127dfab1da88 (diff) | |
| download | linux-cc9b94029e9ef51787af908e9856b1eed314bc00.tar.xz | |
Merge branch 'topic/error' of git://git.kernel.org/pub/scm/linux/kernel/git/broonie/regulator into regulator-fixed
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c')
| mode | path | lines changed |
|---|---|---|
| -rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 377 |

1 file changed, 282 insertions, 95 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index 9bc8f1d99733..b0f6e6957536 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -91,6 +91,7 @@ static int amdgpu_cs_user_fence_chunk(struct amdgpu_cs_parser *p,
 				      uint32_t *offset)
 {
 	struct drm_gem_object *gobj;
+	unsigned long size;
 
 	gobj = drm_gem_object_lookup(p->filp, data->handle);
 	if (gobj == NULL)
@@ -101,6 +102,11 @@ static int amdgpu_cs_user_fence_chunk(struct amdgpu_cs_parser *p,
 	p->uf_entry.tv.bo = &p->uf_entry.robj->tbo;
 	p->uf_entry.tv.shared = true;
 	p->uf_entry.user_pages = NULL;
+
+	size = amdgpu_bo_size(p->uf_entry.robj);
+	if (size != PAGE_SIZE || (data->offset + 8) > size)
+		return -EINVAL;
+
 	*offset = data->offset;
 
 	drm_gem_object_unreference_unlocked(gobj);
@@ -216,11 +222,8 @@ int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, void *data)
 	if (ret)
 		goto free_all_kdata;
 
-	if (p->uf_entry.robj) {
-		p->job->uf_bo = amdgpu_bo_ref(p->uf_entry.robj);
-		p->job->uf_offset = uf_offset;
-	}
-
+	if (p->uf_entry.robj)
+		p->job->uf_addr = uf_offset;
 	kfree(chunk_array);
 	return 0;
 
@@ -238,70 +241,212 @@ free_chunk:
 	return ret;
 }
 
-/* Returns how many bytes TTM can move per IB.
+/* Convert microseconds to bytes. */
+static u64 us_to_bytes(struct amdgpu_device *adev, s64 us)
+{
+	if (us <= 0 || !adev->mm_stats.log2_max_MBps)
+		return 0;
+
+	/* Since accum_us is incremented by a million per second, just
+	 * multiply it by the number of MB/s to get the number of bytes.
+	 */
+	return us << adev->mm_stats.log2_max_MBps;
+}
+
+static s64 bytes_to_us(struct amdgpu_device *adev, u64 bytes)
+{
+	if (!adev->mm_stats.log2_max_MBps)
+		return 0;
+
+	return bytes >> adev->mm_stats.log2_max_MBps;
+}
+
+/* Returns how many bytes TTM can move right now. If no bytes can be moved,
+ * it returns 0. If it returns non-zero, it's OK to move at least one buffer,
+ * which means it can go over the threshold once. If that happens, the driver
+ * will be in debt and no other buffer migrations can be done until that debt
+ * is repaid.
+ *
+ * This approach allows moving a buffer of any size (it's important to allow
+ * that).
+ *
+ * The currency is simply time in microseconds and it increases as the clock
+ * ticks. The accumulated microseconds (us) are converted to bytes and
+ * returned.
  */
 static u64 amdgpu_cs_get_threshold_for_moves(struct amdgpu_device *adev)
 {
-	u64 real_vram_size = adev->mc.real_vram_size;
-	u64 vram_usage = atomic64_read(&adev->vram_usage);
+	s64 time_us, increment_us;
+	u64 max_bytes;
+	u64 free_vram, total_vram, used_vram;
 
-	/* This function is based on the current VRAM usage.
+	/* Allow a maximum of 200 accumulated ms. This is basically per-IB
+	 * throttling.
 	 *
-	 * - If all of VRAM is free, allow relocating the number of bytes that
-	 *   is equal to 1/4 of the size of VRAM for this IB.
+	 * It means that in order to get full max MBps, at least 5 IBs per
+	 * second must be submitted and not more than 200ms apart from each
+	 * other.
+	 */
+	const s64 us_upper_bound = 200000;
 
-	 * - If more than one half of VRAM is occupied, only allow relocating
-	 *   1 MB of data for this IB.
-	 *
-	 * - From 0 to one half of used VRAM, the threshold decreases
-	 *   linearly.
-	 *         __________________
-	 * 1/4 of -|\               |
-	 * VRAM    | \              |
-	 *         |  \             |
-	 *         |   \            |
-	 *         |    \           |
-	 *         |     \          |
-	 *         |      \         |
-	 *         |       \________|1 MB
-	 *         |----------------|
-	 *    VRAM 0 %             100 %
-	 *         used            used
-	 *
-	 * Note: It's a threshold, not a limit. The threshold must be crossed
-	 * for buffer relocations to stop, so any buffer of an arbitrary size
-	 * can be moved as long as the threshold isn't crossed before
-	 * the relocation takes place. We don't want to disable buffer
-	 * relocations completely.
+	if (!adev->mm_stats.log2_max_MBps)
+		return 0;
+
+	total_vram = adev->mc.real_vram_size - adev->vram_pin_size;
+	used_vram = atomic64_read(&adev->vram_usage);
+	free_vram = used_vram >= total_vram ? 0 : total_vram - used_vram;
+
+	spin_lock(&adev->mm_stats.lock);
+
+	/* Increase the amount of accumulated us. */
+	time_us = ktime_to_us(ktime_get());
+	increment_us = time_us - adev->mm_stats.last_update_us;
+	adev->mm_stats.last_update_us = time_us;
+	adev->mm_stats.accum_us = min(adev->mm_stats.accum_us + increment_us,
+				      us_upper_bound);
+
+	/* This prevents the short period of low performance when the VRAM
+	 * usage is low and the driver is in debt or doesn't have enough
+	 * accumulated us to fill VRAM quickly.
 	 *
-	 * The idea is that buffers should be placed in VRAM at creation time
-	 * and TTM should only do a minimum number of relocations during
-	 * command submission. In practice, you need to submit at least
-	 * a dozen IBs to move all buffers to VRAM if they are in GTT.
+	 * The situation can occur in these cases:
+	 * - a lot of VRAM is freed by userspace
+	 * - the presence of a big buffer causes a lot of evictions
+	 *   (solution: split buffers into smaller ones)
 	 *
-	 * Also, things can get pretty crazy under memory pressure and actual
-	 * VRAM usage can change a lot, so playing safe even at 50% does
-	 * consistently increase performance.
+	 * If 128 MB or 1/8th of VRAM is free, start filling it now by setting
+	 * accum_us to a positive number.
 	 */
+	if (free_vram >= 128 * 1024 * 1024 || free_vram >= total_vram / 8) {
+		s64 min_us;
+
+		/* Be more aggresive on dGPUs. Try to fill a portion of free
+		 * VRAM now.
+		 */
+		if (!(adev->flags & AMD_IS_APU))
+			min_us = bytes_to_us(adev, free_vram / 4);
+		else
+			min_us = 0; /* Reset accum_us on APUs. */
+
+		adev->mm_stats.accum_us = max(min_us, adev->mm_stats.accum_us);
+	}
 
-	u64 half_vram = real_vram_size >> 1;
-	u64 half_free_vram = vram_usage >= half_vram ? 0 : half_vram - vram_usage;
-	u64 bytes_moved_threshold = half_free_vram >> 1;
-	return max(bytes_moved_threshold, 1024*1024ull);
+	/* This returns 0 if the driver is in debt to disallow (optional)
+	 * buffer moves.
+	 */
+	max_bytes = us_to_bytes(adev, adev->mm_stats.accum_us);
+
+	spin_unlock(&adev->mm_stats.lock);
+	return max_bytes;
+}
+
+/* Report how many bytes have really been moved for the last command
+ * submission. This can result in a debt that can stop buffer migrations
+ * temporarily.
+ */
+static void amdgpu_cs_report_moved_bytes(struct amdgpu_device *adev,
+					 u64 num_bytes)
+{
+	spin_lock(&adev->mm_stats.lock);
+	adev->mm_stats.accum_us -= bytes_to_us(adev, num_bytes);
+	spin_unlock(&adev->mm_stats.lock);
+}
+
+static int amdgpu_cs_bo_validate(struct amdgpu_cs_parser *p,
+				 struct amdgpu_bo *bo)
+{
+	u64 initial_bytes_moved;
+	uint32_t domain;
+	int r;
+
+	if (bo->pin_count)
+		return 0;
+
+	/* Don't move this buffer if we have depleted our allowance
+	 * to move it. Don't move anything if the threshold is zero.
+	 */
+	if (p->bytes_moved < p->bytes_moved_threshold)
+		domain = bo->prefered_domains;
+	else
+		domain = bo->allowed_domains;
+
+retry:
+	amdgpu_ttm_placement_from_domain(bo, domain);
+	initial_bytes_moved = atomic64_read(&bo->adev->num_bytes_moved);
+	r = ttm_bo_validate(&bo->tbo, &bo->placement, true, false);
+	p->bytes_moved += atomic64_read(&bo->adev->num_bytes_moved) -
+		initial_bytes_moved;
+
+	if (unlikely(r == -ENOMEM) && domain != bo->allowed_domains) {
+		domain = bo->allowed_domains;
+		goto retry;
+	}
+
+	return r;
 }
 
-int amdgpu_cs_list_validate(struct amdgpu_cs_parser *p,
+/* Last resort, try to evict something from the current working set */
+static bool amdgpu_cs_try_evict(struct amdgpu_cs_parser *p,
+				struct amdgpu_bo_list_entry *lobj)
+{
+	uint32_t domain = lobj->robj->allowed_domains;
+	int r;
+
+	if (!p->evictable)
+		return false;
+
+	for (;&p->evictable->tv.head != &p->validated;
+	     p->evictable = list_prev_entry(p->evictable, tv.head)) {
+
+		struct amdgpu_bo_list_entry *candidate = p->evictable;
+		struct amdgpu_bo *bo = candidate->robj;
+		u64 initial_bytes_moved;
+		uint32_t other;
+
+		/* If we reached our current BO we can forget it */
+		if (candidate == lobj)
+			break;
+
+		other = amdgpu_mem_type_to_domain(bo->tbo.mem.mem_type);
+
+		/* Check if this BO is in one of the domains we need space for */
+		if (!(other & domain))
+			continue;
+
+		/* Check if we can move this BO somewhere else */
+		other = bo->allowed_domains & ~domain;
+		if (!other)
+			continue;
+
+		/* Good we can try to move this BO somewhere else */
+		amdgpu_ttm_placement_from_domain(bo, other);
+		initial_bytes_moved = atomic64_read(&bo->adev->num_bytes_moved);
+		r = ttm_bo_validate(&bo->tbo, &bo->placement, true, false);
+		p->bytes_moved += atomic64_read(&bo->adev->num_bytes_moved) -
+			initial_bytes_moved;
+
+		if (unlikely(r))
+			break;
+
+		p->evictable = list_prev_entry(p->evictable, tv.head);
+		list_move(&candidate->tv.head, &p->validated);
+
+		return true;
+	}
+
+	return false;
+}
+
+static int amdgpu_cs_list_validate(struct amdgpu_cs_parser *p,
 			    struct list_head *validated)
 {
 	struct amdgpu_bo_list_entry *lobj;
-	u64 initial_bytes_moved;
 	int r;
 
 	list_for_each_entry(lobj, validated, tv.head) {
 		struct amdgpu_bo *bo = lobj->robj;
 		bool binding_userptr = false;
 		struct mm_struct *usermm;
-		uint32_t domain;
 
 		usermm = amdgpu_ttm_tt_get_usermm(bo->tbo.ttm);
 		if (usermm && usermm != current->mm)
@@ -316,35 +461,19 @@ int amdgpu_cs_list_validate(struct amdgpu_cs_parser *p,
 			binding_userptr = true;
 		}
 
-		if (bo->pin_count)
-			continue;
-
-		/* Avoid moving this one if we have moved too many buffers
-		 * for this IB already.
-		 *
-		 * Note that this allows moving at least one buffer of
-		 * any size, because it doesn't take the current "bo"
-		 * into account. We don't want to disallow buffer moves
-		 * completely.
-		 */
-		if (p->bytes_moved <= p->bytes_moved_threshold)
-			domain = bo->prefered_domains;
-		else
-			domain = bo->allowed_domains;
-
-	retry:
-		amdgpu_ttm_placement_from_domain(bo, domain);
-		initial_bytes_moved = atomic64_read(&bo->adev->num_bytes_moved);
-		r = ttm_bo_validate(&bo->tbo, &bo->placement, true, false);
-		p->bytes_moved += atomic64_read(&bo->adev->num_bytes_moved) -
-			       initial_bytes_moved;
+		if (p->evictable == lobj)
+			p->evictable = NULL;
 
-		if (unlikely(r)) {
-			if (r != -ERESTARTSYS && domain != bo->allowed_domains) {
-				domain = bo->allowed_domains;
-				goto retry;
-			}
+		do {
+			r = amdgpu_cs_bo_validate(p, bo);
+		} while (r == -ENOMEM && amdgpu_cs_try_evict(p, lobj));
+		if (r)
 			return r;
+
+		if (bo->shadow) {
+			r = amdgpu_cs_bo_validate(p, bo);
+			if (r)
+				return r;
 		}
 
 		if (binding_userptr) {
@@ -389,8 +518,10 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
 
 		r = ttm_eu_reserve_buffers(&p->ticket, &p->validated, true,
 					   &duplicates);
-		if (unlikely(r != 0))
+		if (unlikely(r != 0)) {
+			DRM_ERROR("ttm_eu_reserve_buffers failed.\n");
 			goto error_free_pages;
+		}
 
 		/* Without a BO list we don't have userptr BOs */
 		if (!p->bo_list)
@@ -430,9 +561,10 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
 		/* Unreserve everything again. */
 		ttm_eu_backoff_reservation(&p->ticket, &p->validated);
 
-		/* We tried to often, just abort */
+		/* We tried too many times, just abort */
 		if (!--tries) {
 			r = -EDEADLK;
+			DRM_ERROR("deadlock in %s\n", __func__);
 			goto error_free_pages;
 		}
 
@@ -444,11 +576,13 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
 							 sizeof(struct page*));
 			if (!e->user_pages) {
 				r = -ENOMEM;
+				DRM_ERROR("calloc failure in %s\n", __func__);
 				goto error_free_pages;
 			}
 
 			r = amdgpu_ttm_tt_get_user_pages(ttm, e->user_pages);
 			if (r) {
+				DRM_ERROR("amdgpu_ttm_tt_get_user_pages failed.\n");
 				drm_free_large(e->user_pages);
 				e->user_pages = NULL;
 				goto error_free_pages;
@@ -459,18 +593,30 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
 		list_splice(&need_pages, &p->validated);
 	}
 
-	amdgpu_vm_get_pt_bos(&fpriv->vm, &duplicates);
+	amdgpu_vm_get_pt_bos(p->adev, &fpriv->vm, &duplicates);
 
 	p->bytes_moved_threshold = amdgpu_cs_get_threshold_for_moves(p->adev);
 	p->bytes_moved = 0;
+	p->evictable = list_last_entry(&p->validated,
+				       struct amdgpu_bo_list_entry,
+				       tv.head);
 
 	r = amdgpu_cs_list_validate(p, &duplicates);
-	if (r)
+	if (r) {
+		DRM_ERROR("amdgpu_cs_list_validate(duplicates) failed.\n");
 		goto error_validate;
+	}
 
 	r = amdgpu_cs_list_validate(p, &p->validated);
-	if (r)
+	if (r) {
+		DRM_ERROR("amdgpu_cs_list_validate(validated) failed.\n");
 		goto error_validate;
+	}
+
+	amdgpu_cs_report_moved_bytes(p->adev, p->bytes_moved);
+
+	fpriv->vm.last_eviction_counter =
+		atomic64_read(&p->adev->num_evictions);
 
 	if (p->bo_list) {
 		struct amdgpu_bo *gds = p->bo_list->gds_obj;
@@ -499,6 +645,13 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
 		}
 	}
 
+	if (!r && p->uf_entry.robj) {
+		struct amdgpu_bo *uf = p->uf_entry.robj;
+
+		r = amdgpu_ttm_bind(&uf->tbo, &uf->tbo.mem);
+		p->job->uf_addr += amdgpu_bo_gpu_offset(uf);
+	}
+
 error_validate:
 	if (r) {
 		amdgpu_vm_move_pt_bos_in_lru(p->adev, &fpriv->vm);
@@ -614,7 +767,7 @@ static int amdgpu_bo_vm_update_pte(struct amdgpu_cs_parser *p,
 			if (bo_va == NULL)
 				continue;
 
-			r = amdgpu_vm_bo_update(adev, bo_va, &bo->tbo.mem);
+			r = amdgpu_vm_bo_update(adev, bo_va, false);
 			if (r)
 				return r;
 
@@ -653,18 +806,21 @@ static int amdgpu_cs_ib_vm_chunk(struct amdgpu_device *adev,
 
 	/* Only for UVD/VCE VM emulation */
 	if (ring->funcs->parse_cs) {
+		p->job->vm = NULL;
 		for (i = 0; i < p->job->num_ibs; i++) {
 			r = amdgpu_ring_parse_cs(ring, p, i);
 			if (r)
 				return r;
 		}
-	}
+	} else {
+		p->job->vm_pd_addr = amdgpu_bo_gpu_offset(vm->page_directory);
 
-	r = amdgpu_bo_vm_update_pte(p, vm);
-	if (!r)
-		amdgpu_cs_sync_rings(p);
+		r = amdgpu_bo_vm_update_pte(p, vm);
+		if (r)
+			return r;
+	}
 
-	return r;
+	return amdgpu_cs_sync_rings(p);
 }
 
 static int amdgpu_cs_handle_lockup(struct amdgpu_device *adev, int r)
@@ -704,6 +860,14 @@ static int amdgpu_cs_ib_fill(struct amdgpu_device *adev,
 		if (r)
 			return r;
 
+		if (ib->flags & AMDGPU_IB_FLAG_PREAMBLE) {
+			parser->job->preamble_status |= AMDGPU_PREAMBLE_IB_PRESENT;
+			if (!parser->ctx->preamble_presented) {
+				parser->job->preamble_status |= AMDGPU_PREAMBLE_IB_PRESENT_FIRST;
+				parser->ctx->preamble_presented = true;
+			}
+		}
+
 		if (parser->job->ring && parser->job->ring != ring)
 			return -EINVAL;
 
@@ -761,7 +925,7 @@ static int amdgpu_cs_ib_fill(struct amdgpu_device *adev,
 	}
 
 	/* UVD & VCE fw doesn't support user fences */
-	if (parser->job->uf_bo && (
+	if (parser->job->uf_addr && (
 	    parser->job->ring->type == AMDGPU_RING_TYPE_UVD ||
 	    parser->job->ring->type == AMDGPU_RING_TYPE_VCE))
 		return -EINVAL;
@@ -830,27 +994,24 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
 {
 	struct amdgpu_ring *ring = p->job->ring;
 	struct amd_sched_entity *entity = &p->ctx->rings[ring->idx].entity;
-	struct fence *fence;
 	struct amdgpu_job *job;
 	int r;
 
 	job = p->job;
 	p->job = NULL;
 
-	r = amd_sched_job_init(&job->base, &ring->sched,
-			       entity, amdgpu_job_timeout_func,
-			       amdgpu_job_free_func,
-			       p->filp, &fence);
+	r = amd_sched_job_init(&job->base, &ring->sched, entity, p->filp);
 	if (r) {
 		amdgpu_job_free(job);
 		return r;
 	}
 
 	job->owner = p->filp;
-	job->ctx = entity->fence_context;
-	p->fence = fence_get(fence);
-	cs->out.handle = amdgpu_ctx_add_fence(p->ctx, ring, fence);
+	job->fence_ctx = entity->fence_context;
+	p->fence = fence_get(&job->base.s_fence->finished);
+	cs->out.handle = amdgpu_ctx_add_fence(p->ctx, ring, p->fence);
 	job->uf_sequence = cs->out.handle;
+	amdgpu_job_free_resources(job);
 
 	trace_amdgpu_cs_ioctl(job);
 	amd_sched_entity_push_job(&job->base);
@@ -1012,3 +1173,29 @@ amdgpu_cs_find_mapping(struct amdgpu_cs_parser *parser,
 
 	return NULL;
 }
+
+/**
+ * amdgpu_cs_sysvm_access_required - make BOs accessible by the system VM
+ *
+ * @parser: command submission parser context
+ *
+ * Helper for UVD/VCE VM emulation, make sure BOs are accessible by the system VM.
+ */
+int amdgpu_cs_sysvm_access_required(struct amdgpu_cs_parser *parser)
+{
+	unsigned i;
+	int r;
+
+	if (!parser->bo_list)
+		return 0;
+
+	for (i = 0; i < parser->bo_list->num_entries; i++) {
+		struct amdgpu_bo *bo = parser->bo_list->array[i].robj;
+
+		r = amdgpu_ttm_bind(&bo->tbo, &bo->tbo.mem);
+		if (unlikely(r))
+			return r;
+	}
+
+	return 0;
+}
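The new mm_stats code in this diff treats elapsed time as the budget for buffer moves: accumulated microseconds are capped at 200 ms, converted to bytes by shifting with log2_max_MBps, and every reported move is subtracted from the budget, which can go negative (debt) after an oversized move. The standalone C sketch below models only that arithmetic; the struct, the helper names and the sample log2_max_MBps value are illustrative assumptions for this example, not code from the driver, and the free-VRAM boost from the patch is left out.

```c
#include <stdio.h>
#include <stdint.h>

/* Illustrative stand-in for the mm_stats fields the patch adds to the
 * device structure; field names follow the patch, the rest is made up. */
struct mm_stats_model {
	int64_t accum_us;	/* accumulated microseconds: the "currency" */
	int64_t last_update_us;
	unsigned log2_max_MBps;	/* assumed migration speed: 2^n MB/s */
};

static const int64_t US_UPPER_BOUND = 200000;	/* 200 ms cap, as in the patch */

/* 1 MB/s moves roughly one byte per microsecond, so us -> bytes is a shift. */
static uint64_t us_to_bytes_model(const struct mm_stats_model *s, int64_t us)
{
	if (us <= 0 || !s->log2_max_MBps)
		return 0;
	return (uint64_t)us << s->log2_max_MBps;
}

static int64_t bytes_to_us_model(const struct mm_stats_model *s, uint64_t bytes)
{
	if (!s->log2_max_MBps)
		return 0;
	return (int64_t)(bytes >> s->log2_max_MBps);
}

/* Accumulate elapsed time (capped at 200 ms) and return the move budget. */
static uint64_t threshold_for_moves(struct mm_stats_model *s, int64_t now_us)
{
	int64_t increment_us = now_us - s->last_update_us;

	s->last_update_us = now_us;
	s->accum_us += increment_us;
	if (s->accum_us > US_UPPER_BOUND)
		s->accum_us = US_UPPER_BOUND;

	/* A non-positive accum_us means earlier moves are still being repaid. */
	return us_to_bytes_model(s, s->accum_us);
}

/* Subtract what was actually moved; this may push accum_us negative (debt). */
static void report_moved_bytes(struct mm_stats_model *s, uint64_t bytes)
{
	s->accum_us -= bytes_to_us_model(s, bytes);
}

int main(void)
{
	/* Assume a device that can migrate 2^13 MB/s (roughly 8 GB/s). */
	struct mm_stats_model s = { 0, 0, 13 };

	/* First submission, 50 ms after the last update. */
	uint64_t budget = threshold_for_moves(&s, 50000);
	printf("budget after 50 ms: %llu bytes\n", (unsigned long long)budget);

	/* Move twice the budget: the counter goes into debt. */
	report_moved_bytes(&s, 2 * budget);
	printf("accum_us after overshoot: %lld\n", (long long)s.accum_us);

	/* 10 ms later the budget is still zero until the debt is repaid. */
	printf("budget after 10 more ms: %llu bytes\n",
	       (unsigned long long)threshold_for_moves(&s, 60000));
	return 0;
}
```

Compiled with a plain C compiler and run, this prints a budget of about 400 MB after 50 ms, a negative accum_us after the oversized move, and a zero budget 10 ms later, which is the debt behaviour the comment block in the patch describes.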

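The validation changes in the diff follow a per-buffer policy: try the preferred domain while the move budget lasts, retry with the full allowed set on -ENOMEM, and as a last resort evict a buffer further down the validation list that still occupies the wanted domain from an earlier submission but is allowed to live elsewhere, then try again. The toy C program below mirrors that control flow with bitmask domains and a fixed number of VRAM slots instead of TTM objects; every type, name and number in it is a made-up stand-in for illustration, not the driver's API, and it rescans the list instead of keeping the patch's persistent p->evictable cursor.

```c
#include <stdbool.h>
#include <stdio.h>

#define DOMAIN_VRAM 0x1u
#define DOMAIN_GTT  0x2u

/* Toy stand-in for a buffer object on the validation list. */
struct toy_bo {
	const char *name;
	unsigned preferred;	/* where we would like it to be */
	unsigned allowed;	/* where it may legally live */
	unsigned placed;	/* where it currently is (0 = nowhere yet) */
};

/* Two VRAM slots in total; see main() for who holds them at start. */
static int vram_slots = 0;

/* Try to place a buffer in one of the requested domains. */
static bool toy_place(struct toy_bo *bo, unsigned domain)
{
	if ((domain & DOMAIN_VRAM) &&
	    ((bo->placed & DOMAIN_VRAM) || vram_slots > 0)) {
		if (!(bo->placed & DOMAIN_VRAM))
			vram_slots--;
		bo->placed = DOMAIN_VRAM;
		return true;
	}
	if (domain & DOMAIN_GTT) {
		if (bo->placed & DOMAIN_VRAM)
			vram_slots++;
		bo->placed = DOMAIN_GTT;
		return true;
	}
	return false;	/* models -ENOMEM: no room in any requested domain */
}

/* Per-BO step: preferred placement while the budget lasts, then fall back
 * to the full allowed set before giving up. */
static bool toy_bo_validate(struct toy_bo *bo, bool budget_left)
{
	unsigned domain = budget_left ? bo->preferred : bo->allowed;

retry:
	if (toy_place(bo, domain))
		return true;
	if (domain != bo->allowed) {
		domain = bo->allowed;
		goto retry;
	}
	return false;	/* still no room: the caller may try to evict */
}

/* Last resort: move a not-yet-validated buffer out of the wanted domain
 * if it is allowed to live somewhere else. */
static bool toy_try_evict(struct toy_bo *list, int count, int cur,
			  unsigned wanted)
{
	for (int i = count - 1; i > cur; i--) {
		unsigned fallback = list[i].allowed & ~wanted;

		if (!(list[i].placed & wanted) || !fallback)
			continue;
		if (toy_place(&list[i], fallback)) {
			printf("evicted %s to make room\n", list[i].name);
			return true;
		}
	}
	return false;
}

int main(void)
{
	/* B and C start out in VRAM from a previous submission, so both
	 * VRAM slots are taken; A is new and may only live in VRAM. */
	struct toy_bo list[] = {
		{ "A", DOMAIN_VRAM, DOMAIN_VRAM,              0 },
		{ "B", DOMAIN_VRAM, DOMAIN_VRAM,              DOMAIN_VRAM },
		{ "C", DOMAIN_VRAM, DOMAIN_VRAM | DOMAIN_GTT, DOMAIN_VRAM },
	};
	int count = 3;

	for (int i = 0; i < count; i++) {
		bool ok;

		/* Same retry loop as the patch: validate, evict, validate again. */
		do {
			ok = toy_bo_validate(&list[i], true);
		} while (!ok && toy_try_evict(list, count, i, list[i].allowed));

		printf("%s -> %s\n", list[i].name,
		       ok ? (list[i].placed & DOMAIN_VRAM ? "VRAM" : "GTT")
			  : "failed");
	}
	return 0;
}
```

Running it prints that C is pushed out to GTT so A can take its VRAM slot, and that C then falls back to GTT when its own turn comes, illustrating the two mechanisms the patch combines: eviction of buffers that can live elsewhere, and the allowed-domain fallback once the preferred placement fails.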