diff options
| -rw-r--r-- | drivers/gpu/drm/i915/i915_request.c | 59 | ||||
| -rw-r--r-- | drivers/gpu/drm/i915/intel_color.c | 6 | ||||
| -rw-r--r-- | drivers/gpu/drm/i915/intel_context.c | 1 | ||||
| -rw-r--r-- | drivers/gpu/drm/i915/intel_context_types.h | 3 | ||||
| -rw-r--r-- | drivers/gpu/drm/i915/intel_workarounds.c | 7 | 
5 files changed, 68 insertions, 8 deletions
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index b836721d3b13..ce342f7f7ddb 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -425,6 +425,26 @@ void __i915_request_submit(struct i915_request *request)
 	if (i915_gem_context_is_banned(request->gem_context))
 		i915_request_skip(request, -EIO);
 
+	/*
+	 * Are we using semaphores when the gpu is already saturated?
+	 *
+	 * Using semaphores incurs a cost in having the GPU poll a
+	 * memory location, busywaiting for it to change. The continual
+	 * memory reads can have a noticeable impact on the rest of the
+	 * system with the extra bus traffic, stalling the cpu as it too
+	 * tries to access memory across the bus (perf stat -e bus-cycles).
+	 *
+	 * If we installed a semaphore on this request and we only submit
+	 * the request after the signaler completed, that indicates the
+	 * system is overloaded and using semaphores at this time only
+	 * increases the amount of work we are doing. If so, we disable
+	 * further use of semaphores until we are idle again, whence we
+	 * optimistically try again.
+	 */
+	if (request->sched.semaphores &&
+	    i915_sw_fence_signaled(&request->semaphore))
+		request->hw_context->saturated |= request->sched.semaphores;
+
 	/* We may be recursing from the signal callback of another i915 fence */
 	spin_lock_nested(&request->lock, SINGLE_DEPTH_NESTING);
 
@@ -799,6 +819,39 @@ err_unreserve:
 }
 
 static int
+i915_request_await_start(struct i915_request *rq, struct i915_request *signal)
+{
+	if (list_is_first(&signal->ring_link, &signal->ring->request_list))
+		return 0;
+
+	signal = list_prev_entry(signal, ring_link);
+	if (i915_timeline_sync_is_later(rq->timeline, &signal->fence))
+		return 0;
+
+	return i915_sw_fence_await_dma_fence(&rq->submit,
+					     &signal->fence, 0,
+					     I915_FENCE_GFP);
+}
+
+static intel_engine_mask_t
+already_busywaiting(struct i915_request *rq)
+{
+	/*
+	 * Polling a semaphore causes bus traffic, delaying other users of
+	 * both the GPU and CPU. We want to limit the impact on others,
+	 * while taking advantage of early submission to reduce GPU
+	 * latency. Therefore we restrict ourselves to not using more
+	 * than one semaphore from each source, and not using a semaphore
+	 * if we have detected the engine is saturated (i.e. would not be
+	 * submitted early and cause bus traffic reading an already passed
+	 * semaphore).
+	 *
+	 * See the are-we-too-late? check in __i915_request_submit().
+	 */
+	return rq->sched.semaphores | rq->hw_context->saturated;
+}
+
+static int
 emit_semaphore_wait(struct i915_request *to,
 		    struct i915_request *from,
 		    gfp_t gfp)
@@ -811,11 +864,15 @@ emit_semaphore_wait(struct i915_request *to,
 	GEM_BUG_ON(INTEL_GEN(to->i915) < 8);
 
 	/* Just emit the first semaphore we see as request space is limited. */
-	if (to->sched.semaphores & from->engine->mask)
+	if (already_busywaiting(to) & from->engine->mask)
 		return i915_sw_fence_await_dma_fence(&to->submit,
 						     &from->fence, 0,
 						     I915_FENCE_GFP);
 
+	err = i915_request_await_start(to, from);
+	if (err < 0)
+		return err;
+
 	err = i915_sw_fence_await_dma_fence(&to->semaphore,
 					    &from->fence, 0,
 					    I915_FENCE_GFP);
diff --git a/drivers/gpu/drm/i915/intel_color.c b/drivers/gpu/drm/i915/intel_color.c
index ca341a9e47e6..9093daabc290 100644
--- a/drivers/gpu/drm/i915/intel_color.c
+++ b/drivers/gpu/drm/i915/intel_color.c
@@ -173,13 +173,13 @@ static void icl_update_output_csc(struct intel_crtc *crtc,
 	I915_WRITE(PIPE_CSC_OUTPUT_PREOFF_LO(pipe), preoff[2]);
 
 	I915_WRITE(PIPE_CSC_OUTPUT_COEFF_RY_GY(pipe), coeff[0] << 16 | coeff[1]);
-	I915_WRITE(PIPE_CSC_OUTPUT_COEFF_BY(pipe), coeff[2]);
+	I915_WRITE(PIPE_CSC_OUTPUT_COEFF_BY(pipe), coeff[2] << 16);
 
 	I915_WRITE(PIPE_CSC_OUTPUT_COEFF_RU_GU(pipe), coeff[3] << 16 | coeff[4]);
-	I915_WRITE(PIPE_CSC_OUTPUT_COEFF_BU(pipe), coeff[5]);
+	I915_WRITE(PIPE_CSC_OUTPUT_COEFF_BU(pipe), coeff[5] << 16);
 
 	I915_WRITE(PIPE_CSC_OUTPUT_COEFF_RV_GV(pipe), coeff[6] << 16 | coeff[7]);
-	I915_WRITE(PIPE_CSC_OUTPUT_COEFF_BV(pipe), coeff[8]);
+	I915_WRITE(PIPE_CSC_OUTPUT_COEFF_BV(pipe), coeff[8] << 16);
 
 	I915_WRITE(PIPE_CSC_OUTPUT_POSTOFF_HI(pipe), postoff[0]);
 	I915_WRITE(PIPE_CSC_OUTPUT_POSTOFF_ME(pipe), postoff[1]);
diff --git a/drivers/gpu/drm/i915/intel_context.c b/drivers/gpu/drm/i915/intel_context.c
index 8931e0fee873..924cc556223a 100644
--- a/drivers/gpu/drm/i915/intel_context.c
+++ b/drivers/gpu/drm/i915/intel_context.c
@@ -230,6 +230,7 @@ intel_context_init(struct intel_context *ce,
 	ce->gem_context = ctx;
 	ce->engine = engine;
 	ce->ops = engine->cops;
+	ce->saturated = 0;
 
 	INIT_LIST_HEAD(&ce->signal_link);
 	INIT_LIST_HEAD(&ce->signals);
diff --git a/drivers/gpu/drm/i915/intel_context_types.h b/drivers/gpu/drm/i915/intel_context_types.h
index 68b4ca1611e0..339c7437fe82 100644
--- a/drivers/gpu/drm/i915/intel_context_types.h
+++ b/drivers/gpu/drm/i915/intel_context_types.h
@@ -14,6 +14,7 @@
 #include <linux/types.h>
 
 #include "i915_active_types.h"
+#include "intel_engine_types.h"
 
 struct i915_gem_context;
 struct i915_vma;
@@ -58,6 +59,8 @@ struct intel_context {
 	atomic_t pin_count;
 	struct mutex pin_mutex; /* guards pinning and associated on-gpuing */
 
+	intel_engine_mask_t saturated; /* submitting semaphores too late? */
+
 	/**
 	 * active_tracker: Active tracker for the external rq activity
 	 * on this intel_context object.
diff --git a/drivers/gpu/drm/i915/intel_workarounds.c b/drivers/gpu/drm/i915/intel_workarounds.c
index ccaf63679435..9682dd575152 100644
--- a/drivers/gpu/drm/i915/intel_workarounds.c
+++ b/drivers/gpu/drm/i915/intel_workarounds.c
@@ -541,10 +541,6 @@ static void icl_ctx_workarounds_init(struct intel_engine_cs *engine)
 		WA_SET_BIT_MASKED(GEN7_ROW_CHICKEN2,
 				  GEN11_TDL_CLOCK_GATING_FIX_DISABLE);
 
-	/* WaEnableStateCacheRedirectToCS:icl */
-	WA_SET_BIT_MASKED(GEN9_SLICE_COMMON_ECO_CHICKEN1,
-			  GEN11_STATE_CACHE_REDIRECT_TO_CS);
-
 	/* Wa_2006665173:icl (pre-prod) */
 	if (IS_ICL_REVID(i915, ICL_REVID_A0, ICL_REVID_A0))
 		WA_SET_BIT_MASKED(GEN11_COMMON_SLICE_CHICKEN3,
@@ -1050,6 +1046,9 @@ static void icl_whitelist_build(struct i915_wa_list *w)
 
 	/* WaAllowUMDToModifySamplerMode:icl */
 	whitelist_reg(w, GEN10_SAMPLER_MODE);
+
+	/* WaEnableStateCacheRedirectToCS:icl */
+	whitelist_reg(w, GEN9_SLICE_COMMON_ECO_CHICKEN1);
 }
 
 void intel_engine_init_whitelist(struct intel_engine_cs *engine)
