Diffstat (limited to 'drivers/gpu')
-rw-r--r-- | drivers/gpu/drm/i915/i915_gem_context.c       | 31
-rw-r--r-- | drivers/gpu/drm/i915/i915_gem_context_types.h |  2
-rw-r--r-- | drivers/gpu/drm/i915/i915_request.c           | 80
-rw-r--r-- | drivers/gpu/drm/i915/i915_request.h           |  5
-rw-r--r-- | drivers/gpu/drm/i915/i915_sw_fence.c          | 39
-rw-r--r-- | drivers/gpu/drm/i915/i915_sw_fence.h          | 13
-rw-r--r-- | drivers/gpu/drm/i915/intel_lrc.c              |  5
-rw-r--r-- | drivers/gpu/drm/i915/selftests/mock_context.c |  2
8 files changed, 136 insertions, 41 deletions
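
This patch lets a context opt in to a single timeline shared by all of its engines: i915_gem_create_context() gains a flags argument, and I915_CONTEXT_CREATE_FLAGS_SINGLE_TIMELINE allocates one i915_timeline that every logical ring in the context then reuses (see get_timeline() in intel_lrc.c below). A minimal userspace sketch of requesting such a context follows; it assumes the uapi half of the series (the flag definition and struct drm_i915_gem_context_create_ext in <drm/i915_drm.h>), which does not appear in this view because the diffstat is limited to drivers/gpu.

/* Sketch only: create a context whose engines share one timeline.
 * Assumes the uapi additions from this series in <drm/i915_drm.h>.
 */
#include <errno.h>
#include <sys/ioctl.h>
#include <drm/i915_drm.h>

static int create_single_timeline_context(int drm_fd, __u32 *ctx_id)
{
	struct drm_i915_gem_context_create_ext arg = {
		.flags = I915_CONTEXT_CREATE_FLAGS_SINGLE_TIMELINE,
	};

	/* Rejected with -EINVAL on platforms without execlists; see the
	 * HAS_EXECLISTS() check added to i915_gem_create_context() below.
	 */
	if (ioctl(drm_fd, DRM_IOCTL_I915_GEM_CONTEXT_CREATE_EXT, &arg))
		return -errno;

	*ctx_id = arg.ctx_id;
	return 0;
}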
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index 68f9c7c7bd6c..39ca3cb6af66 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -238,6 +238,9 @@ static void i915_gem_context_free(struct i915_gem_context *ctx)
 	rbtree_postorder_for_each_entry_safe(it, n, &ctx->hw_contexts, node)
 		intel_context_put(it);
 
+	if (ctx->timeline)
+		i915_timeline_put(ctx->timeline);
+
 	kfree(ctx->name);
 	put_pid(ctx->pid);
 
@@ -403,12 +406,16 @@ static void __assign_ppgtt(struct i915_gem_context *ctx,
 }
 
 static struct i915_gem_context *
-i915_gem_create_context(struct drm_i915_private *dev_priv)
+i915_gem_create_context(struct drm_i915_private *dev_priv, unsigned int flags)
 {
 	struct i915_gem_context *ctx;
 
 	lockdep_assert_held(&dev_priv->drm.struct_mutex);
 
+	if (flags & I915_CONTEXT_CREATE_FLAGS_SINGLE_TIMELINE &&
+	    !HAS_EXECLISTS(dev_priv))
+		return ERR_PTR(-EINVAL);
+
 	/* Reap the most stale context */
 	contexts_free_first(dev_priv);
 
@@ -431,6 +438,18 @@ i915_gem_create_context(struct drm_i915_private *dev_priv)
 		i915_ppgtt_put(ppgtt);
 	}
 
+	if (flags & I915_CONTEXT_CREATE_FLAGS_SINGLE_TIMELINE) {
+		struct i915_timeline *timeline;
+
+		timeline = i915_timeline_create(dev_priv, NULL);
+		if (IS_ERR(timeline)) {
+			context_close(ctx);
+			return ERR_CAST(timeline);
+		}
+
+		ctx->timeline = timeline;
+	}
+
 	trace_i915_context_create(ctx);
 
 	return ctx;
@@ -459,7 +478,7 @@ i915_gem_context_create_gvt(struct drm_device *dev)
 	if (ret)
 		return ERR_PTR(ret);
 
-	ctx = i915_gem_create_context(to_i915(dev));
+	ctx = i915_gem_create_context(to_i915(dev), 0);
 	if (IS_ERR(ctx))
 		goto out;
 
@@ -495,7 +514,7 @@ i915_gem_context_create_kernel(struct drm_i915_private *i915, int prio)
 	struct i915_gem_context *ctx;
 	int err;
 
-	ctx = i915_gem_create_context(i915);
+	ctx = i915_gem_create_context(i915, 0);
 	if (IS_ERR(ctx))
 		return ctx;
 
@@ -658,7 +677,7 @@ int i915_gem_context_open(struct drm_i915_private *i915,
 	idr_init_base(&file_priv->vm_idr, 1);
 
 	mutex_lock(&i915->drm.struct_mutex);
-	ctx = i915_gem_create_context(i915);
+	ctx = i915_gem_create_context(i915, 0);
 	mutex_unlock(&i915->drm.struct_mutex);
 	if (IS_ERR(ctx)) {
 		err = PTR_ERR(ctx);
@@ -800,7 +819,7 @@ last_request_on_engine(struct i915_timeline *timeline,
 
 	rq = i915_active_request_raw(&timeline->last_request,
 				     &engine->i915->drm.struct_mutex);
-	if (rq && rq->engine == engine) {
+	if (rq && rq->engine->mask & engine->mask) {
 		GEM_TRACE("last request on engine %s: %llx:%llu\n",
 			  engine->name, rq->fence.context, rq->fence.seqno);
 		GEM_BUG_ON(rq->timeline != timeline);
@@ -1520,7 +1539,7 @@ int i915_gem_context_create_ioctl(struct drm_device *dev, void *data,
 	if (ret)
 		return ret;
 
-	ext_data.ctx = i915_gem_create_context(i915);
+	ext_data.ctx = i915_gem_create_context(i915, args->flags);
 	mutex_unlock(&dev->struct_mutex);
 	if (IS_ERR(ext_data.ctx))
 		return PTR_ERR(ext_data.ctx);
diff --git a/drivers/gpu/drm/i915/i915_gem_context_types.h b/drivers/gpu/drm/i915/i915_gem_context_types.h
index 63ae8eb21939..e2ec58b10fb2 100644
--- a/drivers/gpu/drm/i915/i915_gem_context_types.h
+++ b/drivers/gpu/drm/i915/i915_gem_context_types.h
@@ -41,6 +41,8 @@ struct i915_gem_context {
 	/** file_priv: owning file descriptor */
 	struct drm_i915_file_private *file_priv;
 
+	struct i915_timeline *timeline;
+
 	/**
 	 * @ppgtt: unique address space (GTT)
 	 *
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index 1529824d7c61..e9c2094ab8ea 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -992,6 +992,60 @@ void i915_request_skip(struct i915_request *rq, int error)
 	memset(vaddr + head, 0, rq->postfix - head);
 }
 
+static struct i915_request *
+__i915_request_add_to_timeline(struct i915_request *rq)
+{
+	struct i915_timeline *timeline = rq->timeline;
+	struct i915_request *prev;
+
+	/*
+	 * Dependency tracking and request ordering along the timeline
+	 * is special cased so that we can eliminate redundant ordering
+	 * operations while building the request (we know that the timeline
+	 * itself is ordered, and here we guarantee it).
+	 *
+	 * As we know we will need to emit tracking along the timeline,
+	 * we embed the hooks into our request struct -- at the cost of
+	 * having to have specialised no-allocation interfaces (which will
+	 * be beneficial elsewhere).
+	 *
+	 * A second benefit to open-coding i915_request_await_request is
+	 * that we can apply a slight variant of the rules specialised
+	 * for timelines that jump between engines (such as virtual engines).
+	 * If we consider the case of virtual engine, we must emit a dma-fence
+	 * to prevent scheduling of the second request until the first is
+	 * complete (to maximise our greedy late load balancing) and this
+	 * precludes optimising to use semaphores serialisation of a single
+	 * timeline across engines.
+	 */
+	prev = i915_active_request_raw(&timeline->last_request,
+				       &rq->i915->drm.struct_mutex);
+	if (prev && !i915_request_completed(prev)) {
+		if (is_power_of_2(prev->engine->mask | rq->engine->mask))
+			i915_sw_fence_await_sw_fence(&rq->submit,
+						     &prev->submit,
+						     &rq->submitq);
+		else
+			__i915_sw_fence_await_dma_fence(&rq->submit,
+							&prev->fence,
+							&rq->dmaq);
+		if (rq->engine->schedule)
+			__i915_sched_node_add_dependency(&rq->sched,
+							 &prev->sched,
+							 &rq->dep,
+							 0);
+	}
+
+	spin_lock_irq(&timeline->lock);
+	list_add_tail(&rq->link, &timeline->requests);
+	spin_unlock_irq(&timeline->lock);
+
+	GEM_BUG_ON(timeline->seqno != rq->fence.seqno);
+	__i915_active_request_set(&timeline->last_request, rq);
+
+	return prev;
+}
+
 /*
  * NB: This function is not allowed to fail. Doing so would mean the the
  * request is not being tracked for completion but the work itself is
@@ -1036,31 +1090,7 @@ void i915_request_add(struct i915_request *request)
 	GEM_BUG_ON(IS_ERR(cs));
 	request->postfix = intel_ring_offset(request, cs);
 
-	/*
-	 * Seal the request and mark it as pending execution. Note that
-	 * we may inspect this state, without holding any locks, during
-	 * hangcheck. Hence we apply the barrier to ensure that we do not
-	 * see a more recent value in the hws than we are tracking.
-	 */
-
-	prev = i915_active_request_raw(&timeline->last_request,
-				       &request->i915->drm.struct_mutex);
-	if (prev && !i915_request_completed(prev)) {
-		i915_sw_fence_await_sw_fence(&request->submit, &prev->submit,
-					     &request->submitq);
-		if (engine->schedule)
-			__i915_sched_node_add_dependency(&request->sched,
-							 &prev->sched,
-							 &request->dep,
-							 0);
-	}
-
-	spin_lock_irq(&timeline->lock);
-	list_add_tail(&request->link, &timeline->requests);
-	spin_unlock_irq(&timeline->lock);
-
-	GEM_BUG_ON(timeline->seqno != request->fence.seqno);
-	__i915_active_request_set(&timeline->last_request, request);
+	prev = __i915_request_add_to_timeline(request);
 
 	list_add_tail(&request->ring_link, &ring->request_list);
 	if (list_is_first(&request->ring_link, &ring->request_list))
diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
index 8c8fa5010644..cd6c130964cd 100644
--- a/drivers/gpu/drm/i915/i915_request.h
+++ b/drivers/gpu/drm/i915/i915_request.h
@@ -128,7 +128,10 @@ struct i915_request {
 	 * It is used by the driver to then queue the request for execution.
 	 */
 	struct i915_sw_fence submit;
-	wait_queue_entry_t submitq;
+	union {
+		wait_queue_entry_t submitq;
+		struct i915_sw_dma_fence_cb dmaq;
+	};
 	struct list_head execute_cb;
 
 	/*
diff --git a/drivers/gpu/drm/i915/i915_sw_fence.c b/drivers/gpu/drm/i915/i915_sw_fence.c
index 8d1400d378d7..5387aafd3424 100644
--- a/drivers/gpu/drm/i915/i915_sw_fence.c
+++ b/drivers/gpu/drm/i915/i915_sw_fence.c
@@ -359,11 +359,6 @@ int i915_sw_fence_await_sw_fence_gfp(struct i915_sw_fence *fence,
 	return __i915_sw_fence_await_sw_fence(fence, signaler, NULL, gfp);
 }
 
-struct i915_sw_dma_fence_cb {
-	struct dma_fence_cb base;
-	struct i915_sw_fence *fence;
-};
-
 struct i915_sw_dma_fence_cb_timer {
 	struct i915_sw_dma_fence_cb base;
 	struct dma_fence *dma;
@@ -480,6 +475,40 @@ int i915_sw_fence_await_dma_fence(struct i915_sw_fence *fence,
 	return ret;
 }
 
+static void __dma_i915_sw_fence_wake(struct dma_fence *dma,
+				     struct dma_fence_cb *data)
+{
+	struct i915_sw_dma_fence_cb *cb = container_of(data, typeof(*cb), base);
+
+	i915_sw_fence_complete(cb->fence);
+}
+
+int __i915_sw_fence_await_dma_fence(struct i915_sw_fence *fence,
+				    struct dma_fence *dma,
+				    struct i915_sw_dma_fence_cb *cb)
+{
+	int ret;
+
+	debug_fence_assert(fence);
+
+	if (dma_fence_is_signaled(dma))
+		return 0;
+
+	cb->fence = fence;
+	i915_sw_fence_await(fence);
+
+	ret = dma_fence_add_callback(dma, &cb->base, __dma_i915_sw_fence_wake);
+	if (ret == 0) {
+		ret = 1;
+	} else {
+		i915_sw_fence_complete(fence);
+		if (ret == -ENOENT) /* fence already signaled */
+			ret = 0;
+	}
+
+	return ret;
+}
+
 int i915_sw_fence_await_reservation(struct i915_sw_fence *fence,
 				    struct reservation_object *resv,
 				    const struct dma_fence_ops *exclude,
diff --git a/drivers/gpu/drm/i915/i915_sw_fence.h b/drivers/gpu/drm/i915/i915_sw_fence.h
index 6dec9e1d1102..9cb5c3b307a6 100644
--- a/drivers/gpu/drm/i915/i915_sw_fence.h
+++ b/drivers/gpu/drm/i915/i915_sw_fence.h
@@ -9,14 +9,13 @@
 #ifndef _I915_SW_FENCE_H_
 #define _I915_SW_FENCE_H_
 
+#include <linux/dma-fence.h>
 #include <linux/gfp.h>
 #include <linux/kref.h>
 #include <linux/notifier.h> /* for NOTIFY_DONE */
 #include <linux/wait.h>
 
 struct completion;
-struct dma_fence;
-struct dma_fence_ops;
 struct reservation_object;
 
 struct i915_sw_fence {
@@ -68,10 +67,20 @@ int i915_sw_fence_await_sw_fence(struct i915_sw_fence *fence,
 int i915_sw_fence_await_sw_fence_gfp(struct i915_sw_fence *fence,
 				     struct i915_sw_fence *after,
 				     gfp_t gfp);
+
+struct i915_sw_dma_fence_cb {
+	struct dma_fence_cb base;
+	struct i915_sw_fence *fence;
+};
+
+int __i915_sw_fence_await_dma_fence(struct i915_sw_fence *fence,
+				    struct dma_fence *dma,
+				    struct i915_sw_dma_fence_cb *cb);
 int i915_sw_fence_await_dma_fence(struct i915_sw_fence *fence,
 				  struct dma_fence *dma,
 				  unsigned long timeout,
 				  gfp_t gfp);
+
 int i915_sw_fence_await_reservation(struct i915_sw_fence *fence,
 				    struct reservation_object *resv,
 				    const struct dma_fence_ops *exclude,
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 35f7ef9e75c8..66bc3cd4e166 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -2802,7 +2802,10 @@ err_unpin_ctx:
 
 static struct i915_timeline *get_timeline(struct i915_gem_context *ctx)
 {
-	return i915_timeline_create(ctx->i915, NULL);
+	if (ctx->timeline)
+		return i915_timeline_get(ctx->timeline);
+	else
+		return i915_timeline_create(ctx->i915, NULL);
 }
 
 static int execlists_context_deferred_alloc(struct intel_context *ce,
diff --git a/drivers/gpu/drm/i915/selftests/mock_context.c b/drivers/gpu/drm/i915/selftests/mock_context.c
index 81e5ace18b81..0426093bf1d9 100644
--- a/drivers/gpu/drm/i915/selftests/mock_context.c
+++ b/drivers/gpu/drm/i915/selftests/mock_context.c
@@ -97,7 +97,7 @@ live_context(struct drm_i915_private *i915, struct drm_file *file)
 
 	lockdep_assert_held(&i915->drm.struct_mutex);
 
-	ctx = i915_gem_create_context(i915);
+	ctx = i915_gem_create_context(i915, 0);
 	if (IS_ERR(ctx))
 		return ctx;
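
Two implementation details are worth noting. First, __i915_request_add_to_timeline() chooses between the two await paths with is_power_of_2(prev->engine->mask | rq->engine->mask): each physical engine owns exactly one bit in its mask, while a virtual engine carries the union of its siblings' bits, so the OR collapses to a single bit only when both requests are confined to one and the same physical engine, where the cheap submit-fence ordering suffices. Second, __i915_sw_fence_await_dma_fence() returns 1 when the await was installed, 0 when the fence had already signaled, or a negative error code, and it uses the callback embedded in the request (rq->dmaq) so that no allocation is needed on this path. A standalone sketch of the mask predicate, with illustrative (not the driver's actual) engine mask values:

#include <stdbool.h>
#include <stdio.h>

/* Same test as the kernel's is_power_of_2() from <linux/log2.h>. */
static bool is_power_of_2(unsigned long n)
{
	return n != 0 && (n & (n - 1)) == 0;
}

int main(void)
{
	/* Illustrative masks: one bit per physical engine. */
	unsigned long rcs0 = 1UL << 0;
	unsigned long vcs0 = 1UL << 1;
	unsigned long vcs1 = 1UL << 2;
	unsigned long veng = vcs0 | vcs1;	/* virtual engine: union of siblings */

	printf("%d\n", is_power_of_2(rcs0 | rcs0));	/* 1: same engine, submit fence */
	printf("%d\n", is_power_of_2(rcs0 | vcs0));	/* 0: cross-engine, dma-fence await */
	printf("%d\n", is_power_of_2(veng | vcs0));	/* 0: may run on either sibling */
	return 0;
}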