From 6faf5916e6beb0dedb0fcbbafbaa152adeaea758 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 28 Dec 2018 14:07:35 +0000 Subject: drm/i915: Remove HW semaphores for gen7 inter-engine synchronisation The writing is on the wall for the existence of a single execution queue along each engine, and as a consequence we will not be able to track dependencies along the HW queue itself, i.e. we will not be able to use HW semaphores on gen7 as they use a global set of registers (and unlike gen8+ we can not effectively target memory to keep per-context seqno and dependencies). On the positive side, when we implement request reordering for gen7 we will likewise not be able to presume a simple execution queue and would require removing the current semaphore generation code. So this brings us another step closer to request reordering for ringbuffer submission! The negative side is that using interrupts to drive inter-engine synchronisation is much slower (4us -> 15us to do a nop on each of the 3 engines on ivb). This is much better than it was at the time of introducing the HW semaphores and, equally importantly, userspace weaned itself off intermixing dependent BLT/RENDER operations (the prime culprit was glyph rendering in UXA). So while we regress the microbenchmarks, it should not impact the user. References: https://bugs.freedesktop.org/show_bug.cgi?id=108888 Signed-off-by: Chris Wilson Reviewed-by: Mika Kuoppala Link: https://patchwork.freedesktop.org/patch/msgid/20181228140736.32606-2-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_timeline.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'drivers/gpu/drm/i915/i915_timeline.h') diff --git a/drivers/gpu/drm/i915/i915_timeline.h b/drivers/gpu/drm/i915/i915_timeline.h index ebd71b487220..38c1e15e927a 100644 --- a/drivers/gpu/drm/i915/i915_timeline.h +++ b/drivers/gpu/drm/i915/i915_timeline.h @@ -63,14 +63,6 @@ struct i915_timeline { * redundant and we can discard it without loss of generality. */ struct i915_syncmap *sync; - /** - * Separately to the inter-context seqno map above, we track the last - * barrier (e.g. semaphore wait) to the global engine timelines. Note - * that this tracks global_seqno rather than the context.seqno, and - * so it is subject to the limitations of hw wraparound and that we - * may need to revoke global_seqno (on pre-emption). - */ - u32 global_sync[I915_NUM_ENGINES]; struct list_head link; const char *name; -- cgit v1.2.3 From 1e345568e3b541e19202caadae8d2cb2237e7ed8 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 28 Jan 2019 10:23:56 +0000 Subject: drm/i915: Move list of timelines under its own lock Currently, the list of timelines is serialised by the struct_mutex, but to alleviate difficulties with using that mutex in future, move the list management under its own dedicated mutex.
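As an illustration of the new scheme, here is a minimal sketch of the lock-drop-and-restart walk that wait_for_timelines() below performs (a simplified, hypothetical fragment distilled from the diff; error handling elided): the dedicated mutex is held while walking the list, dropped around any blocking wait, and the walk restarts from the list head once the mutex is retaken, since the list may have changed in the meantime.

	mutex_lock(&gt->mutex);
	list_for_each_entry(tl, &gt->list, link) {
		struct i915_request *rq;

		rq = i915_gem_active_get_unlocked(&tl->last_request);
		if (!rq)
			continue;

		mutex_unlock(&gt->mutex);	/* the wait may sleep */
		timeout = i915_request_wait(rq, flags, timeout);
		i915_request_put(rq);

		/* restart from the head after reacquiring the lock */
		mutex_lock(&gt->mutex);
		tl = list_entry(&gt->list, typeof(*tl), link);
	}
	mutex_unlock(&gt->mutex);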
Signed-off-by: Chris Wilson Reviewed-by: Tvrtko Ursulin Link: https://patchwork.freedesktop.org/patch/msgid/20190128102356.15037-5-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_drv.h | 5 +- drivers/gpu/drm/i915/i915_gem.c | 103 ++++++++++++----------- drivers/gpu/drm/i915/i915_reset.c | 8 +- drivers/gpu/drm/i915/i915_timeline.c | 38 +++++++-- drivers/gpu/drm/i915/i915_timeline.h | 3 + drivers/gpu/drm/i915/selftests/mock_gem_device.c | 7 +- drivers/gpu/drm/i915/selftests/mock_timeline.c | 3 +- 7 files changed, 109 insertions(+), 58 deletions(-) (limited to 'drivers/gpu/drm/i915/i915_timeline.h') diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 0133d1da3d3c..8a181b455197 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -1975,7 +1975,10 @@ struct drm_i915_private { void (*resume)(struct drm_i915_private *); void (*cleanup_engine)(struct intel_engine_cs *engine); - struct list_head timelines; + struct i915_gt_timelines { + struct mutex mutex; /* protects list, tainted by GPU */ + struct list_head list; + } timelines; struct list_head active_rings; struct list_head closed_vma; diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 15acd052da46..761714448ff3 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -3222,33 +3222,6 @@ i915_gem_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file) return ret; } -static long wait_for_timeline(struct i915_timeline *tl, - unsigned int flags, long timeout) -{ - struct i915_request *rq; - - rq = i915_gem_active_get_unlocked(&tl->last_request); - if (!rq) - return timeout; - - /* - * "Race-to-idle". - * - * Switching to the kernel context is often used a synchronous - * step prior to idling, e.g. in suspend for flushing all - * current operations to memory before sleeping. These we - * want to complete as quickly as possible to avoid prolonged - * stalls, so allow the gpu to boost to maximum clocks. - */ - if (flags & I915_WAIT_FOR_IDLE_BOOST) - gen6_rps_boost(rq, NULL); - - timeout = i915_request_wait(rq, flags, timeout); - i915_request_put(rq); - - return timeout; -} - static int wait_for_engines(struct drm_i915_private *i915) { if (wait_for(intel_engines_are_idle(i915), I915_IDLE_ENGINES_TIMEOUT)) { @@ -3262,6 +3235,52 @@ static int wait_for_engines(struct drm_i915_private *i915) return 0; } +static long +wait_for_timelines(struct drm_i915_private *i915, + unsigned int flags, long timeout) +{ + struct i915_gt_timelines *gt = &i915->gt.timelines; + struct i915_timeline *tl; + + if (!READ_ONCE(i915->gt.active_requests)) + return timeout; + + mutex_lock(&gt->mutex); + list_for_each_entry(tl, &gt->list, link) { + struct i915_request *rq; + + rq = i915_gem_active_get_unlocked(&tl->last_request); + if (!rq) + continue; + + mutex_unlock(&gt->mutex); + + /* + * "Race-to-idle". + * + * Switching to the kernel context is often used a synchronous + * step prior to idling, e.g. in suspend for flushing all + * current operations to memory before sleeping. These we + * want to complete as quickly as possible to avoid prolonged + * stalls, so allow the gpu to boost to maximum clocks.
+ */ + if (flags & I915_WAIT_FOR_IDLE_BOOST) + gen6_rps_boost(rq, NULL); + + timeout = i915_request_wait(rq, flags, timeout); + i915_request_put(rq); + if (timeout < 0) + return timeout; + + /* restart after reacquiring the lock */ + mutex_lock(&gt->mutex); + tl = list_entry(&gt->list, typeof(*tl), link); + } + mutex_unlock(&gt->mutex); + + return timeout; +} + int i915_gem_wait_for_idle(struct drm_i915_private *i915, unsigned int flags, long timeout) { @@ -3273,17 +3292,15 @@ int i915_gem_wait_for_idle(struct drm_i915_private *i915, if (!READ_ONCE(i915->gt.awake)) return 0; + timeout = wait_for_timelines(i915, flags, timeout); + if (timeout < 0) + return timeout; + if (flags & I915_WAIT_LOCKED) { - struct i915_timeline *tl; int err; lockdep_assert_held(&i915->drm.struct_mutex); - list_for_each_entry(tl, &i915->gt.timelines, link) { - timeout = wait_for_timeline(tl, flags, timeout); - if (timeout < 0) - return timeout; - } if (GEM_SHOW_DEBUG() && !timeout) { /* Presume that timeout was non-zero to begin with! */ dev_warn(&i915->drm.pdev->dev, @@ -3297,17 +3314,6 @@ int i915_gem_wait_for_idle(struct drm_i915_private *i915, i915_retire_requests(i915); GEM_BUG_ON(i915->gt.active_requests); - } else { - struct intel_engine_cs *engine; - enum intel_engine_id id; - - for_each_engine(engine, i915, id) { - struct i915_timeline *tl = &engine->timeline; - - timeout = wait_for_timeline(tl, flags, timeout); - if (timeout < 0) - return timeout; - } } return 0; @@ -5008,6 +5014,8 @@ int i915_gem_init(struct drm_i915_private *dev_priv) dev_priv->gt.cleanup_engine = intel_engine_cleanup; } + i915_timelines_init(dev_priv); + ret = i915_gem_init_userptr(dev_priv); if (ret) return ret; @@ -5130,8 +5138,10 @@ err_unlock: err_uc_misc: intel_uc_fini_misc(dev_priv); - if (ret != -EIO) + if (ret != -EIO) { i915_gem_cleanup_userptr(dev_priv); + i915_timelines_fini(dev_priv); + } if (ret == -EIO) { mutex_lock(&dev_priv->drm.struct_mutex); @@ -5182,6 +5192,7 @@ void i915_gem_fini(struct drm_i915_private *dev_priv) intel_uc_fini_misc(dev_priv); i915_gem_cleanup_userptr(dev_priv); + i915_timelines_fini(dev_priv); i915_gem_drain_freed_objects(dev_priv); @@ -5284,7 +5295,6 @@ int i915_gem_init_early(struct drm_i915_private *dev_priv) if (!dev_priv->priorities) goto err_dependencies; - INIT_LIST_HEAD(&dev_priv->gt.timelines); INIT_LIST_HEAD(&dev_priv->gt.active_rings); INIT_LIST_HEAD(&dev_priv->gt.closed_vma); @@ -5328,7 +5338,6 @@ void i915_gem_cleanup_early(struct drm_i915_private *dev_priv) GEM_BUG_ON(!llist_empty(&dev_priv->mm.free_list)); GEM_BUG_ON(atomic_read(&dev_priv->mm.free_count)); WARN_ON(dev_priv->mm.object_count); - WARN_ON(!list_empty(&dev_priv->gt.timelines)); kmem_cache_destroy(dev_priv->priorities); kmem_cache_destroy(dev_priv->dependencies); diff --git a/drivers/gpu/drm/i915/i915_reset.c b/drivers/gpu/drm/i915/i915_reset.c index 99bd3bc336b3..d2dca85a543d 100644 --- a/drivers/gpu/drm/i915/i915_reset.c +++ b/drivers/gpu/drm/i915/i915_reset.c @@ -854,7 +854,8 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915) * * No more can be submitted until we reset the wedged bit.
*/ - list_for_each_entry(tl, &i915->gt.timelines, link) { + mutex_lock(&i915->gt.timelines.mutex); + list_for_each_entry(tl, &i915->gt.timelines.list, link) { struct i915_request *rq; long timeout; @@ -876,9 +877,12 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915) timeout = dma_fence_default_wait(&rq->fence, true, MAX_SCHEDULE_TIMEOUT); i915_request_put(rq); - if (timeout < 0) + if (timeout < 0) { + mutex_unlock(&i915->gt.timelines.mutex); goto unlock; + } } + mutex_unlock(&i915->gt.timelines.mutex); intel_engines_sanitize(i915, false); diff --git a/drivers/gpu/drm/i915/i915_timeline.c b/drivers/gpu/drm/i915/i915_timeline.c index 4667cc08c416..84550f17d3df 100644 --- a/drivers/gpu/drm/i915/i915_timeline.c +++ b/drivers/gpu/drm/i915/i915_timeline.c @@ -13,7 +13,7 @@ void i915_timeline_init(struct drm_i915_private *i915, struct i915_timeline *timeline, const char *name) { - lockdep_assert_held(&i915->drm.struct_mutex); + struct i915_gt_timelines *gt = &i915->gt.timelines; /* * Ideally we want a set of engines on a single leaf as we expect * to mostly be tracking synchronisation between engines. It is not * a huge issue if this is not the case, but we may want to mitigate * any page crossing penalties if they become an issue. */ BUILD_BUG_ON(KSYNCMAP < I915_NUM_ENGINES); + timeline->i915 = i915; timeline->name = name; - list_add(&timeline->link, &i915->gt.timelines); + mutex_lock(&gt->mutex); + list_add(&timeline->link, &gt->list); + mutex_unlock(&gt->mutex); /* Called during early_init before we know how many engines there are */ @@ -39,6 +42,17 @@ void i915_timeline_init(struct drm_i915_private *i915, i915_syncmap_init(&timeline->sync); } +void i915_timelines_init(struct drm_i915_private *i915) +{ + struct i915_gt_timelines *gt = &i915->gt.timelines; + + mutex_init(&gt->mutex); + INIT_LIST_HEAD(&gt->list); + + /* via i915_gem_wait_for_idle() */ + i915_gem_shrinker_taints_mutex(i915, &gt->mutex); +} + /** * i915_timelines_park - called when the driver idles * @i915: the drm_i915_private device @@ -51,11 +65,11 @@ void i915_timeline_init(struct drm_i915_private *i915, */ void i915_timelines_park(struct drm_i915_private *i915) { + struct i915_gt_timelines *gt = &i915->gt.timelines; struct i915_timeline *timeline; - lockdep_assert_held(&i915->drm.struct_mutex); - - list_for_each_entry(timeline, &i915->gt.timelines, link) { + mutex_lock(&gt->mutex); + list_for_each_entry(timeline, &gt->list, link) { /* * All known fences are completed so we can scrap * the current sync point tracking and start afresh, @@ -64,15 +78,20 @@ void i915_timelines_park(struct drm_i915_private *i915) */ i915_syncmap_free(&timeline->sync); } + mutex_unlock(&gt->mutex); } void i915_timeline_fini(struct i915_timeline *timeline) { + struct i915_gt_timelines *gt = &timeline->i915->gt.timelines; + GEM_BUG_ON(!list_empty(&timeline->requests)); i915_syncmap_free(&timeline->sync); + mutex_lock(&gt->mutex); list_del(&timeline->link); + mutex_unlock(&gt->mutex); } struct i915_timeline * @@ -99,6 +118,15 @@ void __i915_timeline_free(struct kref *kref) kfree(timeline); } +void i915_timelines_fini(struct drm_i915_private *i915) +{ + struct i915_gt_timelines *gt = &i915->gt.timelines; + + GEM_BUG_ON(!list_empty(&gt->list)); + + mutex_destroy(&gt->mutex); +} + #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) #include "selftests/mock_timeline.c" #include "selftests/i915_timeline.c" diff --git a/drivers/gpu/drm/i915/i915_timeline.h b/drivers/gpu/drm/i915/i915_timeline.h index 38c1e15e927a..87ad2dd31c20 100644 --- a/drivers/gpu/drm/i915/i915_timeline.h +++ b/drivers/gpu/drm/i915/i915_timeline.h @@ -66,6 +66,7 @@ struct i915_timeline { struct list_head link; const char
*name; + struct drm_i915_private *i915; struct kref kref; }; @@ -134,6 +135,8 @@ static inline bool i915_timeline_sync_is_later(struct i915_timeline *tl, return __i915_timeline_sync_is_later(tl, fence->context, fence->seqno); } +void i915_timelines_init(struct drm_i915_private *i915); void i915_timelines_park(struct drm_i915_private *i915); +void i915_timelines_fini(struct drm_i915_private *i915); #endif diff --git a/drivers/gpu/drm/i915/selftests/mock_gem_device.c b/drivers/gpu/drm/i915/selftests/mock_gem_device.c index 8ab5a2688a0c..14ae46fda49f 100644 --- a/drivers/gpu/drm/i915/selftests/mock_gem_device.c +++ b/drivers/gpu/drm/i915/selftests/mock_gem_device.c @@ -68,13 +68,14 @@ static void mock_device_release(struct drm_device *dev) i915_gem_contexts_fini(i915); mutex_unlock(&i915->drm.struct_mutex); + i915_timelines_fini(i915); + drain_workqueue(i915->wq); i915_gem_drain_freed_objects(i915); mutex_lock(&i915->drm.struct_mutex); mock_fini_ggtt(&i915->ggtt); mutex_unlock(&i915->drm.struct_mutex); - WARN_ON(!list_empty(&i915->gt.timelines)); destroy_workqueue(i915->wq); @@ -226,7 +227,8 @@ struct drm_i915_private *mock_gem_device(void) if (!i915->priorities) goto err_dependencies; - INIT_LIST_HEAD(&i915->gt.timelines); + i915_timelines_init(i915); + INIT_LIST_HEAD(&i915->gt.active_rings); INIT_LIST_HEAD(&i915->gt.closed_vma); @@ -253,6 +255,7 @@ err_context: i915_gem_contexts_fini(i915); err_unlock: mutex_unlock(&i915->drm.struct_mutex); + i915_timelines_fini(i915); kmem_cache_destroy(i915->priorities); err_dependencies: kmem_cache_destroy(i915->dependencies); diff --git a/drivers/gpu/drm/i915/selftests/mock_timeline.c b/drivers/gpu/drm/i915/selftests/mock_timeline.c index dcf3b16f5a07..cf39ccd9fc05 100644 --- a/drivers/gpu/drm/i915/selftests/mock_timeline.c +++ b/drivers/gpu/drm/i915/selftests/mock_timeline.c @@ -10,6 +10,7 @@ void mock_timeline_init(struct i915_timeline *timeline, u64 context) { + timeline->i915 = NULL; timeline->fence_context = context; spin_lock_init(&timeline->lock); @@ -24,5 +25,5 @@ void mock_timeline_init(struct i915_timeline *timeline, u64 context) void mock_timeline_fini(struct i915_timeline *timeline) { - i915_timeline_fini(timeline); + i915_syncmap_free(&timeline->sync); } -- cgit v1.2.3 From 52954edd1f7030f753a63093c16826ef50805098 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 28 Jan 2019 18:18:09 +0000 Subject: drm/i915: Allocate a status page for each timeline Allocate a page for use as a status page by a group of timelines; as we only need a dword of storage for each (rounded up to the cacheline for safety), we can pack multiple timelines into the same page. Each timeline will then be able to track its own HW seqno. v2: Reuse the common per-engine HWSP for the solitary ringbuffer timeline, so that we do not have to emit (using per-gen specialised vfuncs) the breadcrumb into the distinct timeline HWSP and instead can keep on using the common MI_STORE_DWORD_INDEX. However, to maintain the sleight-of-hand for the global/per-context seqno switchover, we will store both temporarily (and so use a custom offset for the shared timeline HWSP until the switch over). v3: Keep things simple and allocate a page for each timeline, page sharing comes next. v4: I was caught repeating the same MI_STORE_DWORD_IMM over and over again in selftests. v5: And caught red handed copying create timeline + check.
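As a sketch of the idea (timeline_hw_seqno() is a hypothetical helper for illustration; the patch itself stashes the kmapped pointer in timeline->hwsp_seqno during init), the CPU can then sample a timeline's own breadcrumb directly from its status page:

	/* Sketch only: the GPU writes the seqno into the timeline's HWSP slot */
	static u32 timeline_hw_seqno(const struct i915_timeline *tl)
	{
		/* hwsp_seqno points into the kmapped HWSP at hwsp_offset */
		return READ_ONCE(*tl->hwsp_seqno);
	}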
Signed-off-by: Chris Wilson Reviewed-by: Tvrtko Ursulin Link: https://patchwork.freedesktop.org/patch/msgid/20190128181812.22804-3-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_timeline.c | 121 +++++++- drivers/gpu/drm/i915/i915_timeline.h | 21 +- drivers/gpu/drm/i915/intel_engine_cs.c | 76 +++-- drivers/gpu/drm/i915/intel_lrc.c | 22 +- drivers/gpu/drm/i915/intel_ringbuffer.c | 10 +- drivers/gpu/drm/i915/intel_ringbuffer.h | 6 +- .../gpu/drm/i915/selftests/i915_live_selftests.h | 1 + .../gpu/drm/i915/selftests/i915_mock_selftests.h | 2 +- drivers/gpu/drm/i915/selftests/i915_timeline.c | 326 ++++++++++++++++++++- drivers/gpu/drm/i915/selftests/mock_engine.c | 14 +- 10 files changed, 543 insertions(+), 56 deletions(-) (limited to 'drivers/gpu/drm/i915/i915_timeline.h') diff --git a/drivers/gpu/drm/i915/i915_timeline.c b/drivers/gpu/drm/i915/i915_timeline.c index 84550f17d3df..8d5792311a8f 100644 --- a/drivers/gpu/drm/i915/i915_timeline.c +++ b/drivers/gpu/drm/i915/i915_timeline.c @@ -9,28 +9,78 @@ #include "i915_timeline.h" #include "i915_syncmap.h" -void i915_timeline_init(struct drm_i915_private *i915, - struct i915_timeline *timeline, - const char *name) +static struct i915_vma *__hwsp_alloc(struct drm_i915_private *i915) +{ + struct drm_i915_gem_object *obj; + struct i915_vma *vma; + + obj = i915_gem_object_create_internal(i915, PAGE_SIZE); + if (IS_ERR(obj)) + return ERR_CAST(obj); + + i915_gem_object_set_cache_coherency(obj, I915_CACHE_LLC); + + vma = i915_vma_instance(obj, &i915->ggtt.vm, NULL); + if (IS_ERR(vma)) + i915_gem_object_put(obj); + + return vma; +} + +static int hwsp_alloc(struct i915_timeline *timeline) +{ + struct i915_vma *vma; + + vma = __hwsp_alloc(timeline->i915); + if (IS_ERR(vma)) + return PTR_ERR(vma); + + timeline->hwsp_ggtt = vma; + timeline->hwsp_offset = 0; + + return 0; +} + +int i915_timeline_init(struct drm_i915_private *i915, + struct i915_timeline *timeline, + const char *name, + struct i915_vma *global_hwsp) { struct i915_gt_timelines *gt = &i915->gt.timelines; + void *vaddr; + int err; /* * Ideally we want a set of engines on a single leaf as we expect * to mostly be tracking synchronisation between engines. It is not * a huge issue if this is not the case, but we may want to mitigate * any page crossing penalties if they become an issue. + * + * Called during early_init before we know how many engines there are. 
*/ BUILD_BUG_ON(KSYNCMAP < I915_NUM_ENGINES); timeline->i915 = i915; timeline->name = name; + timeline->pin_count = 0; + + if (global_hwsp) { + timeline->hwsp_ggtt = i915_vma_get(global_hwsp); + timeline->hwsp_offset = I915_GEM_HWS_SEQNO_ADDR; + } else { + err = hwsp_alloc(timeline); + if (err) + return err; + } - mutex_lock(&gt->mutex); - list_add(&timeline->link, &gt->list); - mutex_unlock(&gt->mutex); + vaddr = i915_gem_object_pin_map(timeline->hwsp_ggtt->obj, I915_MAP_WB); + if (IS_ERR(vaddr)) { + i915_vma_put(timeline->hwsp_ggtt); + return PTR_ERR(vaddr); + } - /* Called during early_init before we know how many engines there are */ + timeline->hwsp_seqno = + memset(vaddr + timeline->hwsp_offset, 0, CACHELINE_BYTES); timeline->fence_context = dma_fence_context_alloc(1); @@ -40,6 +90,12 @@ void i915_timeline_init(struct drm_i915_private *i915, INIT_LIST_HEAD(&timeline->requests); i915_syncmap_init(&timeline->sync); + + mutex_lock(&gt->mutex); + list_add(&timeline->link, &gt->list); + mutex_unlock(&gt->mutex); + + return 0; } void i915_timelines_init(struct drm_i915_private *i915) @@ -85,6 +141,7 @@ void i915_timeline_fini(struct i915_timeline *timeline) { struct i915_gt_timelines *gt = &timeline->i915->gt.timelines; + GEM_BUG_ON(timeline->pin_count); GEM_BUG_ON(!list_empty(&timeline->requests)); i915_syncmap_free(&timeline->sync); @@ -92,23 +149,69 @@ void i915_timeline_fini(struct i915_timeline *timeline) mutex_lock(&gt->mutex); list_del(&timeline->link); mutex_unlock(&gt->mutex); + + i915_gem_object_unpin_map(timeline->hwsp_ggtt->obj); + i915_vma_put(timeline->hwsp_ggtt); } struct i915_timeline * -i915_timeline_create(struct drm_i915_private *i915, const char *name) +i915_timeline_create(struct drm_i915_private *i915, + const char *name, + struct i915_vma *global_hwsp) { struct i915_timeline *timeline; + int err; timeline = kzalloc(sizeof(*timeline), GFP_KERNEL); if (!timeline) return ERR_PTR(-ENOMEM); - i915_timeline_init(i915, timeline, name); + err = i915_timeline_init(i915, timeline, name, global_hwsp); + if (err) { + kfree(timeline); + return ERR_PTR(err); + } + kref_init(&timeline->kref); return timeline; } +int i915_timeline_pin(struct i915_timeline *tl) +{ + int err; + + if (tl->pin_count++) + return 0; + GEM_BUG_ON(!tl->pin_count); + + err = i915_vma_pin(tl->hwsp_ggtt, 0, 0, PIN_GLOBAL | PIN_HIGH); + if (err) + goto unpin; + + return 0; + +unpin: + tl->pin_count = 0; + return err; +} + +void i915_timeline_unpin(struct i915_timeline *tl) +{ + GEM_BUG_ON(!tl->pin_count); + if (--tl->pin_count) + return; + + /* + * Since this timeline is idle, all barriers upon which we were waiting + * must also be complete and so we can discard the last used barriers + * without loss of information. + */ + i915_syncmap_free(&tl->sync); + + __i915_vma_unpin(tl->hwsp_ggtt); +} + void __i915_timeline_free(struct kref *kref) { struct i915_timeline *timeline = diff --git a/drivers/gpu/drm/i915/i915_timeline.h b/drivers/gpu/drm/i915/i915_timeline.h index 87ad2dd31c20..0c3739d53d79 100644 --- a/drivers/gpu/drm/i915/i915_timeline.h +++ b/drivers/gpu/drm/i915/i915_timeline.h @@ -32,6 +32,8 @@ #include "i915_syncmap.h" #include "i915_utils.h" +struct i915_vma; + struct i915_timeline { u64 fence_context; u32 seqno; @@ -40,6 +42,11 @@ struct i915_timeline { #define TIMELINE_CLIENT 0 /* default subclass */ #define TIMELINE_ENGINE 1 + unsigned int pin_count; + const u32 *hwsp_seqno; + struct i915_vma *hwsp_ggtt; + u32 hwsp_offset; + /** * List of breadcrumbs associated with GPU requests currently * outstanding.
@@ -71,9 +78,10 @@ struct i915_timeline { struct kref kref; }; -void i915_timeline_init(struct drm_i915_private *i915, - struct i915_timeline *tl, - const char *name); +int i915_timeline_init(struct drm_i915_private *i915, + struct i915_timeline *tl, + const char *name, + struct i915_vma *hwsp); void i915_timeline_fini(struct i915_timeline *tl); static inline void @@ -96,7 +104,9 @@ i915_timeline_set_subclass(struct i915_timeline *timeline, } struct i915_timeline * -i915_timeline_create(struct drm_i915_private *i915, const char *name); +i915_timeline_create(struct drm_i915_private *i915, + const char *name, + struct i915_vma *global_hwsp); static inline struct i915_timeline * i915_timeline_get(struct i915_timeline *timeline) @@ -135,6 +145,9 @@ static inline bool i915_timeline_sync_is_later(struct i915_timeline *tl, return __i915_timeline_sync_is_later(tl, fence->context, fence->seqno); } +int i915_timeline_pin(struct i915_timeline *tl); +void i915_timeline_unpin(struct i915_timeline *tl); + void i915_timelines_init(struct drm_i915_private *i915); void i915_timelines_park(struct drm_i915_private *i915); void i915_timelines_fini(struct drm_i915_private *i915); diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c index 2657eb6fd914..515e87846afd 100644 --- a/drivers/gpu/drm/i915/intel_engine_cs.c +++ b/drivers/gpu/drm/i915/intel_engine_cs.c @@ -484,26 +484,6 @@ static void intel_engine_init_execlist(struct intel_engine_cs *engine) execlists->queue = RB_ROOT_CACHED; } -/** - * intel_engines_setup_common - setup engine state not requiring hw access - * @engine: Engine to setup. - * - * Initializes @engine@ structure members shared between legacy and execlists - * submission modes which do not require hardware access. - * - * Typically done early in the submission mode specific engine setup stage. - */ -void intel_engine_setup_common(struct intel_engine_cs *engine) -{ - i915_timeline_init(engine->i915, &engine->timeline, engine->name); - i915_timeline_set_subclass(&engine->timeline, TIMELINE_ENGINE); - - intel_engine_init_execlist(engine); - intel_engine_init_hangcheck(engine); - intel_engine_init_batch_pool(engine); - intel_engine_init_cmd_parser(engine); -} - static void cleanup_status_page(struct intel_engine_cs *engine) { struct i915_vma *vma; @@ -601,6 +581,44 @@ err: return ret; } +/** + * intel_engines_setup_common - setup engine state not requiring hw access + * @engine: Engine to setup. + * + * Initializes @engine@ structure members shared between legacy and execlists + * submission modes which do not require hardware access. + * + * Typically done early in the submission mode specific engine setup stage. 
+ */ +int intel_engine_setup_common(struct intel_engine_cs *engine) +{ + int err; + + err = init_status_page(engine); + if (err) + return err; + + err = i915_timeline_init(engine->i915, + &engine->timeline, + engine->name, + engine->status_page.vma); + if (err) + goto err_hwsp; + + i915_timeline_set_subclass(&engine->timeline, TIMELINE_ENGINE); + + intel_engine_init_execlist(engine); + intel_engine_init_hangcheck(engine); + intel_engine_init_batch_pool(engine); + intel_engine_init_cmd_parser(engine); + + return 0; + +err_hwsp: + cleanup_status_page(engine); + return err; +} + static void __intel_context_unpin(struct i915_gem_context *ctx, struct intel_engine_cs *engine) { @@ -617,7 +635,7 @@ struct measure_breadcrumb { static int measure_breadcrumb_dw(struct intel_engine_cs *engine) { struct measure_breadcrumb *frame; - unsigned int dw; + int dw = -ENOMEM; GEM_BUG_ON(!engine->i915->gt.scratch); @@ -625,7 +643,10 @@ static int measure_breadcrumb_dw(struct intel_engine_cs *engine) if (!frame) return -ENOMEM; - i915_timeline_init(engine->i915, &frame->timeline, "measure"); + if (i915_timeline_init(engine->i915, + &frame->timeline, "measure", + engine->status_page.vma)) + goto out_frame; INIT_LIST_HEAD(&frame->ring.request_list); frame->ring.timeline = &frame->timeline; @@ -642,8 +663,9 @@ static int measure_breadcrumb_dw(struct intel_engine_cs *engine) dw = engine->emit_breadcrumb(&frame->rq, frame->cs) - frame->cs; i915_timeline_fini(&frame->timeline); - kfree(frame); +out_frame: + kfree(frame); return dw; } @@ -693,20 +715,14 @@ int intel_engine_init_common(struct intel_engine_cs *engine) if (ret) goto err_unpin_preempt; - ret = init_status_page(engine); - if (ret) - goto err_breadcrumbs; - ret = measure_breadcrumb_dw(engine); if (ret < 0) - goto err_status_page; + goto err_breadcrumbs; engine->emit_breadcrumb_dw = ret; return 0; -err_status_page: - cleanup_status_page(engine); err_breadcrumbs: intel_engine_fini_breadcrumbs(engine); err_unpin_preempt: diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c index 9ae7f77293a0..e388f37743a2 100644 --- a/drivers/gpu/drm/i915/intel_lrc.c +++ b/drivers/gpu/drm/i915/intel_lrc.c @@ -2206,10 +2206,14 @@ logical_ring_default_irqs(struct intel_engine_cs *engine) engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift; } -static void +static int logical_ring_setup(struct intel_engine_cs *engine) { - intel_engine_setup_common(engine); + int err; + + err = intel_engine_setup_common(engine); + if (err) + return err; /* Intentionally left blank. */ engine->buffer = NULL; @@ -2219,6 +2223,8 @@ logical_ring_setup(struct intel_engine_cs *engine) logical_ring_default_vfuncs(engine); logical_ring_default_irqs(engine); + + return 0; } static int logical_ring_init(struct intel_engine_cs *engine) @@ -2267,7 +2273,9 @@ int logical_render_ring_init(struct intel_engine_cs *engine) { int ret; - logical_ring_setup(engine); + ret = logical_ring_setup(engine); + if (ret) + return ret; /* Override some for render ring. 
*/ engine->init_context = gen8_init_rcs_context; @@ -2296,7 +2304,11 @@ int logical_render_ring_init(struct intel_engine_cs *engine) int logical_xcs_ring_init(struct intel_engine_cs *engine) { - logical_ring_setup(engine); + int err; + + err = logical_ring_setup(engine); + if (err) + return err; return logical_ring_init(engine); } @@ -2629,7 +2641,7 @@ static int execlists_context_deferred_alloc(struct i915_gem_context *ctx, goto error_deref_obj; } - timeline = i915_timeline_create(ctx->i915, ctx->name); + timeline = i915_timeline_create(ctx->i915, ctx->name, NULL); if (IS_ERR(timeline)) { ret = PTR_ERR(timeline); goto error_deref_obj; diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c index cb6d2aa2a829..174795622eb1 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.c +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c @@ -1545,9 +1545,13 @@ static int intel_init_ring_buffer(struct intel_engine_cs *engine) struct intel_ring *ring; int err; - intel_engine_setup_common(engine); + err = intel_engine_setup_common(engine); + if (err) + return err; - timeline = i915_timeline_create(engine->i915, engine->name); + timeline = i915_timeline_create(engine->i915, + engine->name, + engine->status_page.vma); if (IS_ERR(timeline)) { err = PTR_ERR(timeline); goto err; @@ -1571,6 +1575,8 @@ static int intel_init_ring_buffer(struct intel_engine_cs *engine) if (err) goto err_unpin; + GEM_BUG_ON(ring->timeline->hwsp_ggtt != engine->status_page.vma); + return 0; err_unpin: diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h index 32371ae67f24..2927b712b973 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.h +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h @@ -712,7 +712,9 @@ intel_write_status_page(struct intel_engine_cs *engine, int reg, u32 value) #define I915_GEM_HWS_INDEX_ADDR (I915_GEM_HWS_INDEX * sizeof(u32)) #define I915_GEM_HWS_PREEMPT 0x32 #define I915_GEM_HWS_PREEMPT_ADDR (I915_GEM_HWS_PREEMPT * sizeof(u32)) -#define I915_GEM_HWS_SCRATCH 0x40 +#define I915_GEM_HWS_SEQNO 0x40 +#define I915_GEM_HWS_SEQNO_ADDR (I915_GEM_HWS_SEQNO * sizeof(u32)) +#define I915_GEM_HWS_SCRATCH 0x80 #define I915_GEM_HWS_SCRATCH_ADDR (I915_GEM_HWS_SCRATCH * sizeof(u32)) #define I915_HWS_CSB_BUF0_INDEX 0x10 @@ -818,7 +820,7 @@ intel_ring_set_tail(struct intel_ring *ring, unsigned int tail) void intel_engine_write_global_seqno(struct intel_engine_cs *engine, u32 seqno); -void intel_engine_setup_common(struct intel_engine_cs *engine); +int intel_engine_setup_common(struct intel_engine_cs *engine); int intel_engine_init_common(struct intel_engine_cs *engine); void intel_engine_cleanup_common(struct intel_engine_cs *engine); diff --git a/drivers/gpu/drm/i915/selftests/i915_live_selftests.h b/drivers/gpu/drm/i915/selftests/i915_live_selftests.h index a15713cae3b3..76b4f87fc853 100644 --- a/drivers/gpu/drm/i915/selftests/i915_live_selftests.h +++ b/drivers/gpu/drm/i915/selftests/i915_live_selftests.h @@ -13,6 +13,7 @@ selftest(sanitycheck, i915_live_sanitycheck) /* keep first (igt selfcheck) */ selftest(uncore, intel_uncore_live_selftests) selftest(workarounds, intel_workarounds_live_selftests) selftest(requests, i915_request_live_selftests) +selftest(timelines, i915_timeline_live_selftests) selftest(objects, i915_gem_object_live_selftests) selftest(dmabuf, i915_gem_dmabuf_live_selftests) selftest(coherency, i915_gem_coherency_live_selftests) diff --git a/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h 
b/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h index 1b70208eeea7..4a83a1c6c406 100644 --- a/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h +++ b/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h @@ -16,7 +16,7 @@ selftest(syncmap, i915_syncmap_mock_selftests) selftest(uncore, intel_uncore_mock_selftests) selftest(engine, intel_engine_cs_mock_selftests) selftest(breadcrumbs, intel_breadcrumbs_mock_selftests) -selftest(timelines, i915_gem_timeline_mock_selftests) +selftest(timelines, i915_timeline_mock_selftests) selftest(requests, i915_request_mock_selftests) selftest(objects, i915_gem_object_mock_selftests) selftest(dmabuf, i915_gem_dmabuf_mock_selftests) diff --git a/drivers/gpu/drm/i915/selftests/i915_timeline.c b/drivers/gpu/drm/i915/selftests/i915_timeline.c index 19f1c6a5c8fb..1585b614510d 100644 --- a/drivers/gpu/drm/i915/selftests/i915_timeline.c +++ b/drivers/gpu/drm/i915/selftests/i915_timeline.c @@ -7,6 +7,7 @@ #include "../i915_selftest.h" #include "i915_random.h" +#include "igt_flush_test.h" #include "mock_gem_device.h" #include "mock_timeline.h" @@ -256,7 +257,7 @@ static int bench_sync(void *arg) return 0; } -int i915_gem_timeline_mock_selftests(void) +int i915_timeline_mock_selftests(void) { static const struct i915_subtest tests[] = { SUBTEST(igt_sync), @@ -265,3 +266,326 @@ int i915_gem_timeline_mock_selftests(void) return i915_subtests(tests, NULL); } + +static int emit_ggtt_store_dw(struct i915_request *rq, u32 addr, u32 value) +{ + u32 *cs; + + cs = intel_ring_begin(rq, 4); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + if (INTEL_GEN(rq->i915) >= 8) { + *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; + *cs++ = addr; + *cs++ = 0; + *cs++ = value; + } else if (INTEL_GEN(rq->i915) >= 4) { + *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; + *cs++ = 0; + *cs++ = addr; + *cs++ = value; + } else { + *cs++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL; + *cs++ = addr; + *cs++ = value; + *cs++ = MI_NOOP; + } + + intel_ring_advance(rq, cs); + + return 0; +} + +static u32 hwsp_address(const struct i915_timeline *tl) +{ + return i915_ggtt_offset(tl->hwsp_ggtt) + tl->hwsp_offset; +} + +static struct i915_request * +tl_write(struct i915_timeline *tl, struct intel_engine_cs *engine, u32 value) +{ + struct i915_request *rq; + int err; + + lockdep_assert_held(&tl->i915->drm.struct_mutex); /* lazy rq refs */ + + err = i915_timeline_pin(tl); + if (err) { + rq = ERR_PTR(err); + goto out; + } + + rq = i915_request_alloc(engine, engine->i915->kernel_context); + if (IS_ERR(rq)) + goto out_unpin; + + err = emit_ggtt_store_dw(rq, hwsp_address(tl), value); + i915_request_add(rq); + if (err) + rq = ERR_PTR(err); + +out_unpin: + i915_timeline_unpin(tl); +out: + if (IS_ERR(rq)) + pr_err("Failed to write to timeline!\n"); + return rq; +} + +static struct i915_timeline * +checked_i915_timeline_create(struct drm_i915_private *i915) +{ + struct i915_timeline *tl; + + tl = i915_timeline_create(i915, "live", NULL); + if (IS_ERR(tl)) + return tl; + + if (*tl->hwsp_seqno != tl->seqno) { + pr_err("Timeline created with incorrect breadcrumb, found %x, expected %x\n", + *tl->hwsp_seqno, tl->seqno); + i915_timeline_put(tl); + return ERR_PTR(-EINVAL); + } + + return tl; +} + +static int live_hwsp_engine(void *arg) +{ +#define NUM_TIMELINES 4096 + struct drm_i915_private *i915 = arg; + struct i915_timeline **timelines; + struct intel_engine_cs *engine; + enum intel_engine_id id; + intel_wakeref_t wakeref; + unsigned long count, n; + int err = 0; + + /* + * Create a bunch of timelines and check we can write 
+ * independently to each of their breadcrumb slots. + */ + + timelines = kvmalloc_array(NUM_TIMELINES * I915_NUM_ENGINES, + sizeof(*timelines), + GFP_KERNEL); + if (!timelines) + return -ENOMEM; + + mutex_lock(&i915->drm.struct_mutex); + wakeref = intel_runtime_pm_get(i915); + + count = 0; + for_each_engine(engine, i915, id) { + if (!intel_engine_can_store_dword(engine)) + continue; + + for (n = 0; n < NUM_TIMELINES; n++) { + struct i915_timeline *tl; + struct i915_request *rq; + + tl = checked_i915_timeline_create(i915); + if (IS_ERR(tl)) { + err = PTR_ERR(tl); + goto out; + } + + rq = tl_write(tl, engine, count); + if (IS_ERR(rq)) { + i915_timeline_put(tl); + err = PTR_ERR(rq); + goto out; + } + + timelines[count++] = tl; + } + } + +out: + if (igt_flush_test(i915, I915_WAIT_LOCKED)) + err = -EIO; + + for (n = 0; n < count; n++) { + struct i915_timeline *tl = timelines[n]; + + if (!err && *tl->hwsp_seqno != n) { + pr_err("Invalid seqno stored in timeline %lu, found 0x%x\n", + n, *tl->hwsp_seqno); + err = -EINVAL; + } + i915_timeline_put(tl); + } + + intel_runtime_pm_put(i915, wakeref); + mutex_unlock(&i915->drm.struct_mutex); + + kvfree(timelines); + + return err; +#undef NUM_TIMELINES +} + +static int live_hwsp_alternate(void *arg) +{ +#define NUM_TIMELINES 4096 + struct drm_i915_private *i915 = arg; + struct i915_timeline **timelines; + struct intel_engine_cs *engine; + enum intel_engine_id id; + intel_wakeref_t wakeref; + unsigned long count, n; + int err = 0; + + /* + * Create a bunch of timelines and check we can write + * independently to each of their breadcrumb slots with adjacent + * engines. + */ + + timelines = kvmalloc_array(NUM_TIMELINES * I915_NUM_ENGINES, + sizeof(*timelines), + GFP_KERNEL); + if (!timelines) + return -ENOMEM; + + mutex_lock(&i915->drm.struct_mutex); + wakeref = intel_runtime_pm_get(i915); + + count = 0; + for (n = 0; n < NUM_TIMELINES; n++) { + for_each_engine(engine, i915, id) { + struct i915_timeline *tl; + struct i915_request *rq; + + if (!intel_engine_can_store_dword(engine)) + continue; + + tl = checked_i915_timeline_create(i915); + if (IS_ERR(tl)) { + err = PTR_ERR(tl); + goto out; + } + + rq = tl_write(tl, engine, count); + if (IS_ERR(rq)) { + i915_timeline_put(tl); + err = PTR_ERR(rq); + goto out; + } + + timelines[count++] = tl; + } + } + +out: + if (igt_flush_test(i915, I915_WAIT_LOCKED)) + err = -EIO; + + for (n = 0; n < count; n++) { + struct i915_timeline *tl = timelines[n]; + + if (!err && *tl->hwsp_seqno != n) { + pr_err("Invalid seqno stored in timeline %lu, found 0x%x\n", + n, *tl->hwsp_seqno); + err = -EINVAL; + } + i915_timeline_put(tl); + } + + intel_runtime_pm_put(i915, wakeref); + mutex_unlock(&i915->drm.struct_mutex); + + kvfree(timelines); + + return err; +#undef NUM_TIMELINES +} + +static int live_hwsp_recycle(void *arg) +{ + struct drm_i915_private *i915 = arg; + struct intel_engine_cs *engine; + enum intel_engine_id id; + intel_wakeref_t wakeref; + unsigned long count; + int err = 0; + + /* + * Check seqno writes into one timeline at a time. We expect to + * recycle the breadcrumb slot between iterations and neither + * want to confuse ourselves or the GPU. 
+ */ + + mutex_lock(&i915->drm.struct_mutex); + wakeref = intel_runtime_pm_get(i915); + + count = 0; + for_each_engine(engine, i915, id) { + IGT_TIMEOUT(end_time); + + if (!intel_engine_can_store_dword(engine)) + continue; + + do { + struct i915_timeline *tl; + struct i915_request *rq; + + tl = checked_i915_timeline_create(i915); + if (IS_ERR(tl)) { + err = PTR_ERR(tl); + goto out; + } + + rq = tl_write(tl, engine, count); + if (IS_ERR(rq)) { + i915_timeline_put(tl); + err = PTR_ERR(rq); + goto out; + } + + if (i915_request_wait(rq, + I915_WAIT_LOCKED, + HZ / 5) < 0) { + pr_err("Wait for timeline writes timed out!\n"); + i915_timeline_put(tl); + err = -EIO; + goto out; + } + + if (*tl->hwsp_seqno != count) { + pr_err("Invalid seqno stored in timeline %lu, found 0x%x\n", + count, *tl->hwsp_seqno); + err = -EINVAL; + } + + i915_timeline_put(tl); + count++; + + if (err) + goto out; + + i915_timelines_park(i915); /* Encourage recycling! */ + } while (!__igt_timeout(end_time, NULL)); + } + +out: + if (igt_flush_test(i915, I915_WAIT_LOCKED)) + err = -EIO; + intel_runtime_pm_put(i915, wakeref); + mutex_unlock(&i915->drm.struct_mutex); + + return err; +} + +int i915_timeline_live_selftests(struct drm_i915_private *i915) +{ + static const struct i915_subtest tests[] = { + SUBTEST(live_hwsp_recycle), + SUBTEST(live_hwsp_engine), + SUBTEST(live_hwsp_alternate), + }; + + return i915_subtests(tests, i915); +} diff --git a/drivers/gpu/drm/i915/selftests/mock_engine.c b/drivers/gpu/drm/i915/selftests/mock_engine.c index 4e5b4dc6df0f..919c89fd6ee5 100644 --- a/drivers/gpu/drm/i915/selftests/mock_engine.c +++ b/drivers/gpu/drm/i915/selftests/mock_engine.c @@ -39,7 +39,12 @@ static struct intel_ring *mock_ring(struct intel_engine_cs *engine) if (!ring) return NULL; - i915_timeline_init(engine->i915, &ring->timeline, engine->name); + if (i915_timeline_init(engine->i915, + &ring->timeline, engine->name, + NULL)) { + kfree(ring); + return NULL; + } ring->base.size = sz; ring->base.effective_size = sz; @@ -208,7 +213,11 @@ struct intel_engine_cs *mock_engine(struct drm_i915_private *i915, engine->base.emit_breadcrumb = mock_emit_breadcrumb; engine->base.submit_request = mock_submit_request; - i915_timeline_init(i915, &engine->base.timeline, engine->base.name); + if (i915_timeline_init(i915, + &engine->base.timeline, + engine->base.name, + NULL)) + goto err_free; i915_timeline_set_subclass(&engine->base.timeline, TIMELINE_ENGINE); intel_engine_init_breadcrumbs(&engine->base); @@ -226,6 +235,7 @@ struct intel_engine_cs *mock_engine(struct drm_i915_private *i915, err_breadcrumbs: intel_engine_fini_breadcrumbs(&engine->base); i915_timeline_fini(&engine->base.timeline); +err_free: kfree(engine); return NULL; } -- cgit v1.2.3 From 8ba306a6a362ef6f3c005ec8819c8890a6fadcd1 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 28 Jan 2019 18:18:10 +0000 Subject: drm/i915: Share per-timeline HWSP using a slab suballocator If we restrict ourselves to only using a cacheline for each timeline's HWSP (we could go smaller, but want to avoid needlessly polluting cachelines on different engines between different contexts), then we can suballocate a single 4k page into 64 different timeline HWSP. By treating each fresh allocation as a slab of 64 entries, we can keep it around for the next 64 allocation attempts until we need to refresh the slab cache. John Harrison noted the issue of fragmentation leading to the same worst case performance of one page per timeline as before, which can be mitigated by adopting a freelist.
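To sketch the arithmetic (assuming 4KiB pages and 64-byte cachelines, as the driver does): a page holds 4096 / 64 = 64 cacheline-sized slots, so a single u64 bitmap describes the free slots of a whole page. The fragment below is distilled from hwsp_alloc()/hwsp_free() in the diff that follows, not a standalone implementation:

	BUILD_BUG_ON(BITS_PER_TYPE(u64) * CACHELINE_BYTES > PAGE_SIZE);

	/* claim the first free cacheline in this page */
	cacheline = __ffs64(hwsp->free_bitmap);
	hwsp->free_bitmap &= ~BIT_ULL(cacheline);
	offset = cacheline * CACHELINE_BYTES;

	/* and on release, return the slot to the bitmap */
	hwsp->free_bitmap |= BIT_ULL(offset / CACHELINE_BYTES);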
v2: Keep all partially allocated HWSP on a freelist. This is still without migration, so it is possible for the system to end up with each timeline in its own page, but we ensure that no new allocation would needlessly allocate a fresh page! v3: Throw a selftest at the allocator to try and catch invalid cacheline reuse. Signed-off-by: Chris Wilson Cc: John Harrison Reviewed-by: Tvrtko Ursulin Link: https://patchwork.freedesktop.org/patch/msgid/20190128181812.22804-4-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_drv.h | 4 + drivers/gpu/drm/i915/i915_timeline.c | 124 +++++++++++++++++---- drivers/gpu/drm/i915/i915_timeline.h | 1 + drivers/gpu/drm/i915/selftests/i915_random.c | 33 ++++-- drivers/gpu/drm/i915/selftests/i915_random.h | 3 + drivers/gpu/drm/i915/selftests/i915_timeline.c | 143 +++++++++++++++++++++++++ 6 files changed, 280 insertions(+), 28 deletions(-) (limited to 'drivers/gpu/drm/i915/i915_timeline.h') diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 8a181b455197..6a051381f535 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -1978,6 +1978,10 @@ struct drm_i915_private { struct i915_gt_timelines { struct mutex mutex; /* protects list, tainted by GPU */ struct list_head list; + + /* Pack multiple timelines' seqnos into the same page */ + spinlock_t hwsp_lock; + struct list_head hwsp_free_list; } timelines; struct list_head active_rings; diff --git a/drivers/gpu/drm/i915/i915_timeline.c b/drivers/gpu/drm/i915/i915_timeline.c index 8d5792311a8f..add8fc33cf6e 100644 --- a/drivers/gpu/drm/i915/i915_timeline.c +++ b/drivers/gpu/drm/i915/i915_timeline.c @@ -9,6 +9,18 @@ #include "i915_timeline.h" #include "i915_syncmap.h" +struct i915_timeline_hwsp { + struct i915_vma *vma; + struct list_head free_link; + u64 free_bitmap; +}; + +static inline struct i915_timeline_hwsp * +i915_timeline_hwsp(const struct i915_timeline *tl) +{ + return tl->hwsp_ggtt->private; +} + static struct i915_vma *__hwsp_alloc(struct drm_i915_private *i915) { struct drm_i915_gem_object *obj; @@ -27,28 +39,89 @@ static struct i915_vma *__hwsp_alloc(struct drm_i915_private *i915) return vma; } -static int hwsp_alloc(struct i915_timeline *timeline) +static struct i915_vma * +hwsp_alloc(struct i915_timeline *timeline, unsigned int *cacheline) { - struct i915_vma *vma; + struct drm_i915_private *i915 = timeline->i915; + struct i915_gt_timelines *gt = &i915->gt.timelines; + struct i915_timeline_hwsp *hwsp; - vma = __hwsp_alloc(timeline->i915); - if (IS_ERR(vma)) - return PTR_ERR(vma); + BUILD_BUG_ON(BITS_PER_TYPE(u64) * CACHELINE_BYTES > PAGE_SIZE); - timeline->hwsp_ggtt = vma; - timeline->hwsp_offset = 0; + spin_lock(&gt->hwsp_lock); - return 0; + /* hwsp_free_list only contains HWSP that have available cachelines */ + hwsp = list_first_entry_or_null(&gt->hwsp_free_list, + typeof(*hwsp), free_link); + if (!hwsp) { + struct i915_vma *vma; + + spin_unlock(&gt->hwsp_lock); + + hwsp = kmalloc(sizeof(*hwsp), GFP_KERNEL); + if (!hwsp) + return ERR_PTR(-ENOMEM); + + vma = __hwsp_alloc(i915); + if (IS_ERR(vma)) { + kfree(hwsp); + return vma; + } + + vma->private = hwsp; + hwsp->vma = vma; + hwsp->free_bitmap = ~0ull; + + spin_lock(&gt->hwsp_lock); + list_add(&hwsp->free_link, &gt->hwsp_free_list); + } + + GEM_BUG_ON(!hwsp->free_bitmap); + *cacheline = __ffs64(hwsp->free_bitmap); + hwsp->free_bitmap &= ~BIT_ULL(*cacheline); + if (!hwsp->free_bitmap) + list_del(&hwsp->free_link); + + spin_unlock(&gt->hwsp_lock); + + GEM_BUG_ON(hwsp->vma->private != hwsp); + return
hwsp->vma; +} + +static void hwsp_free(struct i915_timeline *timeline) +{ + struct i915_gt_timelines *gt = &timeline->i915->gt.timelines; + struct i915_timeline_hwsp *hwsp; + + hwsp = i915_timeline_hwsp(timeline); + if (!hwsp) /* leave global HWSP alone! */ + return; + + spin_lock(&gt->hwsp_lock); + + /* As a cacheline becomes available, publish the HWSP on the freelist */ + if (!hwsp->free_bitmap) + list_add_tail(&hwsp->free_link, &gt->hwsp_free_list); + + hwsp->free_bitmap |= BIT_ULL(timeline->hwsp_offset / CACHELINE_BYTES); + + /* And if no one is left using it, give the page back to the system */ + if (hwsp->free_bitmap == ~0ull) { + i915_vma_put(hwsp->vma); + list_del(&hwsp->free_link); + kfree(hwsp); + } + + spin_unlock(&gt->hwsp_lock); } int i915_timeline_init(struct drm_i915_private *i915, struct i915_timeline *timeline, const char *name, - struct i915_vma *global_hwsp) + struct i915_vma *hwsp) { struct i915_gt_timelines *gt = &i915->gt.timelines; void *vaddr; - int err; /* * Ideally we want a set of engines on a single leaf as we expect @@ -64,18 +137,22 @@ int i915_timeline_init, timeline->name = name; timeline->pin_count = 0; - if (global_hwsp) { - timeline->hwsp_ggtt = i915_vma_get(global_hwsp); - timeline->hwsp_offset = I915_GEM_HWS_SEQNO_ADDR; - } else { - err = hwsp_alloc(timeline); - if (err) - return err; + timeline->hwsp_offset = I915_GEM_HWS_SEQNO_ADDR; + if (!hwsp) { + unsigned int cacheline; + + hwsp = hwsp_alloc(timeline, &cacheline); + if (IS_ERR(hwsp)) + return PTR_ERR(hwsp); + + timeline->hwsp_offset = cacheline * CACHELINE_BYTES; } + timeline->hwsp_ggtt = i915_vma_get(hwsp); - vaddr = i915_gem_object_pin_map(timeline->hwsp_ggtt->obj, I915_MAP_WB); + vaddr = i915_gem_object_pin_map(hwsp->obj, I915_MAP_WB); if (IS_ERR(vaddr)) { - i915_vma_put(timeline->hwsp_ggtt); + hwsp_free(timeline); + i915_vma_put(hwsp); return PTR_ERR(vaddr); } @@ -105,6 +182,9 @@ void i915_timelines_init(struct drm_i915_private *i915) mutex_init(&gt->mutex); INIT_LIST_HEAD(&gt->list); + spin_lock_init(&gt->hwsp_lock); + INIT_LIST_HEAD(&gt->hwsp_free_list); + /* via i915_gem_wait_for_idle() */ i915_gem_shrinker_taints_mutex(i915, &gt->mutex); } @@ -144,12 +224,13 @@ void i915_timeline_fini(struct i915_timeline *timeline) GEM_BUG_ON(timeline->pin_count); GEM_BUG_ON(!list_empty(&timeline->requests)); - i915_syncmap_free(&timeline->sync); - mutex_lock(&gt->mutex); list_del(&timeline->link); mutex_unlock(&gt->mutex); + i915_syncmap_free(&timeline->sync); + hwsp_free(timeline); + i915_gem_object_unpin_map(timeline->hwsp_ggtt->obj); i915_vma_put(timeline->hwsp_ggtt); } @@ -226,6 +307,7 @@ void i915_timelines_fini(struct drm_i915_private *i915) struct i915_gt_timelines *gt = &i915->gt.timelines; GEM_BUG_ON(!list_empty(&gt->list)); + GEM_BUG_ON(!list_empty(&gt->hwsp_free_list)); mutex_destroy(&gt->mutex); } diff --git a/drivers/gpu/drm/i915/i915_timeline.h b/drivers/gpu/drm/i915/i915_timeline.h index 0c3739d53d79..ab736e2e5707 100644 --- a/drivers/gpu/drm/i915/i915_timeline.h +++ b/drivers/gpu/drm/i915/i915_timeline.h @@ -33,6 +33,7 @@ #include "i915_utils.h" struct i915_vma; +struct i915_timeline_hwsp; struct i915_timeline { u64 fence_context; diff --git a/drivers/gpu/drm/i915/selftests/i915_random.c b/drivers/gpu/drm/i915/selftests/i915_random.c index 1f415ce47018..716a3f19f030 100644 --- a/drivers/gpu/drm/i915/selftests/i915_random.c +++ b/drivers/gpu/drm/i915/selftests/i915_random.c @@ -41,18 +41,37 @@ u64 i915_prandom_u64_state(struct rnd_state *rnd) return x; } -void
i915_random_reorder(unsigned int *order, unsigned int count, - struct rnd_state *state) +void i915_prandom_shuffle(void *arr, size_t elsz, size_t count, + struct rnd_state *state) { - unsigned int i, j; + char stack[128]; + + if (WARN_ON(elsz > sizeof(stack) || count > U32_MAX)) + return; + + if (!elsz || !count) + return; + + /* Fisher-Yates shuffle courtesy of Knuth */ + while (--count) { + size_t swp; + + swp = i915_prandom_u32_max_state(count + 1, state); + if (swp == count) + continue; - for (i = 0; i < count; i++) { - BUILD_BUG_ON(sizeof(unsigned int) > sizeof(u32)); - j = i915_prandom_u32_max_state(count, state); - swap(order[i], order[j]); + memcpy(stack, arr + count * elsz, elsz); + memcpy(arr + count * elsz, arr + swp * elsz, elsz); + memcpy(arr + swp * elsz, stack, elsz); } } +void i915_random_reorder(unsigned int *order, unsigned int count, + struct rnd_state *state) +{ + i915_prandom_shuffle(order, sizeof(*order), count, state); +} + unsigned int *i915_random_order(unsigned int count, struct rnd_state *state) { unsigned int *order, i; diff --git a/drivers/gpu/drm/i915/selftests/i915_random.h b/drivers/gpu/drm/i915/selftests/i915_random.h index 7dffedc501ca..8e1ff9c105b6 100644 --- a/drivers/gpu/drm/i915/selftests/i915_random.h +++ b/drivers/gpu/drm/i915/selftests/i915_random.h @@ -54,4 +54,7 @@ void i915_random_reorder(unsigned int *order, unsigned int count, struct rnd_state *state); +void i915_prandom_shuffle(void *arr, size_t elsz, size_t count, + struct rnd_state *state); + #endif /* !__I915_SELFTESTS_RANDOM_H__ */ diff --git a/drivers/gpu/drm/i915/selftests/i915_timeline.c b/drivers/gpu/drm/i915/selftests/i915_timeline.c index 1585b614510d..c34340f074cf 100644 --- a/drivers/gpu/drm/i915/selftests/i915_timeline.c +++ b/drivers/gpu/drm/i915/selftests/i915_timeline.c @@ -4,6 +4,8 @@ * Copyright © 2017-2018 Intel Corporation */ +#include <linux/prime_numbers.h> + #include "../i915_selftest.h" #include "i915_random.h" @@ -11,6 +13,146 @@ #include "mock_gem_device.h" #include "mock_timeline.h" +static struct page *hwsp_page(struct i915_timeline *tl) +{ + struct drm_i915_gem_object *obj = tl->hwsp_ggtt->obj; + + GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj)); + return sg_page(obj->mm.pages->sgl); +} + +static unsigned long hwsp_cacheline(struct i915_timeline *tl) +{ + unsigned long address = (unsigned long)page_address(hwsp_page(tl)); + + return (address + tl->hwsp_offset) / CACHELINE_BYTES; +} + +#define CACHELINES_PER_PAGE (PAGE_SIZE / CACHELINE_BYTES) + +struct mock_hwsp_freelist { + struct drm_i915_private *i915; + struct radix_tree_root cachelines; + struct i915_timeline **history; + unsigned long count, max; + struct rnd_state prng; +}; + +enum { + SHUFFLE = BIT(0), +}; + +static void __mock_hwsp_record(struct mock_hwsp_freelist *state, + unsigned int idx, + struct i915_timeline *tl) +{ + tl = xchg(&state->history[idx], tl); + if (tl) { + radix_tree_delete(&state->cachelines, hwsp_cacheline(tl)); + i915_timeline_put(tl); + } +} + +static int __mock_hwsp_timeline(struct mock_hwsp_freelist *state, + unsigned int count, + unsigned int flags) +{ + struct i915_timeline *tl; + unsigned int idx; + + while (count--) { + unsigned long cacheline; + int err; + + tl = i915_timeline_create(state->i915, "mock", NULL); + if (IS_ERR(tl)) + return PTR_ERR(tl); + + cacheline = hwsp_cacheline(tl); + err = radix_tree_insert(&state->cachelines, cacheline, tl); + if (err) { + if (err == -EEXIST) { + pr_err("HWSP cacheline %lu already used; duplicate allocation!\n", + cacheline); + } + i915_timeline_put(tl); +
return err; + } + + idx = state->count++ % state->max; + __mock_hwsp_record(state, idx, tl); + } + + if (flags & SHUFFLE) + i915_prandom_shuffle(state->history, + sizeof(*state->history), + min(state->count, state->max), + &state->prng); + + count = i915_prandom_u32_max_state(min(state->count, state->max), + &state->prng); + while (count--) { + idx = --state->count % state->max; + __mock_hwsp_record(state, idx, NULL); + } + + return 0; +} + +static int mock_hwsp_freelist(void *arg) +{ + struct mock_hwsp_freelist state; + const struct { + const char *name; + unsigned int flags; + } phases[] = { + { "linear", 0 }, + { "shuffled", SHUFFLE }, + { }, + }, *p; + unsigned int na; + int err = 0; + + INIT_RADIX_TREE(&state.cachelines, GFP_KERNEL); + state.prng = I915_RND_STATE_INITIALIZER(i915_selftest.random_seed); + + state.i915 = mock_gem_device(); + if (!state.i915) + return -ENOMEM; + + /* + * Create a bunch of timelines and check that their HWSP do not overlap. + * Free some, and try again. + */ + + state.max = PAGE_SIZE / sizeof(*state.history); + state.count = 0; + state.history = kcalloc(state.max, sizeof(*state.history), GFP_KERNEL); + if (!state.history) { + err = -ENOMEM; + goto err_put; + } + + mutex_lock(&state.i915->drm.struct_mutex); + for (p = phases; p->name; p++) { + pr_debug("%s(%s)\n", __func__, p->name); + for_each_prime_number_from(na, 1, 2 * CACHELINES_PER_PAGE) { + err = __mock_hwsp_timeline(&state, na, p->flags); + if (err) + goto out; + } + } + +out: + for (na = 0; na < state.max; na++) + __mock_hwsp_record(&state, na, NULL); + mutex_unlock(&state.i915->drm.struct_mutex); + kfree(state.history); +err_put: + drm_dev_put(&state.i915->drm); + return err; +} + struct __igt_sync { const char *name; u32 seqno; @@ -260,6 +402,7 @@ static int bench_sync(void *arg) int i915_timeline_mock_selftests(void) { static const struct i915_subtest tests[] = { + SUBTEST(mock_hwsp_freelist), SUBTEST(igt_sync), SUBTEST(bench_sync), }; -- cgit v1.2.3 From 8547444137ec6138ce52fc1938980b737a0d4d9e Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Tue, 29 Jan 2019 18:54:50 +0000 Subject: drm/i915: Identify active requests To allow requests to forgo a common execution timeline, one question we need to be able to answer is "is this request running?". To track whether a request has started on HW, we can emit a breadcrumb at the beginning of the request and check its timeline's HWSP to see if the breadcrumb has advanced past the start of this request. (This is in contrast to the global timeline where we need only ask if we are on the global timeline and if the timeline has advanced past the end of the previous request.) There is still confusion from a preempted request, which has already started but relinquished the HW to a high priority request. For the common case, this discrepancy should be negligible. However, for identification of hung requests, knowing which one was running at the time of the hang will be much more important. 
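A sketch of the resulting check (this is what i915_request_started() in the diff below reduces to): the initial breadcrumb writes seqno - 1 into the timeline's HWSP before the payload executes, so the request has started once the HWSP value has passed seqno - 1.

	/* Sketch only: a started request may since have been preempted */
	static bool request_has_started(const struct i915_request *rq)
	{
		return i915_seqno_passed(hwsp_seqno(rq), rq->fence.seqno - 1);
	}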
Signed-off-by: Chris Wilson Reviewed-by: Tvrtko Ursulin Link: https://patchwork.freedesktop.org/patch/msgid/20190129185452.20989-2-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_gem.c | 15 +++++++++++ drivers/gpu/drm/i915/i915_gem_execbuffer.c | 12 +++++++++ drivers/gpu/drm/i915/i915_request.c | 10 +++---- drivers/gpu/drm/i915/i915_request.h | 1 + drivers/gpu/drm/i915/i915_timeline.c | 1 + drivers/gpu/drm/i915/i915_timeline.h | 2 ++ drivers/gpu/drm/i915/intel_engine_cs.c | 8 +++--- drivers/gpu/drm/i915/intel_lrc.c | 39 ++++++++++++++++++++++++---- drivers/gpu/drm/i915/intel_ringbuffer.c | 25 +++++++++++------- drivers/gpu/drm/i915/intel_ringbuffer.h | 6 +++-- drivers/gpu/drm/i915/selftests/mock_engine.c | 2 +- 11 files changed, 96 insertions(+), 25 deletions(-) (limited to 'drivers/gpu/drm/i915/i915_timeline.h') diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 9c499edb4c13..d92e7ab0005e 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -2871,6 +2871,14 @@ i915_gem_object_pwrite_gtt(struct drm_i915_gem_object *obj, return 0; } +static bool match_ring(struct i915_request *rq) +{ + struct drm_i915_private *dev_priv = rq->i915; + u32 ring = I915_READ(RING_START(rq->engine->mmio_base)); + + return ring == i915_ggtt_offset(rq->ring->vma); +} + struct i915_request * i915_gem_find_active_request(struct intel_engine_cs *engine) { @@ -2893,6 +2901,13 @@ i915_gem_find_active_request(struct intel_engine_cs *engine) if (i915_request_completed(request)) continue; + if (!i915_request_started(request)) + break; + + /* More than one preemptible request may match! */ + if (!match_ring(request)) + break; + active = request; break; } diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index f250109e1f66..8eedf7cac493 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -1976,6 +1976,18 @@ static int eb_submit(struct i915_execbuffer *eb) return err; } + /* + * After we completed waiting for other engines (using HW semaphores) + * then we can signal that this request/batch is ready to run. This + * allows us to determine if the batch is still waiting on the GPU + * or actually running by checking the breadcrumb. 
+ */ + if (eb->engine->emit_init_breadcrumb) { + err = eb->engine->emit_init_breadcrumb(eb->request); + if (err) + return err; + } + err = eb->engine->emit_bb_start(eb->request, eb->batch->node.start + eb->batch_start_offset, diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c index 4d58770e6a8c..7db15b7b3de8 100644 --- a/drivers/gpu/drm/i915/i915_request.c +++ b/drivers/gpu/drm/i915/i915_request.c @@ -333,7 +333,7 @@ void i915_request_retire_upto(struct i915_request *rq) static u32 timeline_get_seqno(struct i915_timeline *tl) { - return ++tl->seqno; + return tl->seqno += 1 + tl->has_initial_breadcrumb; } static void move_to_timeline(struct i915_request *request, @@ -382,8 +382,8 @@ void __i915_request_submit(struct i915_request *request) intel_engine_enable_signaling(request, false); spin_unlock(&request->lock); - engine->emit_breadcrumb(request, - request->ring->vaddr + request->postfix); + engine->emit_fini_breadcrumb(request, + request->ring->vaddr + request->postfix); /* Transfer from per-context onto the global per-engine timeline */ move_to_timeline(request, &engine->timeline); @@ -657,7 +657,7 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx) * around inside i915_request_add() there is sufficient space at * the beginning of the ring as well. */ - rq->reserved_space = 2 * engine->emit_breadcrumb_dw * sizeof(u32); + rq->reserved_space = 2 * engine->emit_fini_breadcrumb_dw * sizeof(u32); /* * Record the position of the start of the request so that @@ -908,7 +908,7 @@ void i915_request_add(struct i915_request *request) * GPU processing the request, we never over-estimate the * position of the ring's HEAD. */ - cs = intel_ring_begin(request, engine->emit_breadcrumb_dw); + cs = intel_ring_begin(request, engine->emit_fini_breadcrumb_dw); GEM_BUG_ON(IS_ERR(cs)); request->postfix = intel_ring_offset(request, cs); diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h index 96c586d6ff4d..340d6216791c 100644 --- a/drivers/gpu/drm/i915/i915_request.h +++ b/drivers/gpu/drm/i915/i915_request.h @@ -344,6 +344,7 @@ static inline bool i915_request_started(const struct i915_request *rq) if (i915_request_signaled(rq)) return true; + /* Remember: started but may have since been preempted! */ return i915_seqno_passed(hwsp_seqno(rq), rq->fence.seqno - 1); } diff --git a/drivers/gpu/drm/i915/i915_timeline.c b/drivers/gpu/drm/i915/i915_timeline.c index 79838d89bdb9..5ea3af393ffe 100644 --- a/drivers/gpu/drm/i915/i915_timeline.c +++ b/drivers/gpu/drm/i915/i915_timeline.c @@ -135,6 +135,7 @@ int i915_timeline_init(struct drm_i915_private *i915, timeline->i915 = i915; timeline->name = name; timeline->pin_count = 0; + timeline->has_initial_breadcrumb = !hwsp; timeline->hwsp_offset = I915_GEM_HWS_SEQNO_ADDR; if (!hwsp) { diff --git a/drivers/gpu/drm/i915/i915_timeline.h b/drivers/gpu/drm/i915/i915_timeline.h index ab736e2e5707..8caeb66d1cd5 100644 --- a/drivers/gpu/drm/i915/i915_timeline.h +++ b/drivers/gpu/drm/i915/i915_timeline.h @@ -48,6 +48,8 @@ struct i915_timeline { struct i915_vma *hwsp_ggtt; u32 hwsp_offset; + bool has_initial_breadcrumb; + /** * List of breadcrumbs associated with GPU requests currently * outstanding. 
diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c index ead9c4371fe1..8dca76f6315d 100644 --- a/drivers/gpu/drm/i915/intel_engine_cs.c +++ b/drivers/gpu/drm/i915/intel_engine_cs.c @@ -664,7 +664,7 @@ static int measure_breadcrumb_dw(struct intel_engine_cs *engine) if (dw < 0) goto out_timeline; - dw = engine->emit_breadcrumb(&frame->rq, frame->cs) - frame->cs; + dw = engine->emit_fini_breadcrumb(&frame->rq, frame->cs) - frame->cs; i915_timeline_unpin(&frame->timeline); @@ -725,7 +725,7 @@ int intel_engine_init_common(struct intel_engine_cs *engine) if (ret < 0) goto err_breadcrumbs; - engine->emit_breadcrumb_dw = ret; + engine->emit_fini_breadcrumb_dw = ret; return 0; @@ -1297,7 +1297,9 @@ static void print_request(struct drm_printer *m, drm_printf(m, "%s%x%s [%llx:%llx]%s @ %dms: %s\n", prefix, rq->global_seqno, - i915_request_completed(rq) ? "!" : "", + i915_request_completed(rq) ? "!" : + i915_request_started(rq) ? "*" : + "", rq->fence.context, rq->fence.seqno, buf, jiffies_to_msecs(jiffies - rq->emitted_jiffies), diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c index fdbb3fe8eac9..5db16dd8e844 100644 --- a/drivers/gpu/drm/i915/intel_lrc.c +++ b/drivers/gpu/drm/i915/intel_lrc.c @@ -624,7 +624,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) * WaIdleLiteRestore:bdw,skl * Apply the wa NOOPs to prevent * ring:HEAD == rq:TAIL as we resubmit the - * request. See gen8_emit_breadcrumb() for + * request. See gen8_emit_fini_breadcrumb() for * where we prepare the padding after the * end of the request. */ @@ -1283,6 +1283,34 @@ execlists_context_pin(struct intel_engine_cs *engine, return __execlists_context_pin(engine, ctx, ce); } +static int gen8_emit_init_breadcrumb(struct i915_request *rq) +{ + u32 *cs; + + GEM_BUG_ON(!rq->timeline->has_initial_breadcrumb); + + cs = intel_ring_begin(rq, 6); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + /* + * Check if we have been preempted before we even get started. + * + * After this point i915_request_started() reports true, even if + * we get preempted and so are no longer running. + */ + *cs++ = MI_ARB_CHECK; + *cs++ = MI_NOOP; + + *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; + *cs++ = rq->timeline->hwsp_offset; + *cs++ = 0; + *cs++ = rq->fence.seqno - 1; + + intel_ring_advance(rq, cs); + return 0; +} + static int emit_pdps(struct i915_request *rq) { const struct intel_engine_cs * const engine = rq->engine; @@ -2039,7 +2067,7 @@ static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs) return cs; } -static u32 *gen8_emit_breadcrumb(struct i915_request *request, u32 *cs) +static u32 *gen8_emit_fini_breadcrumb(struct i915_request *request, u32 *cs) { /* w/a: bit 5 needs to be zero for MI_FLUSH_DW address. 
*/ BUILD_BUG_ON(I915_GEM_HWS_INDEX_ADDR & (1 << 5)); @@ -2061,7 +2089,7 @@ static u32 *gen8_emit_breadcrumb(struct i915_request *request, u32 *cs) return gen8_emit_wa_tail(request, cs); } -static u32 *gen8_emit_breadcrumb_rcs(struct i915_request *request, u32 *cs) +static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) { cs = gen8_emit_ggtt_write_rcs(cs, request->fence.seqno, @@ -2176,7 +2204,8 @@ logical_ring_default_vfuncs(struct intel_engine_cs *engine) engine->request_alloc = execlists_request_alloc; engine->emit_flush = gen8_emit_flush; - engine->emit_breadcrumb = gen8_emit_breadcrumb; + engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb; + engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb; engine->set_default_submission = intel_execlists_set_default_submission; @@ -2289,7 +2318,7 @@ int logical_render_ring_init(struct intel_engine_cs *engine) /* Override some for render ring. */ engine->init_context = gen8_init_rcs_context; engine->emit_flush = gen8_emit_flush_render; - engine->emit_breadcrumb = gen8_emit_breadcrumb_rcs; + engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs; ret = logical_ring_init(engine); if (ret) diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c index ee3719324e2d..668ed67336a2 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.c +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c @@ -1607,6 +1607,7 @@ static int intel_init_ring_buffer(struct intel_engine_cs *engine) err = PTR_ERR(timeline); goto err; } + GEM_BUG_ON(timeline->has_initial_breadcrumb); ring = intel_engine_create_ring(engine, timeline, 32 * PAGE_SIZE); i915_timeline_put(timeline); @@ -1960,6 +1961,7 @@ static int ring_request_alloc(struct i915_request *request) int ret; GEM_BUG_ON(!request->hw_context->pin_count); + GEM_BUG_ON(request->timeline->has_initial_breadcrumb); /* * Flush enough space to reduce the likelihood of waiting after @@ -2296,9 +2298,14 @@ static void intel_ring_default_vfuncs(struct drm_i915_private *dev_priv, engine->context_pin = intel_ring_context_pin; engine->request_alloc = ring_request_alloc; - engine->emit_breadcrumb = i9xx_emit_breadcrumb; + /* + * Using a global execution timeline; the previous final breadcrumb is + * equivalent to our next initial breadcrumb so we can elide + * engine->emit_init_breadcrumb().
+ */ + engine->emit_fini_breadcrumb = i9xx_emit_breadcrumb; if (IS_GEN(dev_priv, 5)) - engine->emit_breadcrumb = gen5_emit_breadcrumb; + engine->emit_fini_breadcrumb = gen5_emit_breadcrumb; engine->set_default_submission = i9xx_set_default_submission; @@ -2327,11 +2334,11 @@ int intel_init_render_ring_buffer(struct intel_engine_cs *engine) if (INTEL_GEN(dev_priv) >= 7) { engine->init_context = intel_rcs_ctx_init; engine->emit_flush = gen7_render_ring_flush; - engine->emit_breadcrumb = gen7_rcs_emit_breadcrumb; + engine->emit_fini_breadcrumb = gen7_rcs_emit_breadcrumb; } else if (IS_GEN(dev_priv, 6)) { engine->init_context = intel_rcs_ctx_init; engine->emit_flush = gen6_render_ring_flush; - engine->emit_breadcrumb = gen6_rcs_emit_breadcrumb; + engine->emit_fini_breadcrumb = gen6_rcs_emit_breadcrumb; } else if (IS_GEN(dev_priv, 5)) { engine->emit_flush = gen4_render_ring_flush; } else { @@ -2368,9 +2375,9 @@ int intel_init_bsd_ring_buffer(struct intel_engine_cs *engine) engine->irq_enable_mask = GT_BSD_USER_INTERRUPT; if (IS_GEN(dev_priv, 6)) - engine->emit_breadcrumb = gen6_xcs_emit_breadcrumb; + engine->emit_fini_breadcrumb = gen6_xcs_emit_breadcrumb; else - engine->emit_breadcrumb = gen7_xcs_emit_breadcrumb; + engine->emit_fini_breadcrumb = gen7_xcs_emit_breadcrumb; } else { engine->emit_flush = bsd_ring_flush; if (IS_GEN(dev_priv, 5)) @@ -2394,9 +2401,9 @@ int intel_init_blt_ring_buffer(struct intel_engine_cs *engine) engine->irq_enable_mask = GT_BLT_USER_INTERRUPT; if (IS_GEN(dev_priv, 6)) - engine->emit_breadcrumb = gen6_xcs_emit_breadcrumb; + engine->emit_fini_breadcrumb = gen6_xcs_emit_breadcrumb; else - engine->emit_breadcrumb = gen7_xcs_emit_breadcrumb; + engine->emit_fini_breadcrumb = gen7_xcs_emit_breadcrumb; return intel_init_ring_buffer(engine); } @@ -2414,7 +2421,7 @@ int intel_init_vebox_ring_buffer(struct intel_engine_cs *engine) engine->irq_enable = hsw_vebox_irq_enable; engine->irq_disable = hsw_vebox_irq_disable; - engine->emit_breadcrumb = gen7_xcs_emit_breadcrumb; + engine->emit_fini_breadcrumb = gen7_xcs_emit_breadcrumb; return intel_init_ring_buffer(engine); } diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h index 2927b712b973..1f30ffb84936 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.h +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h @@ -463,8 +463,10 @@ struct intel_engine_cs { unsigned int dispatch_flags); #define I915_DISPATCH_SECURE BIT(0) #define I915_DISPATCH_PINNED BIT(1) - u32 *(*emit_breadcrumb)(struct i915_request *rq, u32 *cs); - int emit_breadcrumb_dw; + int (*emit_init_breadcrumb)(struct i915_request *rq); + u32 *(*emit_fini_breadcrumb)(struct i915_request *rq, + u32 *cs); + unsigned int emit_fini_breadcrumb_dw; /* Pass the request to the hardware queue (e.g. directly into * the legacy ringbuffer or to the end of an execlist). 
diff --git a/drivers/gpu/drm/i915/selftests/mock_engine.c b/drivers/gpu/drm/i915/selftests/mock_engine.c index 95e890d7f58b..3b226ebc6bc4 100644 --- a/drivers/gpu/drm/i915/selftests/mock_engine.c +++ b/drivers/gpu/drm/i915/selftests/mock_engine.c @@ -227,7 +227,7 @@ struct intel_engine_cs *mock_engine(struct drm_i915_private *i915, engine->base.context_pin = mock_context_pin; engine->base.request_alloc = mock_request_alloc; engine->base.emit_flush = mock_emit_flush; - engine->base.emit_breadcrumb = mock_emit_breadcrumb; + engine->base.emit_fini_breadcrumb = mock_emit_breadcrumb; engine->base.submit_request = mock_submit_request; if (i915_timeline_init(i915, -- cgit v1.2.3 From 7810858412a0ab8b8ebb97d301dd601808968c88 Mon Sep 17 00:00:00 2001 From: Tvrtko Ursulin Date: Tue, 5 Feb 2019 09:50:30 +0000 Subject: drm/i915: Add timeline barrier support Timeline barrier allows serialization between different timelines. After calling i915_timeline_set_barrier with a request, all following submissions on this timeline will be set up as depending on this request, or barrier. Once the barrier has been completed it automatically gets cleared and things continue as normal. This facility will be used by the upcoming context SSEU code. v2: * Assert barrier has been retired on timeline_fini. (Chris Wilson) * Fix mock_timeline. v3: * Improved comment language. (Chris Wilson) v4: * Maintain ordering with previous barriers set on the timeline. v5: * Rebase. Signed-off-by: Tvrtko Ursulin Suggested-by: Chris Wilson Cc: Chris Wilson Reviewed-by: Chris Wilson Link: https://patchwork.freedesktop.org/patch/msgid/20190205095032.22673-3-tvrtko.ursulin@linux.intel.com --- drivers/gpu/drm/i915/i915_request.c | 17 +++++++++++++++++ drivers/gpu/drm/i915/i915_timeline.c | 21 +++++++++++++++++++++ drivers/gpu/drm/i915/i915_timeline.h | 22 ++++++++++++++++++++++ drivers/gpu/drm/i915/selftests/mock_timeline.c | 1 + 4 files changed, 61 insertions(+) (limited to 'drivers/gpu/drm/i915/i915_timeline.h') diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c index 9383a9fb4893..6512630b59b8 100644 --- a/drivers/gpu/drm/i915/i915_request.c +++ b/drivers/gpu/drm/i915/i915_request.c @@ -526,6 +526,19 @@ out: return kmem_cache_alloc(ce->gem_context->i915->requests, GFP_KERNEL); } +static int add_barrier(struct i915_request *rq, struct i915_gem_active *active) +{ + struct i915_request *barrier = + i915_gem_active_raw(active, &rq->i915->drm.struct_mutex); + + return barrier ? 
i915_request_await_dma_fence(rq, &barrier->fence) : 0; +} + +static int add_timeline_barrier(struct i915_request *rq) +{ + return add_barrier(rq, &rq->timeline->barrier); +} + /** * i915_request_alloc - allocate a request structure * @@ -668,6 +681,10 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx) */ rq->head = rq->ring->emit; + ret = add_timeline_barrier(rq); + if (ret) + goto err_unwind; + ret = engine->request_alloc(rq); if (ret) goto err_unwind; diff --git a/drivers/gpu/drm/i915/i915_timeline.c b/drivers/gpu/drm/i915/i915_timeline.c index 5ea3af393ffe..dcff3ae96683 100644 --- a/drivers/gpu/drm/i915/i915_timeline.c +++ b/drivers/gpu/drm/i915/i915_timeline.c @@ -163,6 +163,7 @@ int i915_timeline_init(struct drm_i915_private *i915, spin_lock_init(&timeline->lock); + init_request_active(&timeline->barrier, NULL); init_request_active(&timeline->last_request, NULL); INIT_LIST_HEAD(&timeline->requests); @@ -235,6 +236,7 @@ void i915_timeline_fini(struct i915_timeline *timeline) { GEM_BUG_ON(timeline->pin_count); GEM_BUG_ON(!list_empty(&timeline->requests)); + GEM_BUG_ON(i915_gem_active_isset(&timeline->barrier)); i915_syncmap_free(&timeline->sync); hwsp_free(timeline); @@ -309,6 +311,25 @@ void i915_timeline_unpin(struct i915_timeline *tl) __i915_vma_unpin(tl->hwsp_ggtt); } +int i915_timeline_set_barrier(struct i915_timeline *tl, struct i915_request *rq) +{ + struct i915_request *old; + int err; + + lockdep_assert_held(&rq->i915->drm.struct_mutex); + + /* Must maintain ordering wrt existing barriers */ + old = i915_gem_active_raw(&tl->barrier, &rq->i915->drm.struct_mutex); + if (old) { + err = i915_request_await_dma_fence(rq, &old->fence); + if (err) + return err; + } + + i915_gem_active_set(&tl->barrier, rq); + return 0; +} + void __i915_timeline_free(struct kref *kref) { struct i915_timeline *timeline = diff --git a/drivers/gpu/drm/i915/i915_timeline.h b/drivers/gpu/drm/i915/i915_timeline.h index 8caeb66d1cd5..d167e04073c5 100644 --- a/drivers/gpu/drm/i915/i915_timeline.h +++ b/drivers/gpu/drm/i915/i915_timeline.h @@ -74,6 +74,16 @@ struct i915_timeline { */ struct i915_syncmap *sync; + /** + * Barrier provides the ability to serialize ordering between different + * timelines. + * + * Users can call i915_timeline_set_barrier which will make all + * subsequent submissions to this timeline be executed only after the + * barrier has been completed. + */ + struct i915_gem_active barrier; + struct list_head link; const char *name; struct drm_i915_private *i915; @@ -155,4 +165,16 @@ void i915_timelines_init(struct drm_i915_private *i915); void i915_timelines_park(struct drm_i915_private *i915); void i915_timelines_fini(struct drm_i915_private *i915); +/** + * i915_timeline_set_barrier - orders submission between different timelines + * @timeline: timeline to set the barrier on + * @rq: request after which new submissions can proceed + * + * Sets the passed in request as the serialization point for all subsequent + * submissions on @timeline. Subsequent requests will not be submitted to GPU + * until the barrier has been completed. 
+ */ +int i915_timeline_set_barrier(struct i915_timeline *timeline, + struct i915_request *rq); + #endif diff --git a/drivers/gpu/drm/i915/selftests/mock_timeline.c b/drivers/gpu/drm/i915/selftests/mock_timeline.c index cf39ccd9fc05..e5659aaa856d 100644 --- a/drivers/gpu/drm/i915/selftests/mock_timeline.c +++ b/drivers/gpu/drm/i915/selftests/mock_timeline.c @@ -15,6 +15,7 @@ void mock_timeline_init(struct i915_timeline *timeline, u64 context) spin_lock_init(&timeline->lock); + init_request_active(&timeline->barrier, NULL); init_request_active(&timeline->last_request, NULL); INIT_LIST_HEAD(&timeline->requests); -- cgit v1.2.3 From 21950ee7cc8f13c5350bda0cae22cdb7ac7e3058 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Tue, 5 Feb 2019 13:00:05 +0000 Subject: drm/i915: Pull i915_gem_active into the i915_active family Looking forward, we need to break the struct_mutex dependency on i915_gem_active. In the meantime, external use of i915_gem_active is quite beguiling, little do new users suspect that it implies a barrier as each request it tracks must be ordered wrt the previous one. As one of many, it can be used to track activity across multiple timelines, a shared fence, which fits our unordered request submission much better. We need to steer external users away from the singular, exclusive fence imposed by i915_gem_active to i915_active instead. As part of that process, we move i915_gem_active out of i915_request.c into i915_active.c to start separating the two concepts, and rename it to i915_active_request (both to tie it to the concept of tracking just one request, and to give it a longer, less appealing name). Signed-off-by: Chris Wilson Reviewed-by: Tvrtko Ursulin Link: https://patchwork.freedesktop.org/patch/msgid/20190205130005.2807-5-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/i915_active.c | 62 ++-- drivers/gpu/drm/i915/i915_active.h | 349 ++++++++++++++++++++++ drivers/gpu/drm/i915/i915_active_types.h | 16 +- drivers/gpu/drm/i915/i915_debugfs.c | 2 +- drivers/gpu/drm/i915/i915_gem.c | 10 +- drivers/gpu/drm/i915/i915_gem_context.c | 17 +- drivers/gpu/drm/i915/i915_gem_context.h | 2 +- drivers/gpu/drm/i915/i915_gem_fence_reg.c | 4 +- drivers/gpu/drm/i915/i915_gem_gtt.c | 2 +- drivers/gpu/drm/i915/i915_gem_object.h | 2 +- drivers/gpu/drm/i915/i915_gpu_error.c | 10 +- drivers/gpu/drm/i915/i915_request.c | 35 +-- drivers/gpu/drm/i915/i915_request.h | 383 ------------------------- drivers/gpu/drm/i915/i915_reset.c | 2 +- drivers/gpu/drm/i915/i915_timeline.c | 25 +- drivers/gpu/drm/i915/i915_timeline.h | 14 +- drivers/gpu/drm/i915/i915_vma.c | 12 +- drivers/gpu/drm/i915/i915_vma.h | 2 +- drivers/gpu/drm/i915/intel_engine_cs.c | 2 +- drivers/gpu/drm/i915/intel_overlay.c | 33 +-- drivers/gpu/drm/i915/selftests/mock_timeline.c | 4 +- 21 files changed, 480 insertions(+), 508 deletions(-) (limited to 'drivers/gpu/drm/i915/i915_timeline.h') diff --git a/drivers/gpu/drm/i915/i915_active.c b/drivers/gpu/drm/i915/i915_active.c index 64661c41532b..215b6ff8aa73 100644 --- a/drivers/gpu/drm/i915/i915_active.c +++ b/drivers/gpu/drm/i915/i915_active.c @@ -21,7 +21,7 @@ static struct i915_global_active { } global; struct active_node { - struct i915_gem_active base; + struct i915_active_request base; struct i915_active *ref; struct rb_node node; u64 timeline; @@ -33,7 +33,7 @@ __active_park(struct i915_active *ref) struct active_node *it, *n; rbtree_postorder_for_each_entry_safe(it, n, &ref->tree, node) { - GEM_BUG_ON(i915_gem_active_isset(&it->base)); + 
GEM_BUG_ON(i915_active_request_isset(&it->base)); kmem_cache_free(global.slab_cache, it); } ref->tree = RB_ROOT; @@ -53,18 +53,18 @@ __active_retire(struct i915_active *ref) } static void -node_retire(struct i915_gem_active *base, struct i915_request *rq) +node_retire(struct i915_active_request *base, struct i915_request *rq) { __active_retire(container_of(base, struct active_node, base)->ref); } static void -last_retire(struct i915_gem_active *base, struct i915_request *rq) +last_retire(struct i915_active_request *base, struct i915_request *rq) { __active_retire(container_of(base, struct i915_active, last)); } -static struct i915_gem_active * +static struct i915_active_request * active_instance(struct i915_active *ref, u64 idx) { struct active_node *node; @@ -85,7 +85,7 @@ active_instance(struct i915_active *ref, u64 idx) * twice for the same timeline (as the older rbtree element will be * retired before the new request added to last). */ - old = i915_gem_active_raw(&ref->last, BKL(ref)); + old = i915_active_request_raw(&ref->last, BKL(ref)); if (!old || old->fence.context == idx) goto out; @@ -110,7 +110,7 @@ active_instance(struct i915_active *ref, u64 idx) node = kmem_cache_alloc(global.slab_cache, GFP_KERNEL); /* kmalloc may retire the ref->last (thanks shrinker)! */ - if (unlikely(!i915_gem_active_raw(&ref->last, BKL(ref)))) { + if (unlikely(!i915_active_request_raw(&ref->last, BKL(ref)))) { kmem_cache_free(global.slab_cache, node); goto out; } @@ -118,7 +118,7 @@ active_instance(struct i915_active *ref, u64 idx) if (unlikely(!node)) return ERR_PTR(-ENOMEM); - init_request_active(&node->base, node_retire); + i915_active_request_init(&node->base, NULL, node_retire); node->ref = ref; node->timeline = idx; @@ -133,7 +133,7 @@ replace: * callback not two, and so much undo the active counting for the * overwritten slot. 
*/ - if (i915_gem_active_isset(&node->base)) { + if (i915_active_request_isset(&node->base)) { /* Retire ourselves from the old rq->active_list */ __list_del_entry(&node->base.link); ref->count--; @@ -154,7 +154,7 @@ void i915_active_init(struct drm_i915_private *i915, ref->i915 = i915; ref->retire = retire; ref->tree = RB_ROOT; - init_request_active(&ref->last, last_retire); + i915_active_request_init(&ref->last, NULL, last_retire); ref->count = 0; } @@ -162,15 +162,15 @@ int i915_active_ref(struct i915_active *ref, u64 timeline, struct i915_request *rq) { - struct i915_gem_active *active; + struct i915_active_request *active; active = active_instance(ref, timeline); if (IS_ERR(active)) return PTR_ERR(active); - if (!i915_gem_active_isset(active)) + if (!i915_active_request_isset(active)) ref->count++; - i915_gem_active_set(active, rq); + __i915_active_request_set(active, rq); GEM_BUG_ON(!ref->count); return 0; @@ -196,12 +196,12 @@ int i915_active_wait(struct i915_active *ref) if (i915_active_acquire(ref)) goto out_release; - ret = i915_gem_active_retire(&ref->last, BKL(ref)); + ret = i915_active_request_retire(&ref->last, BKL(ref)); if (ret) goto out_release; rbtree_postorder_for_each_entry_safe(it, n, &ref->tree, node) { - ret = i915_gem_active_retire(&it->base, BKL(ref)); + ret = i915_active_request_retire(&it->base, BKL(ref)); if (ret) break; } @@ -211,11 +211,11 @@ out_release: return ret; } -static int __i915_request_await_active(struct i915_request *rq, - struct i915_gem_active *active) +int i915_request_await_active_request(struct i915_request *rq, + struct i915_active_request *active) { struct i915_request *barrier = - i915_gem_active_raw(active, &rq->i915->drm.struct_mutex); + i915_active_request_raw(active, &rq->i915->drm.struct_mutex); return barrier ? 
i915_request_await_dma_fence(rq, &barrier->fence) : 0; } @@ -225,12 +225,12 @@ int i915_request_await_active(struct i915_request *rq, struct i915_active *ref) struct active_node *it, *n; int ret; - ret = __i915_request_await_active(rq, &ref->last); + ret = i915_request_await_active_request(rq, &ref->last); if (ret) return ret; rbtree_postorder_for_each_entry_safe(it, n, &ref->tree, node) { - ret = __i915_request_await_active(rq, &it->base); + ret = i915_request_await_active_request(rq, &it->base); if (ret) return ret; } @@ -241,12 +241,32 @@ int i915_request_await_active(struct i915_request *rq, struct i915_active *ref) #if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM) void i915_active_fini(struct i915_active *ref) { - GEM_BUG_ON(i915_gem_active_isset(&ref->last)); + GEM_BUG_ON(i915_active_request_isset(&ref->last)); GEM_BUG_ON(!RB_EMPTY_ROOT(&ref->tree)); GEM_BUG_ON(ref->count); } #endif +int i915_active_request_set(struct i915_active_request *active, + struct i915_request *rq) +{ + int err; + + /* Must maintain ordering wrt previous active requests */ + err = i915_request_await_active_request(rq, active); + if (err) + return err; + + __i915_active_request_set(active, rq); + return 0; +} + +void i915_active_retire_noop(struct i915_active_request *active, + struct i915_request *request) +{ + /* Space left intentionally blank */ +} + #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) #include "selftests/i915_active.c" #endif diff --git a/drivers/gpu/drm/i915/i915_active.h b/drivers/gpu/drm/i915/i915_active.h index 179b47aeec33..12b5c1d287d1 100644 --- a/drivers/gpu/drm/i915/i915_active.h +++ b/drivers/gpu/drm/i915/i915_active.h @@ -7,7 +7,354 @@ #ifndef _I915_ACTIVE_H_ #define _I915_ACTIVE_H_ +#include <linux/rcupdate.h> + #include "i915_active_types.h" +#include "i915_request.h" + +/* + * We treat requests as fences. This is not to be confused with our + * "fence registers" but pipeline synchronisation objects ala GL_ARB_sync. + * We use the fences to synchronize access from the CPU with activity on the + * GPU, for example, we should not rewrite an object's PTE whilst the GPU + * is reading them. We also track fences at a higher level to provide + * implicit synchronisation around GEM objects, e.g. set-domain will wait + * for outstanding GPU rendering before marking the object ready for CPU + * access, or a pageflip will wait until the GPU is complete before showing + * the frame on the scanout. + * + * In order to use a fence, the object must track the fence it needs to + * serialise with. For example, GEM objects want to track both read and + * write access so that we can perform concurrent read operations between + * the CPU and GPU engines, as well as waiting for all rendering to + * complete, or waiting for the last GPU user of a "fence register". The + * object then embeds a #i915_active_request to track the most recent (in + * retirement order) request relevant for the desired mode of access. + * The #i915_active_request is updated with i915_active_request_set() to + * track the most recent fence request; typically this is done as part of + * i915_vma_move_to_active(). + * + * When the #i915_active_request completes (is retired), it will + * signal its completion to the owner through a callback as well as mark + * itself as idle (i915_active_request.request == NULL). The owner + * can then perform any action, such as delayed freeing of an active + * resource including itself.
+ */ + +void i915_active_retire_noop(struct i915_active_request *active, + struct i915_request *request); + +/** + * i915_active_request_init - prepares the activity tracker for use + * @active - the active tracker + * @rq - initial request to track, can be NULL + * @retire - a callback invoked when the tracker is retired (becomes idle), + * can be NULL + * + * i915_active_request_init() prepares the embedded @active struct for use as + * an activity tracker, that is for tracking the last known active request + * associated with it. When the last request becomes idle, i.e. when it is + * retired after completion, the optional callback @retire is invoked. + */ +static inline void +i915_active_request_init(struct i915_active_request *active, + struct i915_request *rq, + i915_active_retire_fn retire) +{ + RCU_INIT_POINTER(active->request, rq); + INIT_LIST_HEAD(&active->link); + active->retire = retire ?: i915_active_retire_noop; +} + +#define INIT_ACTIVE_REQUEST(name) i915_active_request_init((name), NULL, NULL) + +/** + * __i915_active_request_set - updates the tracker to watch the current request + * @active - the active tracker + * @request - the request to watch + * + * __i915_active_request_set() watches the given @request for completion. Whilst + * that @request is busy, the @active reports busy. When that @request is + * retired, the @active tracker is updated to report idle. + */ +static inline void +__i915_active_request_set(struct i915_active_request *active, + struct i915_request *request) +{ + list_move(&active->link, &request->active_list); + rcu_assign_pointer(active->request, request); +} + +int __must_check +i915_active_request_set(struct i915_active_request *active, + struct i915_request *rq); + +/** + * i915_active_request_set_retire_fn - updates the retirement callback + * @active - the active tracker + * @fn - the routine called when the request is retired + * @mutex - struct_mutex used to guard retirements + * + * i915_active_request_set_retire_fn() updates the function pointer that + * is called when the final request associated with the @active tracker + * is retired. + */ +static inline void +i915_active_request_set_retire_fn(struct i915_active_request *active, + i915_active_retire_fn fn, + struct mutex *mutex) +{ + lockdep_assert_held(mutex); + active->retire = fn ?: i915_active_retire_noop; +} + +static inline struct i915_request * +__i915_active_request_peek(const struct i915_active_request *active) +{ + /* + * Inside the error capture (running with the driver in an unknown + * state), we want to bend the rules slightly (a lot). + * + * Work is in progress to make it safer; in the meantime this keeps + * the known issue from spamming the logs. + */ + return rcu_dereference_protected(active->request, 1); +} + +/** + * i915_active_request_raw - return the active request + * @active - the active tracker + * + * i915_active_request_raw() returns the current request being tracked, or NULL. + * It does not obtain a reference on the request for the caller, so the caller + * must hold struct_mutex. + */ +static inline struct i915_request * +i915_active_request_raw(const struct i915_active_request *active, + struct mutex *mutex) +{ + return rcu_dereference_protected(active->request, + lockdep_is_held(mutex)); +} + +/** + * i915_active_request_peek - report the active request being monitored + * @active - the active tracker + * + * i915_active_request_peek() returns the current request being tracked if + * still active, or NULL.
It does not obtain a reference on the request + * for the caller, so the caller must hold struct_mutex. + */ +static inline struct i915_request * +i915_active_request_peek(const struct i915_active_request *active, + struct mutex *mutex) +{ + struct i915_request *request; + + request = i915_active_request_raw(active, mutex); + if (!request || i915_request_completed(request)) + return NULL; + + return request; +} + +/** + * i915_active_request_get - return a reference to the active request + * @active - the active tracker + * + * i915_active_request_get() returns a reference to the active request, or NULL + * if the active tracker is idle. The caller must hold struct_mutex. + */ +static inline struct i915_request * +i915_active_request_get(const struct i915_active_request *active, + struct mutex *mutex) +{ + return i915_request_get(i915_active_request_peek(active, mutex)); +} + +/** + * __i915_active_request_get_rcu - return a reference to the active request + * @active - the active tracker + * + * __i915_active_request_get_rcu() returns a reference to the active request, + * or NULL if the active tracker is idle. The caller must hold the RCU read + * lock, but the returned pointer is safe to use outside of RCU. + */ +static inline struct i915_request * +__i915_active_request_get_rcu(const struct i915_active_request *active) +{ + /* + * Performing a lockless retrieval of the active request is super + * tricky. SLAB_TYPESAFE_BY_RCU merely guarantees that the backing + * slab of request objects will not be freed whilst we hold the + * RCU read lock. It does not guarantee that the request itself + * will not be freed and then *reused*. Viz, + * + * Thread A Thread B + * + * rq = active.request + * retire(rq) -> free(rq); + * (rq is now first on the slab freelist) + * active.request = NULL + * + * rq = new submission on a new object + * ref(rq) + * + * To prevent the request from being reused whilst the caller + * uses it, we take a reference like normal. Whilst acquiring + * the reference we check that it is not in a destroyed state + * (refcnt == 0). That prevents the request being reallocated + * whilst the caller holds on to it. To check that the request + * was not reallocated as we acquired the reference we have to + * check that our request remains the active request across + * the lookup, in the same manner as a seqlock. The visibility + * of the pointer versus the reference counting is controlled + * by using RCU barriers (rcu_dereference and rcu_assign_pointer). + * + * In the middle of all that, we inspect whether the request is + * complete. Retiring is lazy so the request may be completed long + * before the active tracker is updated. Querying whether the + * request is complete is far cheaper (as it involves no locked + * instructions setting cachelines to exclusive) than acquiring + * the reference, so we do it first. The RCU read lock ensures the + * pointer dereference is valid, but does not ensure that either the + * seqno or the HWS is the right one! However, if the request was + * reallocated, that means the active tracker's request was complete. + * If the new request is also complete, then both are and we can + * just report the active tracker is idle. If the new request is + * incomplete, then we acquire a reference on it and check that + * it remained the active request. + * + * It is then imperative that we do not zero the request on + * reallocation, so that we can chase the dangling pointers! + * See i915_request_alloc().
+ */ + do { + struct i915_request *request; + + request = rcu_dereference(active->request); + if (!request || i915_request_completed(request)) + return NULL; + + /* + * An especially silly compiler could decide to recompute the + * result of i915_request_completed, more specifically + * re-emit the load for request->fence.seqno. A race would catch + * a later seqno value, which could flip the result from true to + * false. Which means part of the instructions below might not + * be executed, while later on instructions are executed. Due to + * barriers within the refcounting the inconsistency can't reach + * past the call to i915_request_get_rcu, but not executing + * that while still executing i915_request_put() creates + * havoc enough. Prevent this with a compiler barrier. + */ + barrier(); + + request = i915_request_get_rcu(request); + + /* + * What stops the following rcu_access_pointer() from occurring + * before the above i915_request_get_rcu()? If we were + * to read the value before pausing to get the reference to + * the request, we may not notice a change in the active + * tracker. + * + * The rcu_access_pointer() is a mere compiler barrier, which + * means both the CPU and compiler are free to perform the + * memory read without constraint. The compiler only has to + * ensure that any operations after the rcu_access_pointer() + * occur afterwards in program order. This means the read may + * be performed earlier by an out-of-order CPU, or adventurous + * compiler. + * + * The atomic operation at the heart of + * i915_request_get_rcu(), see dma_fence_get_rcu(), is + * atomic_inc_not_zero() which is only a full memory barrier + * when successful. That is, if i915_request_get_rcu() + * returns the request (and so with the reference counted + * incremented) then the following read for rcu_access_pointer() + * must occur after the atomic operation and so confirm + * that this request is the one currently being tracked. + * + * The corresponding write barrier is part of + * rcu_assign_pointer(). + */ + if (!request || request == rcu_access_pointer(active->request)) + return rcu_pointer_handoff(request); + + i915_request_put(request); + } while (1); +} + +/** + * i915_active_request_get_unlocked - return a reference to the active request + * @active - the active tracker + * + * i915_active_request_get_unlocked() returns a reference to the active request, + * or NULL if the active tracker is idle. The reference is obtained under RCU, + * so no locking is required by the caller. + * + * The reference should be freed with i915_request_put(). + */ +static inline struct i915_request * +i915_active_request_get_unlocked(const struct i915_active_request *active) +{ + struct i915_request *request; + + rcu_read_lock(); + request = __i915_active_request_get_rcu(active); + rcu_read_unlock(); + + return request; +} + +/** + * i915_active_request_isset - report whether the active tracker is assigned + * @active - the active tracker + * + * i915_active_request_isset() returns true if the active tracker is currently + * assigned to a request. Due to the lazy retiring, that request may be idle + * and this may report stale information. 
+ */ +static inline bool +i915_active_request_isset(const struct i915_active_request *active) +{ + return rcu_access_pointer(active->request); +} + +/** + * i915_active_request_retire - waits until the request is retired + * @active - the active request on which to wait + * + * i915_active_request_retire() waits until the request is completed, + * and then ensures that at least the retirement handler for this + * @active tracker is called before returning. If the @active + * tracker is idle, the function returns immediately. + */ +static inline int __must_check +i915_active_request_retire(struct i915_active_request *active, + struct mutex *mutex) +{ + struct i915_request *request; + long ret; + + request = i915_active_request_raw(active, mutex); + if (!request) + return 0; + + ret = i915_request_wait(request, + I915_WAIT_INTERRUPTIBLE | I915_WAIT_LOCKED, + MAX_SCHEDULE_TIMEOUT); + if (ret < 0) + return ret; + + list_del_init(&active->link); + RCU_INIT_POINTER(active->request, NULL); + + active->retire(active, request); + + return 0; +} /* * GPU activity tracking @@ -47,6 +394,8 @@ int i915_active_wait(struct i915_active *ref); int i915_request_await_active(struct i915_request *rq, struct i915_active *ref); +int i915_request_await_active_request(struct i915_request *rq, + struct i915_active_request *active); bool i915_active_acquire(struct i915_active *ref); diff --git a/drivers/gpu/drm/i915/i915_active_types.h b/drivers/gpu/drm/i915/i915_active_types.h index 411e502ed8dd..b679253b53a5 100644 --- a/drivers/gpu/drm/i915/i915_active_types.h +++ b/drivers/gpu/drm/i915/i915_active_types.h @@ -8,16 +8,26 @@ #define _I915_ACTIVE_TYPES_H_ #include - -#include "i915_request.h" +#include struct drm_i915_private; +struct i915_active_request; +struct i915_request; + +typedef void (*i915_active_retire_fn)(struct i915_active_request *, + struct i915_request *); + +struct i915_active_request { + struct i915_request __rcu *request; + struct list_head link; + i915_active_retire_fn retire; +}; struct i915_active { struct drm_i915_private *i915; struct rb_root tree; - struct i915_gem_active last; + struct i915_active_request last; unsigned int count; void (*retire)(struct i915_active *ref); diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index bf3073e63af8..c48733a15e63 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -206,7 +206,7 @@ describe_obj(struct seq_file *m, struct drm_i915_gem_object *obj) if (vma->fence) seq_printf(m, " , fence: %d%s", vma->fence->id, - i915_gem_active_isset(&vma->last_fence) ? "*" : ""); + i915_active_request_isset(&vma->last_fence) ? 
"*" : ""); seq_puts(m, ")"); } if (obj->stolen) diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index d92e7ab0005e..52b5a24be42b 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -3018,7 +3018,7 @@ static void assert_kernel_context_is_current(struct drm_i915_private *i915) GEM_BUG_ON(i915->gt.active_requests); for_each_engine(engine, i915, id) { - GEM_BUG_ON(__i915_gem_active_peek(&engine->timeline.last_request)); + GEM_BUG_ON(__i915_active_request_peek(&engine->timeline.last_request)); GEM_BUG_ON(engine->last_retired_context != to_intel_context(i915->kernel_context, engine)); } @@ -3264,7 +3264,7 @@ wait_for_timelines(struct drm_i915_private *i915, list_for_each_entry(tl, >->active_list, link) { struct i915_request *rq; - rq = i915_gem_active_get_unlocked(&tl->last_request); + rq = i915_active_request_get_unlocked(&tl->last_request); if (!rq) continue; @@ -4165,7 +4165,8 @@ out: } static void -frontbuffer_retire(struct i915_gem_active *active, struct i915_request *request) +frontbuffer_retire(struct i915_active_request *active, + struct i915_request *request) { struct drm_i915_gem_object *obj = container_of(active, typeof(*obj), frontbuffer_write); @@ -4192,7 +4193,8 @@ void i915_gem_object_init(struct drm_i915_gem_object *obj, obj->resv = &obj->__builtin_resv; obj->frontbuffer_ggtt_origin = ORIGIN_GTT; - init_request_active(&obj->frontbuffer_write, frontbuffer_retire); + i915_active_request_init(&obj->frontbuffer_write, + NULL, frontbuffer_retire); obj->mm.madv = I915_MADV_WILLNEED; INIT_RADIX_TREE(&obj->mm.get_page.radix, GFP_KERNEL | __GFP_NOWARN); diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c index 93ab287f44b6..280813a4bf82 100644 --- a/drivers/gpu/drm/i915/i915_gem_context.c +++ b/drivers/gpu/drm/i915/i915_gem_context.c @@ -322,7 +322,7 @@ static u32 default_desc_template(const struct drm_i915_private *i915, return desc; } -static void intel_context_retire(struct i915_gem_active *active, +static void intel_context_retire(struct i915_active_request *active, struct i915_request *rq) { struct intel_context *ce = @@ -344,7 +344,8 @@ intel_context_init(struct intel_context *ce, /* Use the whole device by default */ ce->sseu = intel_device_default_sseu(ctx->i915); - init_request_active(&ce->active_tracker, intel_context_retire); + i915_active_request_init(&ce->active_tracker, + NULL, intel_context_retire); } static struct i915_gem_context * @@ -668,8 +669,8 @@ last_request_on_engine(struct i915_timeline *timeline, GEM_BUG_ON(timeline == &engine->timeline); - rq = i915_gem_active_raw(&timeline->last_request, - &engine->i915->drm.struct_mutex); + rq = i915_active_request_raw(&timeline->last_request, + &engine->i915->drm.struct_mutex); if (rq && rq->engine == engine) { GEM_TRACE("last request for %s on engine %s: %llx:%llu\n", timeline->name, engine->name, @@ -1015,8 +1016,8 @@ gen8_modify_rpcs_gpu(struct intel_context *ce, } /* Queue this switch after all other activity by this context. */ - prev = i915_gem_active_raw(&ce->ring->timeline->last_request, - &i915->drm.struct_mutex); + prev = i915_active_request_raw(&ce->ring->timeline->last_request, + &i915->drm.struct_mutex); if (prev && !i915_request_completed(prev)) { ret = i915_request_await_dma_fence(rq, &prev->fence); if (ret < 0) @@ -1039,9 +1040,9 @@ gen8_modify_rpcs_gpu(struct intel_context *ce, * But we only need to take one pin on the account of it. 
Or in other * words transfer the pinned ce object to tracked active request. */ - if (!i915_gem_active_isset(&ce->active_tracker)) + if (!i915_active_request_isset(&ce->active_tracker)) __intel_context_pin(ce); - i915_gem_active_set(&ce->active_tracker, rq); + __i915_active_request_set(&ce->active_tracker, rq); out_add: i915_request_add(rq); diff --git a/drivers/gpu/drm/i915/i915_gem_context.h b/drivers/gpu/drm/i915/i915_gem_context.h index 92ad5272e57f..ca150a764c24 100644 --- a/drivers/gpu/drm/i915/i915_gem_context.h +++ b/drivers/gpu/drm/i915/i915_gem_context.h @@ -187,7 +187,7 @@ struct i915_gem_context { * active_tracker: Active tracker for the external rq activity * on this intel_context object. */ - struct i915_gem_active active_tracker; + struct i915_active_request active_tracker; const struct intel_context_ops *ops; diff --git a/drivers/gpu/drm/i915/i915_gem_fence_reg.c b/drivers/gpu/drm/i915/i915_gem_fence_reg.c index 46e259661294..e037e94792f3 100644 --- a/drivers/gpu/drm/i915/i915_gem_fence_reg.c +++ b/drivers/gpu/drm/i915/i915_gem_fence_reg.c @@ -223,7 +223,7 @@ static int fence_update(struct drm_i915_fence_reg *fence, i915_gem_object_get_tiling(vma->obj))) return -EINVAL; - ret = i915_gem_active_retire(&vma->last_fence, + ret = i915_active_request_retire(&vma->last_fence, &vma->obj->base.dev->struct_mutex); if (ret) return ret; @@ -232,7 +232,7 @@ static int fence_update(struct drm_i915_fence_reg *fence, if (fence->vma) { struct i915_vma *old = fence->vma; - ret = i915_gem_active_retire(&old->last_fence, + ret = i915_active_request_retire(&old->last_fence, &old->obj->base.dev->struct_mutex); if (ret) return ret; diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c index e625659c03a2..d646d37eec2f 100644 --- a/drivers/gpu/drm/i915/i915_gem_gtt.c +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c @@ -1918,7 +1918,7 @@ static struct i915_vma *pd_vma_create(struct gen6_hw_ppgtt *ppgtt, int size) return ERR_PTR(-ENOMEM); i915_active_init(i915, &vma->active, NULL); - init_request_active(&vma->last_fence, NULL); + INIT_ACTIVE_REQUEST(&vma->last_fence); vma->vm = &ggtt->vm; vma->ops = &pd_vma_ops; diff --git a/drivers/gpu/drm/i915/i915_gem_object.h b/drivers/gpu/drm/i915/i915_gem_object.h index 73fec917d097..fab040331cdb 100644 --- a/drivers/gpu/drm/i915/i915_gem_object.h +++ b/drivers/gpu/drm/i915/i915_gem_object.h @@ -175,7 +175,7 @@ struct drm_i915_gem_object { atomic_t frontbuffer_bits; unsigned int frontbuffer_ggtt_origin; /* write once */ - struct i915_gem_active frontbuffer_write; + struct i915_active_request frontbuffer_write; /** Current tiling stride for the object, if it's tiled. */ unsigned int tiling_and_stride; diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index 6e2e5ed2bd0a..9a65341fec09 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -1062,23 +1062,23 @@ i915_error_object_create(struct drm_i915_private *i915, } /* The error capture is special as tries to run underneath the normal - * locking rules - so we use the raw version of the i915_gem_active lookup. + * locking rules - so we use the raw version of the i915_active_request lookup. */ static inline u32 -__active_get_seqno(struct i915_gem_active *active) +__active_get_seqno(struct i915_active_request *active) { struct i915_request *request; - request = __i915_gem_active_peek(active); + request = __i915_active_request_peek(active); return request ? 
request->global_seqno : 0; } static inline int -__active_get_engine_id(struct i915_gem_active *active) +__active_get_engine_id(struct i915_active_request *active) { struct i915_request *request; - request = __i915_gem_active_peek(active); + request = __i915_active_request_peek(active); return request ? request->engine->id : -1; } diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c index 6512630b59b8..c2a5c48c7541 100644 --- a/drivers/gpu/drm/i915/i915_request.c +++ b/drivers/gpu/drm/i915/i915_request.c @@ -29,6 +29,7 @@ #include #include "i915_drv.h" +#include "i915_active.h" #include "i915_reset.h" static const char *i915_fence_get_driver_name(struct dma_fence *fence) @@ -125,12 +126,6 @@ static void unreserve_gt(struct drm_i915_private *i915) i915_gem_park(i915); } -void i915_gem_retire_noop(struct i915_gem_active *active, - struct i915_request *request) -{ - /* Space left intentionally blank */ -} - static void advance_ring(struct i915_request *request) { struct intel_ring *ring = request->ring; @@ -244,7 +239,7 @@ static void __retire_engine_upto(struct intel_engine_cs *engine, static void i915_request_retire(struct i915_request *request) { - struct i915_gem_active *active, *next; + struct i915_active_request *active, *next; GEM_TRACE("%s fence %llx:%lld, global=%d, current %d:%d\n", request->engine->name, @@ -278,10 +273,10 @@ static void i915_request_retire(struct i915_request *request) * we may spend an inordinate amount of time simply handling * the retirement of requests and processing their callbacks. * Of which, this loop itself is particularly hot due to the - * cache misses when jumping around the list of i915_gem_active. - * So we try to keep this loop as streamlined as possible and - * also prefetch the next i915_gem_active to try and hide - * the likely cache miss. + * cache misses when jumping around the list of + * i915_active_request. So we try to keep this loop as + * streamlined as possible and also prefetch the next + * i915_active_request to try and hide the likely cache miss. */ prefetchw(next); @@ -526,17 +521,9 @@ out: return kmem_cache_alloc(ce->gem_context->i915->requests, GFP_KERNEL); } -static int add_barrier(struct i915_request *rq, struct i915_gem_active *active) -{ - struct i915_request *barrier = - i915_gem_active_raw(active, &rq->i915->drm.struct_mutex); - - return barrier ? i915_request_await_dma_fence(rq, &barrier->fence) : 0; -} - static int add_timeline_barrier(struct i915_request *rq) { - return add_barrier(rq, &rq->timeline->barrier); + return i915_request_await_active_request(rq, &rq->timeline->barrier); } /** @@ -595,7 +582,7 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx) * We use RCU to look up requests in flight. The lookups may * race with the request being allocated from the slab freelist. * That is the request we are writing to here, may be in the process - * of being read by __i915_gem_active_get_rcu(). As such, + * of being read by __i915_active_request_get_rcu(). As such, * we have to be very careful when overwriting the contents. During * the RCU lookup, we change chase the request->engine pointer, * read the request->global_seqno and increment the reference count. @@ -937,8 +924,8 @@ void i915_request_add(struct i915_request *request) * see a more recent value in the hws than we are tracking. 
*/ - prev = i915_gem_active_raw(&timeline->last_request, - &request->i915->drm.struct_mutex); + prev = i915_active_request_raw(&timeline->last_request, + &request->i915->drm.struct_mutex); if (prev && !i915_request_completed(prev)) { i915_sw_fence_await_sw_fence(&request->submit, &prev->submit, &request->submitq); @@ -954,7 +941,7 @@ void i915_request_add(struct i915_request *request) spin_unlock_irq(&timeline->lock); GEM_BUG_ON(timeline->seqno != request->fence.seqno); - i915_gem_active_set(&timeline->last_request, request); + __i915_active_request_set(&timeline->last_request, request); list_add_tail(&request->ring_link, &ring->request_list); if (list_is_first(&request->ring_link, &ring->request_list)) { diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h index 3cffb96203b9..40f3e8dcbdd5 100644 --- a/drivers/gpu/drm/i915/i915_request.h +++ b/drivers/gpu/drm/i915/i915_request.h @@ -403,387 +403,4 @@ static inline void i915_request_mark_complete(struct i915_request *rq) void i915_retire_requests(struct drm_i915_private *i915); -/* - * We treat requests as fences. This is not be to confused with our - * "fence registers" but pipeline synchronisation objects ala GL_ARB_sync. - * We use the fences to synchronize access from the CPU with activity on the - * GPU, for example, we should not rewrite an object's PTE whilst the GPU - * is reading them. We also track fences at a higher level to provide - * implicit synchronisation around GEM objects, e.g. set-domain will wait - * for outstanding GPU rendering before marking the object ready for CPU - * access, or a pageflip will wait until the GPU is complete before showing - * the frame on the scanout. - * - * In order to use a fence, the object must track the fence it needs to - * serialise with. For example, GEM objects want to track both read and - * write access so that we can perform concurrent read operations between - * the CPU and GPU engines, as well as waiting for all rendering to - * complete, or waiting for the last GPU user of a "fence register". The - * object then embeds a #i915_gem_active to track the most recent (in - * retirement order) request relevant for the desired mode of access. - * The #i915_gem_active is updated with i915_gem_active_set() to track the - * most recent fence request, typically this is done as part of - * i915_vma_move_to_active(). - * - * When the #i915_gem_active completes (is retired), it will - * signal its completion to the owner through a callback as well as mark - * itself as idle (i915_gem_active.request == NULL). The owner - * can then perform any action, such as delayed freeing of an active - * resource including itself. - */ -struct i915_gem_active; - -typedef void (*i915_gem_retire_fn)(struct i915_gem_active *, - struct i915_request *); - -struct i915_gem_active { - struct i915_request __rcu *request; - struct list_head link; - i915_gem_retire_fn retire; -}; - -void i915_gem_retire_noop(struct i915_gem_active *, - struct i915_request *request); - -/** - * init_request_active - prepares the activity tracker for use - * @active - the active tracker - * @func - a callback when then the tracker is retired (becomes idle), - * can be NULL - * - * init_request_active() prepares the embedded @active struct for use as - * an activity tracker, that is for tracking the last known active request - * associated with it. When the last request becomes idle, when it is retired - * after completion, the optional callback @func is invoked. 
- */ -static inline void -init_request_active(struct i915_gem_active *active, - i915_gem_retire_fn retire) -{ - RCU_INIT_POINTER(active->request, NULL); - INIT_LIST_HEAD(&active->link); - active->retire = retire ?: i915_gem_retire_noop; -} - -/** - * i915_gem_active_set - updates the tracker to watch the current request - * @active - the active tracker - * @request - the request to watch - * - * i915_gem_active_set() watches the given @request for completion. Whilst - * that @request is busy, the @active reports busy. When that @request is - * retired, the @active tracker is updated to report idle. - */ -static inline void -i915_gem_active_set(struct i915_gem_active *active, - struct i915_request *request) -{ - list_move(&active->link, &request->active_list); - rcu_assign_pointer(active->request, request); -} - -/** - * i915_gem_active_set_retire_fn - updates the retirement callback - * @active - the active tracker - * @fn - the routine called when the request is retired - * @mutex - struct_mutex used to guard retirements - * - * i915_gem_active_set_retire_fn() updates the function pointer that - * is called when the final request associated with the @active tracker - * is retired. - */ -static inline void -i915_gem_active_set_retire_fn(struct i915_gem_active *active, - i915_gem_retire_fn fn, - struct mutex *mutex) -{ - lockdep_assert_held(mutex); - active->retire = fn ?: i915_gem_retire_noop; -} - -static inline struct i915_request * -__i915_gem_active_peek(const struct i915_gem_active *active) -{ - /* - * Inside the error capture (running with the driver in an unknown - * state), we want to bend the rules slightly (a lot). - * - * Work is in progress to make it safer, in the meantime this keeps - * the known issue from spamming the logs. - */ - return rcu_dereference_protected(active->request, 1); -} - -/** - * i915_gem_active_raw - return the active request - * @active - the active tracker - * - * i915_gem_active_raw() returns the current request being tracked, or NULL. - * It does not obtain a reference on the request for the caller, so the caller - * must hold struct_mutex. - */ -static inline struct i915_request * -i915_gem_active_raw(const struct i915_gem_active *active, struct mutex *mutex) -{ - return rcu_dereference_protected(active->request, - lockdep_is_held(mutex)); -} - -/** - * i915_gem_active_peek - report the active request being monitored - * @active - the active tracker - * - * i915_gem_active_peek() returns the current request being tracked if - * still active, or NULL. It does not obtain a reference on the request - * for the caller, so the caller must hold struct_mutex. - */ -static inline struct i915_request * -i915_gem_active_peek(const struct i915_gem_active *active, struct mutex *mutex) -{ - struct i915_request *request; - - request = i915_gem_active_raw(active, mutex); - if (!request || i915_request_completed(request)) - return NULL; - - return request; -} - -/** - * i915_gem_active_get - return a reference to the active request - * @active - the active tracker - * - * i915_gem_active_get() returns a reference to the active request, or NULL - * if the active tracker is idle. The caller must hold struct_mutex. 
- */ -static inline struct i915_request * -i915_gem_active_get(const struct i915_gem_active *active, struct mutex *mutex) -{ - return i915_request_get(i915_gem_active_peek(active, mutex)); -} - -/** - * __i915_gem_active_get_rcu - return a reference to the active request - * @active - the active tracker - * - * __i915_gem_active_get_rcu() returns a reference to the active request, or NULL - * if the active tracker is idle. The caller must hold the RCU read lock, but - * the returned pointer is safe to use outside of RCU. - */ -static inline struct i915_request * -__i915_gem_active_get_rcu(const struct i915_gem_active *active) -{ - /* - * Performing a lockless retrieval of the active request is super - * tricky. SLAB_TYPESAFE_BY_RCU merely guarantees that the backing - * slab of request objects will not be freed whilst we hold the - * RCU read lock. It does not guarantee that the request itself - * will not be freed and then *reused*. Viz, - * - * Thread A Thread B - * - * rq = active.request - * retire(rq) -> free(rq); - * (rq is now first on the slab freelist) - * active.request = NULL - * - * rq = new submission on a new object - * ref(rq) - * - * To prevent the request from being reused whilst the caller - * uses it, we take a reference like normal. Whilst acquiring - * the reference we check that it is not in a destroyed state - * (refcnt == 0). That prevents the request being reallocated - * whilst the caller holds on to it. To check that the request - * was not reallocated as we acquired the reference we have to - * check that our request remains the active request across - * the lookup, in the same manner as a seqlock. The visibility - * of the pointer versus the reference counting is controlled - * by using RCU barriers (rcu_dereference and rcu_assign_pointer). - * - * In the middle of all that, we inspect whether the request is - * complete. Retiring is lazy so the request may be completed long - * before the active tracker is updated. Querying whether the - * request is complete is far cheaper (as it involves no locked - * instructions setting cachelines to exclusive) than acquiring - * the reference, so we do it first. The RCU read lock ensures the - * pointer dereference is valid, but does not ensure that the - * seqno or the HWS is the right one! However, if the request was - * reallocated, that means the active tracker's request was complete. - * If the new request is also complete, then both are and we can - * just report the active tracker is idle. If the new request is - * incomplete, then we acquire a reference on it and check that - * it remained the active request. - * - * It is then imperative that we do not zero the request on - * reallocation, so that we can chase the dangling pointers! - * See i915_request_alloc(). - */ - do { - struct i915_request *request; - - request = rcu_dereference(active->request); - if (!request || i915_request_completed(request)) - return NULL; - - /* - * An especially silly compiler could decide to recompute the - * result of i915_request_completed, more specifically - * re-emit the load for request->fence.seqno. A race would catch - * a later seqno value, which could flip the result from true to - * false. That means some of the instructions below might not - * be executed, while later ones are. Due to - * barriers within the refcounting the inconsistency can't reach - * past the call to i915_request_get_rcu, but not executing - * that while still executing i915_request_put() creates - * havoc enough.
Prevent this with a compiler barrier. - */ - barrier(); - - request = i915_request_get_rcu(request); - - /* - * What stops the following rcu_access_pointer() from occurring - * before the above i915_request_get_rcu()? If we were - * to read the value before pausing to get the reference to - * the request, we may not notice a change in the active - * tracker. - * - * The rcu_access_pointer() is a mere compiler barrier, which - * means both the CPU and compiler are free to perform the - * memory read without constraint. The compiler only has to - * ensure that any operations after the rcu_access_pointer() - * occur afterwards in program order. This means the read may - * be performed earlier by an out-of-order CPU, or adventurous - * compiler. - * - * The atomic operation at the heart of - * i915_request_get_rcu(), see dma_fence_get_rcu(), is - * atomic_inc_not_zero() which is only a full memory barrier - * when successful. That is, if i915_request_get_rcu() - * returns the request (and so with the reference counted - * incremented) then the following read for rcu_access_pointer() - * must occur after the atomic operation and so confirm - * that this request is the one currently being tracked. - * - * The corresponding write barrier is part of - * rcu_assign_pointer(). - */ - if (!request || request == rcu_access_pointer(active->request)) - return rcu_pointer_handoff(request); - - i915_request_put(request); - } while (1); -} - -/** - * i915_gem_active_get_unlocked - return a reference to the active request - * @active - the active tracker - * - * i915_gem_active_get_unlocked() returns a reference to the active request, - * or NULL if the active tracker is idle. The reference is obtained under RCU, - * so no locking is required by the caller. - * - * The reference should be freed with i915_request_put(). - */ -static inline struct i915_request * -i915_gem_active_get_unlocked(const struct i915_gem_active *active) -{ - struct i915_request *request; - - rcu_read_lock(); - request = __i915_gem_active_get_rcu(active); - rcu_read_unlock(); - - return request; -} - -/** - * i915_gem_active_isset - report whether the active tracker is assigned - * @active - the active tracker - * - * i915_gem_active_isset() returns true if the active tracker is currently - * assigned to a request. Due to the lazy retiring, that request may be idle - * and this may report stale information. - */ -static inline bool -i915_gem_active_isset(const struct i915_gem_active *active) -{ - return rcu_access_pointer(active->request); -} - -/** - * i915_gem_active_wait - waits until the request is completed - * @active - the active request on which to wait - * @flags - how to wait - * - * i915_gem_active_wait() waits until the request is completed before - * returning, without requiring any locks to be held. Note that it does not - * retire any requests before returning. - * - * This function relies on RCU in order to acquire the reference to the active - * request without holding any locks. See __i915_gem_active_get_rcu() for the - * gory details on how that is managed. Once the reference is acquired, we - * can then wait upon the request, and afterwards release our reference, - * free of any locking. - * - * This function wraps i915_request_wait(), see it for the full details on - * the arguments. - * - * Returns 0 if successful, or a negative error code.
- */ -static inline int -i915_gem_active_wait(const struct i915_gem_active *active, unsigned int flags) -{ - struct i915_request *request; - long ret = 0; - - request = i915_gem_active_get_unlocked(active); - if (request) { - ret = i915_request_wait(request, flags, MAX_SCHEDULE_TIMEOUT); - i915_request_put(request); - } - - return ret < 0 ? ret : 0; -} - -/** - * i915_gem_active_retire - waits until the request is retired - * @active - the active request on which to wait - * - * i915_gem_active_retire() waits until the request is completed, - * and then ensures that at least the retirement handler for this - * @active tracker is called before returning. If the @active - * tracker is idle, the function returns immediately. - */ -static inline int __must_check -i915_gem_active_retire(struct i915_gem_active *active, - struct mutex *mutex) -{ - struct i915_request *request; - long ret; - - request = i915_gem_active_raw(active, mutex); - if (!request) - return 0; - - ret = i915_request_wait(request, - I915_WAIT_INTERRUPTIBLE | I915_WAIT_LOCKED, - MAX_SCHEDULE_TIMEOUT); - if (ret < 0) - return ret; - - list_del_init(&active->link); - RCU_INIT_POINTER(active->request, NULL); - - active->retire(active, request); - - return 0; -} - -#define for_each_active(mask, idx) \ - for (; mask ? idx = ffs(mask) - 1, 1 : 0; mask &= ~BIT(idx)) - #endif /* I915_REQUEST_H */ diff --git a/drivers/gpu/drm/i915/i915_reset.c b/drivers/gpu/drm/i915/i915_reset.c index 4462007a681c..0e0ddf2e6815 100644 --- a/drivers/gpu/drm/i915/i915_reset.c +++ b/drivers/gpu/drm/i915/i915_reset.c @@ -862,7 +862,7 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915) struct i915_request *rq; long timeout; - rq = i915_gem_active_get_unlocked(&tl->last_request); + rq = i915_active_request_get_unlocked(&tl->last_request); if (!rq) continue; diff --git a/drivers/gpu/drm/i915/i915_timeline.c b/drivers/gpu/drm/i915/i915_timeline.c index dcff3ae96683..b2202d2e58a2 100644 --- a/drivers/gpu/drm/i915/i915_timeline.c +++ b/drivers/gpu/drm/i915/i915_timeline.c @@ -163,8 +163,8 @@ int i915_timeline_init(struct drm_i915_private *i915, spin_lock_init(&timeline->lock); - init_request_active(&timeline->barrier, NULL); - init_request_active(&timeline->last_request, NULL); + INIT_ACTIVE_REQUEST(&timeline->barrier); + INIT_ACTIVE_REQUEST(&timeline->last_request); INIT_LIST_HEAD(&timeline->requests); i915_syncmap_init(&timeline->sync); @@ -236,7 +236,7 @@ void i915_timeline_fini(struct i915_timeline *timeline) { GEM_BUG_ON(timeline->pin_count); GEM_BUG_ON(!list_empty(&timeline->requests)); - GEM_BUG_ON(i915_gem_active_isset(&timeline->barrier)); + GEM_BUG_ON(i915_active_request_isset(&timeline->barrier)); i915_syncmap_free(&timeline->sync); hwsp_free(timeline); @@ -311,25 +311,6 @@ void i915_timeline_unpin(struct i915_timeline *tl) __i915_vma_unpin(tl->hwsp_ggtt); } -int i915_timeline_set_barrier(struct i915_timeline *tl, struct i915_request *rq) -{ - struct i915_request *old; - int err; - - lockdep_assert_held(&rq->i915->drm.struct_mutex); - - /* Must maintain ordering wrt existing barriers */ - old = i915_gem_active_raw(&tl->barrier, &rq->i915->drm.struct_mutex); - if (old) { - err = i915_request_await_dma_fence(rq, &old->fence); - if (err) - return err; - } - - i915_gem_active_set(&tl->barrier, rq); - return 0; -} - void __i915_timeline_free(struct kref *kref) { struct i915_timeline *timeline = diff --git a/drivers/gpu/drm/i915/i915_timeline.h b/drivers/gpu/drm/i915/i915_timeline.h index d167e04073c5..7bec7d2e45bf 100644 --- 
a/drivers/gpu/drm/i915/i915_timeline.h +++ b/drivers/gpu/drm/i915/i915_timeline.h @@ -28,6 +28,7 @@ #include <linux/list.h> #include <linux/kref.h> +#include "i915_active.h" #include "i915_request.h" #include "i915_syncmap.h" #include "i915_utils.h" @@ -58,10 +59,10 @@ struct i915_timeline { /* Contains an RCU guarded pointer to the last request. No reference is * held to the request; users must carefully acquire a reference to - * the request using i915_gem_active_get_request_rcu(), or hold the + * the request using i915_active_request_get_request_rcu(), or hold the * struct_mutex. */ - struct i915_gem_active last_request; + struct i915_active_request last_request; /** * We track the most recent seqno that we wait on in every context so @@ -82,7 +83,7 @@ struct i915_timeline { * subsequent submissions to this timeline be executed only after the * barrier has been completed. */ - struct i915_gem_active barrier; + struct i915_active_request barrier; struct list_head link; const char *name; @@ -174,7 +175,10 @@ void i915_timelines_fini(struct drm_i915_private *i915); * submissions on @timeline. Subsequent requests will not be submitted to GPU * until the barrier has been completed. */ -int i915_timeline_set_barrier(struct i915_timeline *timeline, - struct i915_request *rq); +static inline int +i915_timeline_set_barrier(struct i915_timeline *tl, struct i915_request *rq) +{ + return i915_active_request_set(&tl->barrier, rq); +} #endif diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c index d4772061e642..b713bed20c38 100644 --- a/drivers/gpu/drm/i915/i915_vma.c +++ b/drivers/gpu/drm/i915/i915_vma.c @@ -120,7 +120,7 @@ vma_create(struct drm_i915_gem_object *obj, return ERR_PTR(-ENOMEM); i915_active_init(vm->i915, &vma->active, __i915_vma_retire); - init_request_active(&vma->last_fence, NULL); + INIT_ACTIVE_REQUEST(&vma->last_fence); vma->vm = vm; vma->ops = &vm->vma_ops; @@ -808,7 +808,7 @@ static void __i915_vma_destroy(struct i915_vma *vma) GEM_BUG_ON(vma->node.allocated); GEM_BUG_ON(vma->fence); - GEM_BUG_ON(i915_gem_active_isset(&vma->last_fence)); + GEM_BUG_ON(i915_active_request_isset(&vma->last_fence)); mutex_lock(&vma->vm->mutex); list_del(&vma->vm_link); @@ -942,14 +942,14 @@ int i915_vma_move_to_active(struct i915_vma *vma, obj->write_domain = I915_GEM_DOMAIN_RENDER; if (intel_fb_obj_invalidate(obj, ORIGIN_CS)) - i915_gem_active_set(&obj->frontbuffer_write, rq); + __i915_active_request_set(&obj->frontbuffer_write, rq); obj->read_domains = 0; } obj->read_domains |= I915_GEM_GPU_DOMAINS; if (flags & EXEC_OBJECT_NEEDS_FENCE) - i915_gem_active_set(&vma->last_fence, rq); + __i915_active_request_set(&vma->last_fence, rq); export_fence(vma, rq, flags); return 0; @@ -986,8 +986,8 @@ int i915_vma_unbind(struct i915_vma *vma) if (ret) goto unpin; - ret = i915_gem_active_retire(&vma->last_fence, - &vma->vm->i915->drm.struct_mutex); + ret = i915_active_request_retire(&vma->last_fence, + &vma->vm->i915->drm.struct_mutex); unpin: __i915_vma_unpin(vma); if (ret) diff --git a/drivers/gpu/drm/i915/i915_vma.h b/drivers/gpu/drm/i915/i915_vma.h index 3c03d4569481..7c742027f866 100644 --- a/drivers/gpu/drm/i915/i915_vma.h +++ b/drivers/gpu/drm/i915/i915_vma.h @@ -110,7 +110,7 @@ struct i915_vma { #define I915_VMA_GGTT_WRITE BIT(15) struct i915_active active; - struct i915_gem_active last_fence; + struct i915_active_request last_fence; /** * Support different GGTT views into the same object.
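For readers following the removed kerneldoc above, the lockless acquisition that __i915_gem_active_get_rcu() performed boils down to the pattern sketched below. This is an illustrative reduction only, with hypothetical stand-in names (struct obj, struct tracker, obj_release() are not the i915 API); the driver itself routes the refcount through dma_fence_get_rcu() and allocates requests from a SLAB_TYPESAFE_BY_RCU slab rather than using a bare kref:

#include <linux/kref.h>
#include <linux/rcupdate.h>

struct obj {
	struct kref ref;
	bool completed;		/* stands in for i915_request_completed() */
};

struct tracker {
	struct obj __rcu *ptr;	/* stands in for i915_gem_active.request */
};

static void obj_release(struct kref *ref)
{
	/* The real backing store is SLAB_TYPESAFE_BY_RCU, hence the dance. */
}

static struct obj *tracker_get_unlocked(struct tracker *t)
{
	struct obj *o;

	rcu_read_lock();
	do {
		o = rcu_dereference(t->ptr);

		/* Completion is the cheap test, so check it first. */
		if (!o || READ_ONCE(o->completed)) {
			o = NULL;
			break;
		}

		/* Refuse an object already on its way to being freed... */
		if (!kref_get_unless_zero(&o->ref))
			continue;	/* slot is being freed; re-read */

		/*
		 * ...then confirm, seqlock-style, that the tracker still
		 * points at the object we pinned. If not, the slab slot was
		 * recycled under us: drop the stray reference and retry.
		 */
		if (o == rcu_access_pointer(t->ptr))
			break;

		kref_put(&o->ref, obj_release);
	} while (1);
	rcu_read_unlock();

	return o;
}

On success, the atomic_inc_not_zero() inside kref_get_unless_zero() supplies the full memory barrier that makes the final rcu_access_pointer() re-check meaningful, which is the point laboured at length in the removed comments.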
diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c index 71c01eb13af1..49fa43ff02ba 100644 --- a/drivers/gpu/drm/i915/intel_engine_cs.c +++ b/drivers/gpu/drm/i915/intel_engine_cs.c @@ -1086,7 +1086,7 @@ bool intel_engine_has_kernel_context(const struct intel_engine_cs *engine) * the last request that remains in the timeline. When idle, it is * the last executed context as tracked by retirement. */ - rq = __i915_gem_active_peek(&engine->timeline.last_request); + rq = __i915_active_request_peek(&engine->timeline.last_request); if (rq) return rq->hw_context == kernel_context; else diff --git a/drivers/gpu/drm/i915/intel_overlay.c b/drivers/gpu/drm/i915/intel_overlay.c index f68c7975006c..fc2e283d326b 100644 --- a/drivers/gpu/drm/i915/intel_overlay.c +++ b/drivers/gpu/drm/i915/intel_overlay.c @@ -184,7 +184,7 @@ struct intel_overlay { struct overlay_registers __iomem *regs; u32 flip_addr; /* flip handling */ - struct i915_gem_active last_flip; + struct i915_active_request last_flip; }; static void i830_overlay_clock_gating(struct drm_i915_private *dev_priv, @@ -212,23 +212,23 @@ static void i830_overlay_clock_gating(struct drm_i915_private *dev_priv, static void intel_overlay_submit_request(struct intel_overlay *overlay, struct i915_request *rq, - i915_gem_retire_fn retire) + i915_active_retire_fn retire) { - GEM_BUG_ON(i915_gem_active_peek(&overlay->last_flip, - &overlay->i915->drm.struct_mutex)); - i915_gem_active_set_retire_fn(&overlay->last_flip, retire, - &overlay->i915->drm.struct_mutex); - i915_gem_active_set(&overlay->last_flip, rq); + GEM_BUG_ON(i915_active_request_peek(&overlay->last_flip, + &overlay->i915->drm.struct_mutex)); + i915_active_request_set_retire_fn(&overlay->last_flip, retire, + &overlay->i915->drm.struct_mutex); + __i915_active_request_set(&overlay->last_flip, rq); i915_request_add(rq); } static int intel_overlay_do_wait_request(struct intel_overlay *overlay, struct i915_request *rq, - i915_gem_retire_fn retire) + i915_active_retire_fn retire) { intel_overlay_submit_request(overlay, rq, retire); - return i915_gem_active_retire(&overlay->last_flip, - &overlay->i915->drm.struct_mutex); + return i915_active_request_retire(&overlay->last_flip, + &overlay->i915->drm.struct_mutex); } static struct i915_request *alloc_request(struct intel_overlay *overlay) @@ -349,8 +349,9 @@ static void intel_overlay_release_old_vma(struct intel_overlay *overlay) i915_vma_put(vma); } -static void intel_overlay_release_old_vid_tail(struct i915_gem_active *active, - struct i915_request *rq) +static void +intel_overlay_release_old_vid_tail(struct i915_active_request *active, + struct i915_request *rq) { struct intel_overlay *overlay = container_of(active, typeof(*overlay), last_flip); @@ -358,7 +359,7 @@ static void intel_overlay_release_old_vid_tail(struct i915_gem_active *active, intel_overlay_release_old_vma(overlay); } -static void intel_overlay_off_tail(struct i915_gem_active *active, +static void intel_overlay_off_tail(struct i915_active_request *active, struct i915_request *rq) { struct intel_overlay *overlay = @@ -421,8 +422,8 @@ static int intel_overlay_off(struct intel_overlay *overlay) * We have to be careful not to repeat work forever and make forward progress.
*/ static int intel_overlay_recover_from_interrupt(struct intel_overlay *overlay) { - return i915_gem_active_retire(&overlay->last_flip, - &overlay->i915->drm.struct_mutex); + return i915_active_request_retire(&overlay->last_flip, + &overlay->i915->drm.struct_mutex); } /* Wait for pending overlay flip and release old frame. @@ -1355,7 +1356,7 @@ void intel_overlay_setup(struct drm_i915_private *dev_priv) overlay->contrast = 75; overlay->saturation = 146; - init_request_active(&overlay->last_flip, NULL); + INIT_ACTIVE_REQUEST(&overlay->last_flip); mutex_lock(&dev_priv->drm.struct_mutex); diff --git a/drivers/gpu/drm/i915/selftests/mock_timeline.c b/drivers/gpu/drm/i915/selftests/mock_timeline.c index e5659aaa856d..d2de9ece2118 100644 --- a/drivers/gpu/drm/i915/selftests/mock_timeline.c +++ b/drivers/gpu/drm/i915/selftests/mock_timeline.c @@ -15,8 +15,8 @@ void mock_timeline_init(struct i915_timeline *timeline, u64 context) spin_lock_init(&timeline->lock); - init_request_active(&timeline->barrier, NULL); - init_request_active(&timeline->last_request, NULL); + INIT_ACTIVE_REQUEST(&timeline->barrier); + INIT_ACTIVE_REQUEST(&timeline->last_request); INIT_LIST_HEAD(&timeline->requests); i915_syncmap_init(&timeline->sync); -- cgit v1.2.3
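Taken together, the conversion pattern for a driver-side tracker after this series looks like the following. A minimal sketch only, modelled on the intel_overlay hunks above: struct foo and its members are hypothetical, while INIT_ACTIVE_REQUEST(), __i915_active_request_set() and i915_active_request_retire() are the helpers used by this patch:

#include "i915_active.h"
#include "i915_request.h"

struct foo {
	struct i915_active_request last_op;	/* hypothetical tracker */
};

static void foo_init(struct foo *foo)
{
	/* Replaces init_request_active(&foo->last_op, NULL). */
	INIT_ACTIVE_REQUEST(&foo->last_op);
}

static void foo_submit(struct foo *foo, struct i915_request *rq)
{
	/* Caller holds struct_mutex, as in intel_overlay_submit_request(). */
	__i915_active_request_set(&foo->last_op, rq);
	i915_request_add(rq);
}

static int foo_sync(struct foo *foo, struct drm_i915_private *i915)
{
	/* Waits for completion and fires the retire callback, if one is set. */
	return i915_active_request_retire(&foo->last_op,
					  &i915->drm.struct_mutex);
}

The shape of the calls is unchanged from the old i915_gem_active API; only the type and function names move into the i915_active family, which is what lets this commit be almost entirely mechanical.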