1 files changed, 137 insertions, 238 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index 83599f2a0387..4303b447efe8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -47,9 +47,30 @@
  * that the the relevant GPU caches have been flushed.
  */
 
+struct amdgpu_fence {
+	struct fence base;
+
+	/* RB, DMA, etc. */
+	struct amdgpu_ring		*ring;
+};
+
 static struct kmem_cache *amdgpu_fence_slab;
 static atomic_t amdgpu_fence_slab_ref = ATOMIC_INIT(0);
 
+/*
+ * Cast helper: returns NULL if @f is not an amdgpu fence
+ */
+static const struct fence_ops amdgpu_fence_ops;
+static inline struct amdgpu_fence *to_amdgpu_fence(struct fence *f)
+{
+	struct amdgpu_fence *__f = container_of(f, struct amdgpu_fence, base);
+
+	if (__f->base.ops == &amdgpu_fence_ops)
+		return __f;
+
+	return NULL;
+}
+
 /**
  * amdgpu_fence_write - write a fence value
  *
@@ -82,7 +103,7 @@ static u32 amdgpu_fence_read(struct amdgpu_ring *ring)
 	if (drv->cpu_addr)
 		seq = le32_to_cpu(*drv->cpu_addr);
 	else
-		seq = lower_32_bits(atomic64_read(&drv->last_seq));
+		seq = atomic_read(&drv->last_seq);
 
 	return seq;
 }
@@ -100,20 +121,32 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring, struct fence **f)
 {
 	struct amdgpu_device *adev = ring->adev;
 	struct amdgpu_fence *fence;
+	struct fence **ptr;
+	uint32_t seq;
 
 	fence = kmem_cache_alloc(amdgpu_fence_slab, GFP_KERNEL);
 	if (fence == NULL)
 		return -ENOMEM;
 
-	fence->seq = ++ring->fence_drv.sync_seq;
+	seq = ++ring->fence_drv.sync_seq;
 	fence->ring = ring;
 	fence_init(&fence->base, &amdgpu_fence_ops,
-		   &ring->fence_drv.fence_queue.lock,
+		   &ring->fence_drv.lock,
 		   adev->fence_context + ring->idx,
-		   fence->seq);
+		   seq);
 	amdgpu_ring_emit_fence(ring, ring->fence_drv.gpu_addr,
-			       fence->seq, AMDGPU_FENCE_FLAG_INT);
+			       seq, AMDGPU_FENCE_FLAG_INT);
+
+	ptr = &ring->fence_drv.fences[seq & ring->fence_drv.num_fences_mask];
+	/* This function can't be called concurrently anyway, otherwise
+	 * emitting the fence would mess up the hardware ring buffer.
+	 */
+	BUG_ON(rcu_dereference_protected(*ptr, 1));
+
+	rcu_assign_pointer(*ptr, fence_get(&fence->base));
+
 	*f = &fence->base;
+
 	return 0;
 }
 
@@ -131,89 +164,48 @@ static void amdgpu_fence_schedule_fallback(struct amdgpu_ring *ring)
 }
 
 /**
- * amdgpu_fence_activity - check for fence activity
+ * amdgpu_fence_process - check for fence activity
  *
  * @ring: pointer to struct amdgpu_ring
  *
  * Checks the current fence value and calculates the last
- * signalled fence value. Returns true if activity occured
- * on the ring, and the fence_queue should be waken up.
+ * signalled fence value. Signals and drops the reference of
+ * every fence whose sequence number has been reached.
  */
-static bool amdgpu_fence_activity(struct amdgpu_ring *ring)
+void amdgpu_fence_process(struct amdgpu_ring *ring)
 {
-	uint64_t seq, last_seq, last_emitted;
-	unsigned count_loop = 0;
-	bool wake = false;
-
-	/* Note there is a scenario here for an infinite loop but it's
-	 * very unlikely to happen. For it to happen, the current polling
-	 * process need to be interrupted by another process and another
-	 * process needs to update the last_seq btw the atomic read and
-	 * xchg of the current process.
-	 *
-	 * More over for this to go in infinite loop there need to be
-	 * continuously new fence signaled ie amdgpu_fence_read needs
-	 * to return a different value each time for both the currently
-	 * polling process and the other process that xchg the last_seq
-	 * btw atomic read and xchg of the current process. And the
-	 * value the other process set as last seq must be higher than
-	 * the seq value we just read. Which means that current process
-	 * need to be interrupted after amdgpu_fence_read and before
-	 * atomic xchg.
-	 *
-	 * To be even more safe we count the number of time we loop and
-	 * we bail after 10 loop just accepting the fact that we might
-	 * have temporarly set the last_seq not to the true real last
-	 * seq but to an older one.
-	 */
-	last_seq = atomic64_read(&ring->fence_drv.last_seq);
+	struct amdgpu_fence_driver *drv = &ring->fence_drv;
+	uint32_t seq, last_seq;
+	int r;
+
 	do {
-		last_emitted = ring->fence_drv.sync_seq;
+		last_seq = atomic_read(&ring->fence_drv.last_seq);
 		seq = amdgpu_fence_read(ring);
-		seq |= last_seq & 0xffffffff00000000LL;
-		if (seq < last_seq) {
-			seq &= 0xffffffff;
-			seq |= last_emitted & 0xffffffff00000000LL;
-		}
 
-		if (seq <= last_seq || seq > last_emitted) {
-			break;
-		}
-		/* If we loop over we don't want to return without
-		 * checking if a fence is signaled as it means that the
-		 * seq we just read is different from the previous on.
-		 */
-		wake = true;
-		last_seq = seq;
-		if ((count_loop++) > 10) {
-			/* We looped over too many time leave with the
-			 * fact that we might have set an older fence
-			 * seq then the current real last seq as signaled
-			 * by the hw.
-			 */
-			break;
-		}
-	} while (atomic64_xchg(&ring->fence_drv.last_seq, seq) > seq);
+	} while (atomic_cmpxchg(&drv->last_seq, last_seq, seq) != last_seq);
 
-	if (seq < last_emitted)
+	if (seq != ring->fence_drv.sync_seq)
 		amdgpu_fence_schedule_fallback(ring);
 
-	return wake;
-}
+	while (last_seq != seq) {
+		struct fence *fence, **ptr;
 
-/**
- * amdgpu_fence_process - process a fence
- *
- * @adev: amdgpu_device pointer
- * @ring: ring index the fence is associated with
- *
- * Checks the current fence value and wakes the fence queue
- * if the sequence number has increased (all asics).
- */
-void amdgpu_fence_process(struct amdgpu_ring *ring)
-{
-	if (amdgpu_fence_activity(ring))
-		wake_up_all(&ring->fence_drv.fence_queue);
+		ptr = &drv->fences[++last_seq & drv->num_fences_mask];
+
+		/* There is always exactly one thread signaling this fence slot */
+		fence = rcu_dereference_protected(*ptr, 1);
+		rcu_assign_pointer(*ptr, NULL);
+
+		BUG_ON(!fence);
+
+		r = fence_signal(fence);
+		if (!r)
+			FENCE_TRACE(fence, "signaled from irq context\n");
+		else
+			BUG();
+
+		fence_put(fence);
+	}
 }
 
 /**
@@ -231,77 +223,6 @@ static void amdgpu_fence_fallback(unsigned long arg)
 }
 
 /**
- * amdgpu_fence_seq_signaled - check if a fence sequence number has signaled
- *
- * @ring: ring the fence is associated with
- * @seq: sequence number
- *
- * Check if the last signaled fence sequnce number is >= the requested
- * sequence number (all asics).
- * Returns true if the fence has signaled (current fence value
- * is >= requested value) or false if it has not (current fence
- * value is < the requested value.  Helper function for
- * amdgpu_fence_signaled().
- */
-static bool amdgpu_fence_seq_signaled(struct amdgpu_ring *ring, u64 seq)
-{
-	if (atomic64_read(&ring->fence_drv.last_seq) >= seq)
-		return true;
-
-	/* poll new last sequence at least once */
-	amdgpu_fence_process(ring);
-	if (atomic64_read(&ring->fence_drv.last_seq) >= seq)
-		return true;
-
-	return false;
-}
-
-/*
- * amdgpu_ring_wait_seq - wait for seq of the specific ring to signal
- * @ring: ring to wait on for the seq number
- * @seq: seq number wait for
- *
- * return value:
- * 0: seq signaled, and gpu not hang
- * -EINVAL: some paramter is not valid
- */
-static int amdgpu_fence_ring_wait_seq(struct amdgpu_ring *ring, uint64_t seq)
-{
-	BUG_ON(!ring);
-	if (seq > ring->fence_drv.sync_seq)
-		return -EINVAL;
-
-	if (atomic64_read(&ring->fence_drv.last_seq) >= seq)
-		return 0;
-
-	amdgpu_fence_schedule_fallback(ring);
-	wait_event(ring->fence_drv.fence_queue,
-		   amdgpu_fence_seq_signaled(ring, seq));
-
-	return 0;
-}
-
-/**
- * amdgpu_fence_wait_next - wait for the next fence to signal
- *
- * @adev: amdgpu device pointer
- * @ring: ring index the fence is associated with
- *
- * Wait for the next fence on the requested ring to signal (all asics).
- * Returns 0 if the next fence has passed, error for all other cases.
- * Caller must hold ring lock.
- */
-int amdgpu_fence_wait_next(struct amdgpu_ring *ring)
-{
-	uint64_t seq = atomic64_read(&ring->fence_drv.last_seq) + 1ULL;
-
-	if (seq >= ring->fence_drv.sync_seq)
-		return -ENOENT;
-
-	return amdgpu_fence_ring_wait_seq(ring, seq);
-}
-
-/**
  * amdgpu_fence_wait_empty - wait for all fences to signal
  *
  * @adev: amdgpu device pointer
@@ -309,16 +230,28 @@ int amdgpu_fence_wait_next(struct amdgpu_ring *ring)
  *
  * Wait for all fences on the requested ring to signal (all asics).
  * Returns 0 if the fences have passed, error for all other cases.
- * Caller must hold ring lock.
  */
 int amdgpu_fence_wait_empty(struct amdgpu_ring *ring)
 {
-	uint64_t seq = ring->fence_drv.sync_seq;
+	uint64_t seq = ACCESS_ONCE(ring->fence_drv.sync_seq);
+	struct fence *fence, **ptr;
+	int r;
 
 	if (!seq)
 		return 0;
 
-	return amdgpu_fence_ring_wait_seq(ring, seq);
+	ptr = &ring->fence_drv.fences[seq & ring->fence_drv.num_fences_mask];
+	rcu_read_lock();
+	fence = rcu_dereference(*ptr);
+	if (!fence || !fence_get_rcu(fence)) {
+		rcu_read_unlock();
+		return 0;
+	}
+	rcu_read_unlock();
+
+	r = fence_wait(fence, false);
+	fence_put(fence);
+	return r;
 }
 
 /**
@@ -338,13 +271,10 @@ unsigned amdgpu_fence_count_emitted(struct amdgpu_ring *ring)
 	 * but it's ok to report slightly wrong fence count here.
 	 */
 	amdgpu_fence_process(ring);
-	emitted = ring->fence_drv.sync_seq
-		- atomic64_read(&ring->fence_drv.last_seq);
-	/* to avoid 32bits warp around */
-	if (emitted > 0x10000000)
-		emitted = 0x10000000;
-
-	return (unsigned)emitted;
+	emitted = 0x100000000ull;
+	emitted -= atomic_read(&ring->fence_drv.last_seq);
+	emitted += ACCESS_ONCE(ring->fence_drv.sync_seq);
+	return lower_32_bits(emitted);
 }
 
 /**
@@ -376,7 +306,7 @@ int amdgpu_fence_driver_start_ring(struct amdgpu_ring *ring,
 		ring->fence_drv.cpu_addr = adev->uvd.cpu_addr + index;
 		ring->fence_drv.gpu_addr = adev->uvd.gpu_addr + index;
 	}
-	amdgpu_fence_write(ring, atomic64_read(&ring->fence_drv.last_seq));
+	amdgpu_fence_write(ring, atomic_read(&ring->fence_drv.last_seq));
 	amdgpu_irq_get(adev, irq_src, irq_type);
 
 	ring->fence_drv.irq_src = irq_src;
@@ -394,25 +324,36 @@ int amdgpu_fence_driver_start_ring(struct amdgpu_ring *ring,
  * for the requested ring.
  *
  * @ring: ring to init the fence driver on
+ * @num_hw_submission: number of entries on the hardware queue
  *
  * Init the fence driver for the requested ring (all asics).
  * Helper function for amdgpu_fence_driver_init().
  */
-int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring)
+int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring,
+				  unsigned num_hw_submission)
 {
 	long timeout;
 	int r;
 
+	/* num_hw_submission must be a non-zero power of two (0 breaks the mask) */
+	if (!num_hw_submission || (num_hw_submission & (num_hw_submission - 1)))
+		return -EINVAL;
+
 	ring->fence_drv.cpu_addr = NULL;
 	ring->fence_drv.gpu_addr = 0;
 	ring->fence_drv.sync_seq = 0;
-	atomic64_set(&ring->fence_drv.last_seq, 0);
+	atomic_set(&ring->fence_drv.last_seq, 0);
 	ring->fence_drv.initialized = false;
 
 	setup_timer(&ring->fence_drv.fallback_timer, amdgpu_fence_fallback,
 		    (unsigned long)ring);
 
-	init_waitqueue_head(&ring->fence_drv.fence_queue);
+	ring->fence_drv.num_fences_mask = num_hw_submission - 1;
+	spin_lock_init(&ring->fence_drv.lock);
+	ring->fence_drv.fences = kcalloc(num_hw_submission, sizeof(void *),
+					 GFP_KERNEL);
+	if (!ring->fence_drv.fences)
+		return -ENOMEM;
 
 	timeout = msecs_to_jiffies(amdgpu_lockup_timeout);
 	if (timeout == 0) {
@@ -426,7 +367,7 @@ int amdgpu_fence_driver_init_ring(struct amdgpu_ring *ring)
 		timeout = MAX_SCHEDULE_TIMEOUT;
 	}
 	r = amd_sched_init(&ring->sched, &amdgpu_sched_ops,
-			   amdgpu_sched_hw_submission,
+			   num_hw_submission,
 			   timeout, ring->name);
 	if (r) {
 		DRM_ERROR("Failed to create scheduler on ring %s.\n",
@@ -474,10 +415,9 @@ int amdgpu_fence_driver_init(struct amdgpu_device *adev)
  */
 void amdgpu_fence_driver_fini(struct amdgpu_device *adev)
 {
-	int i, r;
+	unsigned i, j;
+	int r;
 
-	if (atomic_dec_and_test(&amdgpu_fence_slab_ref))
-		kmem_cache_destroy(amdgpu_fence_slab);
 	for (i = 0; i < AMDGPU_MAX_RINGS; i++) {
 		struct amdgpu_ring *ring = adev->rings[i];
 
@@ -488,13 +428,18 @@ void amdgpu_fence_driver_fini(struct amdgpu_device *adev)
 			/* no need to trigger GPU reset as we are unloading */
 			amdgpu_fence_driver_force_completion(adev);
 		}
-		wake_up_all(&ring->fence_drv.fence_queue);
 		amdgpu_irq_put(adev, ring->fence_drv.irq_src,
 			       ring->fence_drv.irq_type);
 		amd_sched_fini(&ring->sched);
 		del_timer_sync(&ring->fence_drv.fallback_timer);
+		for (j = 0; j <= ring->fence_drv.num_fences_mask; ++j)
+			fence_put(ring->fence_drv.fences[j]);
+		kfree(ring->fence_drv.fences);
 		ring->fence_drv.initialized = false;
 	}
+
+	if (atomic_dec_and_test(&amdgpu_fence_slab_ref))
+		kmem_cache_destroy(amdgpu_fence_slab);
 }
 
 /**
@@ -591,103 +536,57 @@ static const char *amdgpu_fence_get_timeline_name(struct fence *f)
 }
 
 /**
- * amdgpu_fence_is_signaled - test if fence is signaled
- *
- * @f: fence to test
+ * amdgpu_fence_enable_signaling - enable signalling on fence
+ * @f: fence
  *
- * Test the fence sequence number if it is already signaled. If it isn't
- * signaled start fence processing. Returns True if the fence is signaled.
+ * Schedules the fallback timer unless it is already pending. The fence
+ * itself is signaled later from amdgpu_fence_process(), so this always
+ * returns true.
  */
-static bool amdgpu_fence_is_signaled(struct fence *f)
+static bool amdgpu_fence_enable_signaling(struct fence *f)
 {
 	struct amdgpu_fence *fence = to_amdgpu_fence(f);
 	struct amdgpu_ring *ring = fence->ring;
 
-	if (atomic64_read(&ring->fence_drv.last_seq) >= fence->seq)
-		return true;
-
-	amdgpu_fence_process(ring);
+	if (!timer_pending(&ring->fence_drv.fallback_timer))
+		amdgpu_fence_schedule_fallback(ring);
 
-	if (atomic64_read(&ring->fence_drv.last_seq) >= fence->seq)
-		return true;
+	FENCE_TRACE(&fence->base, "armed on ring %i!\n", ring->idx);
 
-	return false;
+	return true;
 }
 
 /**
- * amdgpu_fence_check_signaled - callback from fence_queue
+ * amdgpu_fence_free - free up the fence memory
+ *
+ * @rcu: RCU callback head
  *
- * this function is called with fence_queue lock held, which is also used
- * for the fence locking itself, so unlocked variants are used for
- * fence_signal, and remove_wait_queue.
+ * Free up the fence memory after the RCU grace period.
  */
-static int amdgpu_fence_check_signaled(wait_queue_t *wait, unsigned mode, int flags, void *key)
+static void amdgpu_fence_free(struct rcu_head *rcu)
 {
-	struct amdgpu_fence *fence;
-	struct amdgpu_device *adev;
-	u64 seq;
-	int ret;
-
-	fence = container_of(wait, struct amdgpu_fence, fence_wake);
-	adev = fence->ring->adev;
-
-	/*
-	 * We cannot use amdgpu_fence_process here because we're already
-	 * in the waitqueue, in a call from wake_up_all.
-	 */
-	seq = atomic64_read(&fence->ring->fence_drv.last_seq);
-	if (seq >= fence->seq) {
-		ret = fence_signal_locked(&fence->base);
-		if (!ret)
-			FENCE_TRACE(&fence->base, "signaled from irq context\n");
-		else
-			FENCE_TRACE(&fence->base, "was already signaled\n");
-
-		__remove_wait_queue(&fence->ring->fence_drv.fence_queue, &fence->fence_wake);
-		fence_put(&fence->base);
-	} else
-		FENCE_TRACE(&fence->base, "pending\n");
-	return 0;
+	struct fence *f = container_of(rcu, struct fence, rcu);
+	struct amdgpu_fence *fence = to_amdgpu_fence(f);
+	kmem_cache_free(amdgpu_fence_slab, fence);
 }
 
 /**
- * amdgpu_fence_enable_signaling - enable signalling on fence
+ * amdgpu_fence_release - callback that fence can be freed
+ *
  * @fence: fence
  *
- * This function is called with fence_queue lock held, and adds a callback
- * to fence_queue that checks if this fence is signaled, and if so it
- * signals the fence and removes itself.
+ * This function is called when the reference count becomes zero.
+ * It just RCU schedules freeing up the fence.
  */
-static bool amdgpu_fence_enable_signaling(struct fence *f)
-{
-	struct amdgpu_fence *fence = to_amdgpu_fence(f);
-	struct amdgpu_ring *ring = fence->ring;
-
-	if (atomic64_read(&ring->fence_drv.last_seq) >= fence->seq)
-		return false;
-
-	fence->fence_wake.flags = 0;
-	fence->fence_wake.private = NULL;
-	fence->fence_wake.func = amdgpu_fence_check_signaled;
-	__add_wait_queue(&ring->fence_drv.fence_queue, &fence->fence_wake);
-	fence_get(f);
-	if (!timer_pending(&ring->fence_drv.fallback_timer))
-		amdgpu_fence_schedule_fallback(ring);
-	FENCE_TRACE(&fence->base, "armed on ring %i!\n", ring->idx);
-	return true;
-}
-
 static void amdgpu_fence_release(struct fence *f)
 {
-	struct amdgpu_fence *fence = to_amdgpu_fence(f);
-	kmem_cache_free(amdgpu_fence_slab, fence);
+	call_rcu(&f->rcu, amdgpu_fence_free);
 }
 
-const struct fence_ops amdgpu_fence_ops = {
+static const struct fence_ops amdgpu_fence_ops = {
 	.get_driver_name = amdgpu_fence_get_driver_name,
 	.get_timeline_name = amdgpu_fence_get_timeline_name,
 	.enable_signaling = amdgpu_fence_enable_signaling,
-	.signaled = amdgpu_fence_is_signaled,
 	.wait = fence_default_wait,
 	.release = amdgpu_fence_release,
 };
@@ -711,9 +610,9 @@ static int amdgpu_debugfs_fence_info(struct seq_file *m, void *data)
 		amdgpu_fence_process(ring);
 
 		seq_printf(m, "--- ring %d (%s) ---\n", i, ring->name);
-		seq_printf(m, "Last signaled fence 0x%016llx\n",
-			   (unsigned long long)atomic64_read(&ring->fence_drv.last_seq));
-		seq_printf(m, "Last emitted        0x%016llx\n",
+		seq_printf(m, "Last signaled fence 0x%08x\n",
+			   atomic_read(&ring->fence_drv.last_seq));
+		seq_printf(m, "Last emitted        0x%08x\n",
 			   ring->fence_drv.sync_seq);
 	}
 	return 0;