drm/panthor: Recover from panthor_gpu_flush_caches() failures

We have seen a few cases where the whole memory subsystem is blocked and flush operations never complete. When that happens, we want to: - schedule a reset, so we can recover from this situation - in the reset path, we need to reset the pending_reqs so we can send new commands after the reset - if more panthor_gpu_flush_caches() operations are queued after the timeout, we skip them and return -EIO directly to avoid needless waits (the memory block won't miraculously work again) Note that we drop the WARN_ON()s because these hangs can be triggered with buggy GPU jobs created by the UMD, and there's no way we can prevent it. We do keep the error messages though. v2: - New patch v3: - Collect R-b - Explicitly mention the fact we dropped the WARN_ON()s in the commit message v4: - No changes Fixes: 5cd894e258c4 ("drm/panthor: Add the GPU logical block") Reviewed-by: Steven Price <steven.price@arm.com> Link: https://patch.msgid.link/20251128084841.3804658-4-boris.brezillon@collabora.com Signed-off-by: Boris Brezillon <boris.brezillon@collabora.com>
author: Boris Brezillon <boris.brezillon@collabora.com> 2025-11-28 11:48:37 +0300
committer: Boris Brezillon <boris.brezillon@collabora.com> 2025-11-28 12:17:44 +0300
commit: 3c0a60195b37af83bbbaf223cd3a78945bace49e (patch)
tree: f5b81fa2dbf2a4b350fd5e948ba6b95a95f2b150
parent: 151df689fb75e46a6cafa9a2c407d44969f4bebe (diff)
download: linux-3c0a60195b37af83bbbaf223cd3a78945bace49e.tar.xz
1 files changed, 12 insertions, 7 deletions
diff --git a/drivers/gpu/drm/panthor/panthor_gpu.c b/drivers/gpu/drm/panthor/panthor_gpu.c
index 06b231b2460a..9cb5dee93212 100644
--- a/drivers/gpu/drm/panthor/panthor_gpu.c
+++ b/drivers/gpu/drm/panthor/panthor_gpu.c
@@ -289,38 +289,42 @@ int panthor_gpu_l2_power_on(struct panthor_device *ptdev)
 int panthor_gpu_flush_caches(struct panthor_device *ptdev,
 			     u32 l2, u32 lsc, u32 other)
 {
-	bool timedout = false;
 	unsigned long flags;
+	int ret = 0;
 
 	/* Serialize cache flush operations. */
 	guard(mutex)(&ptdev->gpu->cache_flush_lock);
 
 	spin_lock_irqsave(&ptdev->gpu->reqs_lock, flags);
-	if (!drm_WARN_ON(&ptdev->base,
-			 ptdev->gpu->pending_reqs & GPU_IRQ_CLEAN_CACHES_COMPLETED)) {
+	if (!(ptdev->gpu->pending_reqs & GPU_IRQ_CLEAN_CACHES_COMPLETED)) {
 		ptdev->gpu->pending_reqs |= GPU_IRQ_CLEAN_CACHES_COMPLETED;
 		gpu_write(ptdev, GPU_CMD, GPU_FLUSH_CACHES(l2, lsc, other));
+	} else {
+		ret = -EIO;
 	}
 	spin_unlock_irqrestore(&ptdev->gpu->reqs_lock, flags);
 
+	if (ret)
+		return ret;
+
 	if (!wait_event_timeout(ptdev->gpu->reqs_acked,
 				!(ptdev->gpu->pending_reqs & GPU_IRQ_CLEAN_CACHES_COMPLETED),
 				msecs_to_jiffies(100))) {
 		spin_lock_irqsave(&ptdev->gpu->reqs_lock, flags);
 		if ((ptdev->gpu->pending_reqs & GPU_IRQ_CLEAN_CACHES_COMPLETED) != 0 &&
 		    !(gpu_read(ptdev, GPU_INT_RAWSTAT) & GPU_IRQ_CLEAN_CACHES_COMPLETED))
-			timedout = true;
+			ret = -ETIMEDOUT;
 		else
 			ptdev->gpu->pending_reqs &= ~GPU_IRQ_CLEAN_CACHES_COMPLETED;
 		spin_unlock_irqrestore(&ptdev->gpu->reqs_lock, flags);
 	}
 
-	if (timedout) {
+	if (ret) {
+		panthor_device_schedule_reset(ptdev);
 		drm_err(&ptdev->base, "Flush caches timeout");
-		return -ETIMEDOUT;
 	}
 
-	return 0;
+	return ret;
 }
 
 /**
@@ -360,6 +364,7 @@ int panthor_gpu_soft_reset(struct panthor_device *ptdev)
 		return -ETIMEDOUT;
 	}
 
+	ptdev->gpu->pending_reqs = 0;
 	return 0;
 }
author	Boris Brezillon <boris.brezillon@collabora.com>	2025-11-28 11:48:37 +0300
committer	Boris Brezillon <boris.brezillon@collabora.com>	2025-11-28 12:17:44 +0300
commit	3c0a60195b37af83bbbaf223cd3a78945bace49e (patch)
tree	f5b81fa2dbf2a4b350fd5e948ba6b95a95f2b150
parent	151df689fb75e46a6cafa9a2c407d44969f4bebe (diff)
download	linux-3c0a60195b37af83bbbaf223cd3a78945bace49e.tar.xz