1 files changed, 367 insertions, 107 deletions
diff --git a/drivers/gpu/drm/xe/xe_guc_ct.c b/drivers/gpu/drm/xe/xe_guc_ct.c
index 9c505d3517cd..8aeb1789805c 100644
--- a/drivers/gpu/drm/xe/xe_guc_ct.c
+++ b/drivers/gpu/drm/xe/xe_guc_ct.c
@@ -8,6 +8,7 @@
 #include <linux/bitfield.h>
 #include <linux/circ_buf.h>
 #include <linux/delay.h>
+#include <linux/fault-inject.h>
 
 #include <kunit/static_stub.h>
 
@@ -17,6 +18,7 @@
 #include "abi/guc_actions_sriov_abi.h"
 #include "abi/guc_klvs_abi.h"
 #include "xe_bo.h"
+#include "xe_devcoredump.h"
 #include "xe_device.h"
 #include "xe_gt.h"
 #include "xe_gt_pagefault.h"
@@ -25,12 +27,48 @@
 #include "xe_gt_sriov_pf_monitor.h"
 #include "xe_gt_tlb_invalidation.h"
 #include "xe_guc.h"
+#include "xe_guc_log.h"
 #include "xe_guc_relay.h"
 #include "xe_guc_submit.h"
 #include "xe_map.h"
 #include "xe_pm.h"
 #include "xe_trace_guc.h"
 
+#if IS_ENABLED(CONFIG_DRM_XE_DEBUG)
+enum {
+	/* Internal states, not error conditions */
+	CT_DEAD_STATE_REARM,			/* 0x0001 */
+	CT_DEAD_STATE_CAPTURE,			/* 0x0002 */
+
+	/* Error conditions */
+	CT_DEAD_SETUP,				/* 0x0004 */
+	CT_DEAD_H2G_WRITE,			/* 0x0008 */
+	CT_DEAD_H2G_HAS_ROOM,			/* 0x0010 */
+	CT_DEAD_G2H_READ,			/* 0x0020 */
+	CT_DEAD_G2H_RECV,			/* 0x0040 */
+	CT_DEAD_G2H_RELEASE,			/* 0x0080 */
+	CT_DEAD_DEADLOCK,			/* 0x0100 */
+	CT_DEAD_PROCESS_FAILED,			/* 0x0200 */
+	CT_DEAD_FAST_G2H,			/* 0x0400 */
+	CT_DEAD_PARSE_G2H_RESPONSE,		/* 0x0800 */
+	CT_DEAD_PARSE_G2H_UNKNOWN,		/* 0x1000 */
+	CT_DEAD_PARSE_G2H_ORIGIN,		/* 0x2000 */
+	CT_DEAD_PARSE_G2H_TYPE,			/* 0x4000 */
+};
+
+static void ct_dead_worker_func(struct work_struct *w);
+static void ct_dead_capture(struct xe_guc_ct *ct, struct guc_ctb *ctb, u32 reason_code);
+
+#define CT_DEAD(ct, ctb, reason_code)		ct_dead_capture((ct), (ctb), CT_DEAD_##reason_code)
+#else
+#define CT_DEAD(ct, ctb, reason)			\
+	do {						\
+		struct guc_ctb *_ctb = (ctb);		\
+		if (_ctb)				\
+			_ctb->info.broken = true;	\
+	} while (0)
+#endif
+
 /* Used when a CT send wants to block and / or receive data */
 struct g2h_fence {
 	u32 *response_buffer;
@@ -175,14 +213,18 @@ int xe_guc_ct_init(struct xe_guc_ct *ct)
 
 	xe_gt_assert(gt, !(guc_ct_size() % PAGE_SIZE));
 
-	ct->g2h_wq = alloc_ordered_workqueue("xe-g2h-wq", 0);
+	ct->g2h_wq = alloc_ordered_workqueue("xe-g2h-wq", WQ_MEM_RECLAIM);
 	if (!ct->g2h_wq)
 		return -ENOMEM;
 
 	spin_lock_init(&ct->fast_lock);
 	xa_init(&ct->fence_lookup);
 	INIT_WORK(&ct->g2h_worker, g2h_worker_func);
-	INIT_DELAYED_WORK(&ct->safe_mode_worker,  safe_mode_worker_func);
+	INIT_DELAYED_WORK(&ct->safe_mode_worker, safe_mode_worker_func);
+#if IS_ENABLED(CONFIG_DRM_XE_DEBUG)
+	spin_lock_init(&ct->dead.lock);
+	INIT_WORK(&ct->dead.worker, ct_dead_worker_func);
+#endif
 	init_waitqueue_head(&ct->wq);
 	init_waitqueue_head(&ct->g2h_fence_wq);
 
@@ -209,6 +251,7 @@ int xe_guc_ct_init(struct xe_guc_ct *ct)
 	ct->state = XE_GUC_CT_STATE_DISABLED;
 	return 0;
 }
+ALLOW_ERROR_INJECTION(xe_guc_ct_init, ERRNO); /* See xe_pci_probe() */
 
 #define desc_read(xe_, guc_ctb__, field_)			\
 	xe_map_rd_field(xe_, &guc_ctb__->desc, 0,		\
@@ -395,6 +438,7 @@ int xe_guc_ct_enable(struct xe_guc_ct *ct)
 
 	xe_gt_assert(gt, !xe_guc_ct_enabled(ct));
 
+	xe_map_memset(xe, &ct->bo->vmap, 0, 0, ct->bo->size);
 	guc_ct_ctb_h2g_init(xe, &ct->ctbs.h2g, &ct->bo->vmap);
 	guc_ct_ctb_g2h_init(xe, &ct->ctbs.g2h, &ct->bo->vmap);
 
@@ -419,10 +463,22 @@ int xe_guc_ct_enable(struct xe_guc_ct *ct)
 	if (ct_needs_safe_mode(ct))
 		ct_enter_safe_mode(ct);
 
+#if IS_ENABLED(CONFIG_DRM_XE_DEBUG)
+	/*
+	 * The CT has now been reset so the dumper can be re-armed
+	 * after any existing dead state has been dumped.
+	 */
+	spin_lock_irq(&ct->dead.lock);
+	if (ct->dead.reason)
+		ct->dead.reason |= (1 << CT_DEAD_STATE_REARM);
+	spin_unlock_irq(&ct->dead.lock);
+#endif
+
 	return 0;
 
 err_out:
 	xe_gt_err(gt, "Failed to enable GuC CT (%pe)\n", ERR_PTR(err));
+	CT_DEAD(ct, NULL, SETUP);
 
 	return err;
 }
@@ -466,6 +522,19 @@ static bool h2g_has_room(struct xe_guc_ct *ct, u32 cmd_len)
 
 	if (cmd_len > h2g->info.space) {
 		h2g->info.head = desc_read(ct_to_xe(ct), h2g, head);
+
+		if (h2g->info.head > h2g->info.size) {
+			struct xe_device *xe = ct_to_xe(ct);
+			u32 desc_status = desc_read(xe, h2g, status);
+
+			desc_write(xe, h2g, status, desc_status | GUC_CTB_STATUS_OVERFLOW);
+
+			xe_gt_err(ct_to_gt(ct), "CT: invalid head offset %u >= %u)\n",
+				  h2g->info.head, h2g->info.size);
+			CT_DEAD(ct, h2g, H2G_HAS_ROOM);
+			return false;
+		}
+
 		h2g->info.space = CIRC_SPACE(h2g->info.tail, h2g->info.head,
 					     h2g->info.size) -
 				  h2g->info.resv_space;
@@ -521,10 +590,24 @@ static void __g2h_reserve_space(struct xe_guc_ct *ct, u32 g2h_len, u32 num_g2h)
 
 static void __g2h_release_space(struct xe_guc_ct *ct, u32 g2h_len)
 {
+	bool bad = false;
+
 	lockdep_assert_held(&ct->fast_lock);
-	xe_gt_assert(ct_to_gt(ct), ct->ctbs.g2h.info.space + g2h_len <=
-		     ct->ctbs.g2h.info.size - ct->ctbs.g2h.info.resv_space);
-	xe_gt_assert(ct_to_gt(ct), ct->g2h_outstanding);
+
+	bad = ct->ctbs.g2h.info.space + g2h_len >
+		     ct->ctbs.g2h.info.size - ct->ctbs.g2h.info.resv_space;
+	bad |= !ct->g2h_outstanding;
+
+	if (bad) {
+		xe_gt_err(ct_to_gt(ct), "Invalid G2H release: %d + %d vs %d - %d -> %d vs %d, outstanding = %d!\n",
+			  ct->ctbs.g2h.info.space, g2h_len,
+			  ct->ctbs.g2h.info.size, ct->ctbs.g2h.info.resv_space,
+			  ct->ctbs.g2h.info.space + g2h_len,
+			  ct->ctbs.g2h.info.size - ct->ctbs.g2h.info.resv_space,
+			  ct->g2h_outstanding);
+		CT_DEAD(ct, &ct->ctbs.g2h, G2H_RELEASE);
+		return;
+	}
 
 	ct->ctbs.g2h.info.space += g2h_len;
 	if (!--ct->g2h_outstanding)
@@ -551,12 +634,43 @@ static int h2g_write(struct xe_guc_ct *ct, const u32 *action, u32 len,
 	u32 full_len;
 	struct iosys_map map = IOSYS_MAP_INIT_OFFSET(&h2g->cmds,
 							 tail * sizeof(u32));
+	u32 desc_status;
 
 	full_len = len + GUC_CTB_HDR_LEN;
 
 	lockdep_assert_held(&ct->lock);
 	xe_gt_assert(gt, full_len <= GUC_CTB_MSG_MAX_LEN);
-	xe_gt_assert(gt, tail <= h2g->info.size);
+
+	desc_status = desc_read(xe, h2g, status);
+	if (desc_status) {
+		xe_gt_err(gt, "CT write: non-zero status: %u\n", desc_status);
+		goto corrupted;
+	}
+
+	if (IS_ENABLED(CONFIG_DRM_XE_DEBUG)) {
+		u32 desc_tail = desc_read(xe, h2g, tail);
+		u32 desc_head = desc_read(xe, h2g, head);
+
+		if (tail != desc_tail) {
+			desc_write(xe, h2g, status, desc_status | GUC_CTB_STATUS_MISMATCH);
+			xe_gt_err(gt, "CT write: tail was modified %u != %u\n", desc_tail, tail);
+			goto corrupted;
+		}
+
+		if (tail > h2g->info.size) {
+			desc_write(xe, h2g, status, desc_status | GUC_CTB_STATUS_OVERFLOW);
+			xe_gt_err(gt, "CT write: tail out of range: %u vs %u\n",
+				  tail, h2g->info.size);
+			goto corrupted;
+		}
+
+		if (desc_head >= h2g->info.size) {
+			desc_write(xe, h2g, status, desc_status | GUC_CTB_STATUS_OVERFLOW);
+			xe_gt_err(gt, "CT write: invalid head offset %u >= %u)\n",
+				  desc_head, h2g->info.size);
+			goto corrupted;
+		}
+	}
 
 	/* Command will wrap, zero fill (NOPs), return and check credits again */
 	if (tail + full_len > h2g->info.size) {
@@ -609,6 +723,10 @@ static int h2g_write(struct xe_guc_ct *ct, const u32 *action, u32 len,
 			     desc_read(xe, h2g, head), h2g->info.tail);
 
 	return 0;
+
+corrupted:
+	CT_DEAD(ct, &ct->ctbs.h2g, H2G_WRITE);
+	return -EPIPE;
 }
 
 /*
@@ -716,7 +834,6 @@ static int guc_ct_send_locked(struct xe_guc_ct *ct, const u32 *action, u32 len,
 {
 	struct xe_device *xe = ct_to_xe(ct);
 	struct xe_gt *gt = ct_to_gt(ct);
-	struct drm_printer p = xe_gt_info_printer(gt);
 	unsigned int sleep_period_ms = 1;
 	int ret;
 
@@ -769,8 +886,13 @@ try_again:
 			goto broken;
 #undef g2h_avail
 
-		if (dequeue_one_g2h(ct) < 0)
+		ret = dequeue_one_g2h(ct);
+		if (ret < 0) {
+			if (ret != -ECANCELED)
+				xe_gt_err(ct_to_gt(ct), "CTB receive failed (%pe)",
+					  ERR_PTR(ret));
 			goto broken;
+		}
 
 		goto try_again;
 	}
@@ -779,8 +901,7 @@ try_again:
 
 broken:
 	xe_gt_err(gt, "No forward process on H2G, reset required\n");
-	xe_guc_ct_print(ct, &p, true);
-	ct->ctbs.h2g.info.broken = true;
+	CT_DEAD(ct, &ct->ctbs.h2g, DEADLOCK);
 
 	return -EDEADLK;
 }
@@ -848,7 +969,7 @@ static bool retry_failure(struct xe_guc_ct *ct, int ret)
 #define ct_alive(ct)	\
 	(xe_guc_ct_enabled(ct) && !ct->ctbs.h2g.info.broken && \
 	 !ct->ctbs.g2h.info.broken)
-	if (!wait_event_interruptible_timeout(ct->wq, ct_alive(ct),  HZ * 5))
+	if (!wait_event_interruptible_timeout(ct->wq, ct_alive(ct), HZ * 5))
 		return false;
 #undef ct_alive
 
@@ -890,7 +1011,7 @@ retry_same_fence:
 			goto retry_same_fence;
 
 		if (!g2h_fence_needs_alloc(&g2h_fence))
-			xa_erase_irq(&ct->fence_lookup, g2h_fence.seqno);
+			xa_erase(&ct->fence_lookup, g2h_fence.seqno);
 
 		return ret;
 	}
@@ -916,7 +1037,7 @@ retry_same_fence:
 	if (!ret) {
 		xe_gt_err(gt, "Timed out wait for G2H, fence %u, action %04x, done %s",
 			  g2h_fence.seqno, action[0], str_yes_no(g2h_fence.done));
-		xa_erase_irq(&ct->fence_lookup, g2h_fence.seqno);
+		xa_erase(&ct->fence_lookup, g2h_fence.seqno);
 		mutex_unlock(&ct->lock);
 		return -ETIME;
 	}
@@ -1028,6 +1149,7 @@ static int parse_g2h_response(struct xe_guc_ct *ct, u32 *msg, u32 len)
 		else
 			xe_gt_err(gt, "unexpected response %u for FAST_REQ H2G fence 0x%x!\n",
 				  type, fence);
+		CT_DEAD(ct, NULL, PARSE_G2H_RESPONSE);
 
 		return -EPROTO;
 	}
@@ -1035,6 +1157,7 @@ static int parse_g2h_response(struct xe_guc_ct *ct, u32 *msg, u32 len)
 	g2h_fence = xa_erase(&ct->fence_lookup, fence);
 	if (unlikely(!g2h_fence)) {
 		/* Don't tear down channel, as send could've timed out */
+		/* CT_DEAD(ct, NULL, PARSE_G2H_UNKNOWN); */
 		xe_gt_warn(gt, "G2H fence (%u) not found!\n", fence);
 		g2h_release_space(ct, GUC_CTB_HXG_MSG_MAX_LEN);
 		return 0;
@@ -1079,7 +1202,7 @@ static int parse_g2h_msg(struct xe_guc_ct *ct, u32 *msg, u32 len)
 	if (unlikely(origin != GUC_HXG_ORIGIN_GUC)) {
 		xe_gt_err(gt, "G2H channel broken on read, origin=%u, reset required\n",
 			  origin);
-		ct->ctbs.g2h.info.broken = true;
+		CT_DEAD(ct, &ct->ctbs.g2h, PARSE_G2H_ORIGIN);
 
 		return -EPROTO;
 	}
@@ -1097,7 +1220,7 @@ static int parse_g2h_msg(struct xe_guc_ct *ct, u32 *msg, u32 len)
 	default:
 		xe_gt_err(gt, "G2H channel broken on read, type=%u, reset required\n",
 			  type);
-		ct->ctbs.g2h.info.broken = true;
+		CT_DEAD(ct, &ct->ctbs.g2h, PARSE_G2H_TYPE);
 
 		ret = -EOPNOTSUPP;
 	}
@@ -1140,6 +1263,8 @@ static int process_g2h_msg(struct xe_guc_ct *ct, u32 *msg, u32 len)
 		/* Selftest only at the moment */
 		break;
 	case XE_GUC_ACTION_STATE_CAPTURE_NOTIFICATION:
+		ret = xe_guc_error_capture_handler(guc, payload, adj_len);
+		break;
 	case XE_GUC_ACTION_NOTIFY_FLUSH_LOG_BUFFER_TO_FILE:
 		/* FIXME: Handle this */
 		break;
@@ -1174,9 +1299,11 @@ static int process_g2h_msg(struct xe_guc_ct *ct, u32 *msg, u32 len)
 		xe_gt_err(gt, "unexpected G2H action 0x%04x\n", action);
 	}
 
-	if (ret)
+	if (ret) {
 		xe_gt_err(gt, "G2H action 0x%04x failed (%pe)\n",
 			  action, ERR_PTR(ret));
+		CT_DEAD(ct, NULL, PROCESS_FAILED);
+	}
 
 	return 0;
 }
@@ -1186,7 +1313,7 @@ static int g2h_read(struct xe_guc_ct *ct, u32 *msg, bool fast_path)
 	struct xe_device *xe = ct_to_xe(ct);
 	struct xe_gt *gt = ct_to_gt(ct);
 	struct guc_ctb *g2h = &ct->ctbs.g2h;
-	u32 tail, head, len;
+	u32 tail, head, len, desc_status;
 	s32 avail;
 	u32 action;
 	u32 *hxg;
@@ -1205,6 +1332,63 @@ static int g2h_read(struct xe_guc_ct *ct, u32 *msg, bool fast_path)
 
 	xe_gt_assert(gt, xe_guc_ct_enabled(ct));
 
+	desc_status = desc_read(xe, g2h, status);
+	if (desc_status) {
+		if (desc_status & GUC_CTB_STATUS_DISABLED) {
+			/*
+			 * Potentially valid if a CLIENT_RESET request resulted in
+			 * contexts/engines being reset. But should never happen as
+			 * no contexts should be active when CLIENT_RESET is sent.
+			 */
+			xe_gt_err(gt, "CT read: unexpected G2H after GuC has stopped!\n");
+			desc_status &= ~GUC_CTB_STATUS_DISABLED;
+		}
+
+		if (desc_status) {
+			xe_gt_err(gt, "CT read: non-zero status: %u\n", desc_status);
+			goto corrupted;
+		}
+	}
+
+	if (IS_ENABLED(CONFIG_DRM_XE_DEBUG)) {
+		u32 desc_tail = desc_read(xe, g2h, tail);
+		/*
+		u32 desc_head = desc_read(xe, g2h, head);
+
+		 * info.head and desc_head are updated back-to-back at the end of
+		 * this function and nowhere else. Hence, they cannot be different
+		 * unless two g2h_read calls are running concurrently. Which is not
+		 * possible because it is guarded by ct->fast_lock. And yet, some
+		 * discrete platforms are reguarly hitting this error :(.
+		 *
+		 * desc_head rolling backwards shouldn't cause any noticeable
+		 * problems - just a delay in GuC being allowed to proceed past that
+		 * point in the queue. So for now, just disable the error until it
+		 * can be root caused.
+		 *
+		if (g2h->info.head != desc_head) {
+			desc_write(xe, g2h, status, desc_status | GUC_CTB_STATUS_MISMATCH);
+			xe_gt_err(gt, "CT read: head was modified %u != %u\n",
+				  desc_head, g2h->info.head);
+			goto corrupted;
+		}
+		 */
+
+		if (g2h->info.head > g2h->info.size) {
+			desc_write(xe, g2h, status, desc_status | GUC_CTB_STATUS_OVERFLOW);
+			xe_gt_err(gt, "CT read: head out of range: %u vs %u\n",
+				  g2h->info.head, g2h->info.size);
+			goto corrupted;
+		}
+
+		if (desc_tail >= g2h->info.size) {
+			desc_write(xe, g2h, status, desc_status | GUC_CTB_STATUS_OVERFLOW);
+			xe_gt_err(gt, "CT read: invalid tail offset %u >= %u)\n",
+				  desc_tail, g2h->info.size);
+			goto corrupted;
+		}
+	}
+
 	/* Calculate DW available to read */
 	tail = desc_read(xe, g2h, tail);
 	avail = tail - g2h->info.head;
@@ -1221,9 +1405,7 @@ static int g2h_read(struct xe_guc_ct *ct, u32 *msg, bool fast_path)
 	if (len > avail) {
 		xe_gt_err(gt, "G2H channel broken on read, avail=%d, len=%d, reset required\n",
 			  avail, len);
-		g2h->info.broken = true;
-
-		return -EPROTO;
+		goto corrupted;
 	}
 
 	head = (g2h->info.head + 1) % g2h->info.size;
@@ -1269,6 +1451,10 @@ static int g2h_read(struct xe_guc_ct *ct, u32 *msg, bool fast_path)
 			     action, len, g2h->info.head, tail);
 
 	return len;
+
+corrupted:
+	CT_DEAD(ct, &ct->ctbs.g2h, G2H_READ);
+	return -EPROTO;
 }
 
 static void g2h_fast_path(struct xe_guc_ct *ct, u32 *msg, u32 len)
@@ -1295,9 +1481,11 @@ static void g2h_fast_path(struct xe_guc_ct *ct, u32 *msg, u32 len)
 		xe_gt_warn(gt, "NOT_POSSIBLE");
 	}
 
-	if (ret)
+	if (ret) {
 		xe_gt_err(gt, "G2H action 0x%04x failed (%pe)\n",
 			  action, ERR_PTR(ret));
+		CT_DEAD(ct, NULL, FAST_G2H);
+	}
 }
 
 /**
@@ -1357,7 +1545,6 @@ static int dequeue_one_g2h(struct xe_guc_ct *ct)
 
 static void receive_g2h(struct xe_guc_ct *ct)
 {
-	struct xe_gt *gt = ct_to_gt(ct);
 	bool ongoing;
 	int ret;
 
@@ -1394,9 +1581,8 @@ static void receive_g2h(struct xe_guc_ct *ct)
 		mutex_unlock(&ct->lock);
 
 		if (unlikely(ret == -EPROTO || ret == -EOPNOTSUPP)) {
-			struct drm_printer p = xe_gt_info_printer(gt);
-
-			xe_guc_ct_print(ct, &p, false);
+			xe_gt_err(ct_to_gt(ct), "CT dequeue failed: %d", ret);
+			CT_DEAD(ct, NULL, G2H_RECV);
 			kick_reset(ct);
 		}
 	} while (ret == 1);
@@ -1412,49 +1598,34 @@ static void g2h_worker_func(struct work_struct *w)
 	receive_g2h(ct);
 }
 
-static void guc_ctb_snapshot_capture(struct xe_device *xe, struct guc_ctb *ctb,
-				     struct guc_ctb_snapshot *snapshot,
-				     bool atomic)
+static struct xe_guc_ct_snapshot *guc_ct_snapshot_alloc(struct xe_guc_ct *ct, bool atomic,
+							bool want_ctb)
 {
-	u32 head, tail;
-
-	xe_map_memcpy_from(xe, &snapshot->desc, &ctb->desc, 0,
-			   sizeof(struct guc_ct_buffer_desc));
-	memcpy(&snapshot->info, &ctb->info, sizeof(struct guc_ctb_info));
+	struct xe_guc_ct_snapshot *snapshot;
 
-	snapshot->cmds = kmalloc_array(ctb->info.size, sizeof(u32),
-				       atomic ? GFP_ATOMIC : GFP_KERNEL);
+	snapshot = kzalloc(sizeof(*snapshot), atomic ? GFP_ATOMIC : GFP_KERNEL);
+	if (!snapshot)
+		return NULL;
 
-	if (!snapshot->cmds) {
-		drm_err(&xe->drm, "Skipping CTB commands snapshot. Only CTB info will be available.\n");
-		return;
+	if (ct->bo && want_ctb) {
+		snapshot->ctb_size = ct->bo->size;
+		snapshot->ctb = kmalloc(snapshot->ctb_size, atomic ? GFP_ATOMIC : GFP_KERNEL);
 	}
 
-	head = snapshot->desc.head;
-	tail = snapshot->desc.tail;
-
-	if (head != tail) {
-		struct iosys_map map =
-			IOSYS_MAP_INIT_OFFSET(&ctb->cmds, head * sizeof(u32));
-
-		while (head != tail) {
-			snapshot->cmds[head] = xe_map_rd(xe, &map, 0, u32);
-			++head;
-			if (head == ctb->info.size) {
-				head = 0;
-				map = ctb->cmds;
-			} else {
-				iosys_map_incr(&map, sizeof(u32));
-			}
-		}
-	}
+	return snapshot;
+}
+
+static void guc_ctb_snapshot_capture(struct xe_device *xe, struct guc_ctb *ctb,
+				     struct guc_ctb_snapshot *snapshot)
+{
+	xe_map_memcpy_from(xe, &snapshot->desc, &ctb->desc, 0,
+			   sizeof(struct guc_ct_buffer_desc));
+	memcpy(&snapshot->info, &ctb->info, sizeof(struct guc_ctb_info));
 }
 
 static void guc_ctb_snapshot_print(struct guc_ctb_snapshot *snapshot,
 				   struct drm_printer *p)
 {
-	u32 head, tail;
-
 	drm_printf(p, "\tsize: %d\n", snapshot->info.size);
 	drm_printf(p, "\tresv_space: %d\n", snapshot->info.resv_space);
 	drm_printf(p, "\thead: %d\n", snapshot->info.head);
@@ -1464,63 +1635,46 @@ static void guc_ctb_snapshot_print(struct guc_ctb_snapshot *snapshot,
 	drm_printf(p, "\thead (memory): %d\n", snapshot->desc.head);
 	drm_printf(p, "\ttail (memory): %d\n", snapshot->desc.tail);
 	drm_printf(p, "\tstatus (memory): 0x%x\n", snapshot->desc.status);
+}
 
-	if (!snapshot->cmds)
-		return;
+static struct xe_guc_ct_snapshot *guc_ct_snapshot_capture(struct xe_guc_ct *ct, bool atomic,
+							  bool want_ctb)
+{
+	struct xe_device *xe = ct_to_xe(ct);
+	struct xe_guc_ct_snapshot *snapshot;
 
-	head = snapshot->desc.head;
-	tail = snapshot->desc.tail;
+	snapshot = guc_ct_snapshot_alloc(ct, atomic, want_ctb);
+	if (!snapshot) {
+		xe_gt_err(ct_to_gt(ct), "Skipping CTB snapshot entirely.\n");
+		return NULL;
+	}
 
-	while (head != tail) {
-		drm_printf(p, "\tcmd[%d]: 0x%08x\n", head,
-			   snapshot->cmds[head]);
-		++head;
-		if (head == snapshot->info.size)
-			head = 0;
+	if (xe_guc_ct_enabled(ct) || ct->state == XE_GUC_CT_STATE_STOPPED) {
+		snapshot->ct_enabled = true;
+		snapshot->g2h_outstanding = READ_ONCE(ct->g2h_outstanding);
+		guc_ctb_snapshot_capture(xe, &ct->ctbs.h2g, &snapshot->h2g);
+		guc_ctb_snapshot_capture(xe, &ct->ctbs.g2h, &snapshot->g2h);
 	}
-}
 
-static void guc_ctb_snapshot_free(struct guc_ctb_snapshot *snapshot)
-{
-	kfree(snapshot->cmds);
+	if (ct->bo && snapshot->ctb)
+		xe_map_memcpy_from(xe, snapshot->ctb, &ct->bo->vmap, 0, snapshot->ctb_size);
+
+	return snapshot;
 }
 
 /**
  * xe_guc_ct_snapshot_capture - Take a quick snapshot of the CT state.
  * @ct: GuC CT object.
- * @atomic: Boolean to indicate if this is called from atomic context like
- * reset or CTB handler or from some regular path like debugfs.
  *
  * This can be printed out in a later stage like during dev_coredump
- * analysis.
+ * analysis. This is safe to be called during atomic context.
  *
  * Returns: a GuC CT snapshot object that must be freed by the caller
  * by using `xe_guc_ct_snapshot_free`.
  */
-struct xe_guc_ct_snapshot *xe_guc_ct_snapshot_capture(struct xe_guc_ct *ct,
-						      bool atomic)
+struct xe_guc_ct_snapshot *xe_guc_ct_snapshot_capture(struct xe_guc_ct *ct)
 {
-	struct xe_device *xe = ct_to_xe(ct);
-	struct xe_guc_ct_snapshot *snapshot;
-
-	snapshot = kzalloc(sizeof(*snapshot),
-			   atomic ? GFP_ATOMIC : GFP_KERNEL);
-
-	if (!snapshot) {
-		drm_err(&xe->drm, "Skipping CTB snapshot entirely.\n");
-		return NULL;
-	}
-
-	if (xe_guc_ct_enabled(ct) || ct->state == XE_GUC_CT_STATE_STOPPED) {
-		snapshot->ct_enabled = true;
-		snapshot->g2h_outstanding = READ_ONCE(ct->g2h_outstanding);
-		guc_ctb_snapshot_capture(xe, &ct->ctbs.h2g,
-					 &snapshot->h2g, atomic);
-		guc_ctb_snapshot_capture(xe, &ct->ctbs.g2h,
-					 &snapshot->g2h, atomic);
-	}
-
-	return snapshot;
+	return guc_ct_snapshot_capture(ct, true, true);
 }
 
 /**
@@ -1540,11 +1694,13 @@ void xe_guc_ct_snapshot_print(struct xe_guc_ct_snapshot *snapshot,
 		drm_puts(p, "H2G CTB (all sizes in DW):\n");
 		guc_ctb_snapshot_print(&snapshot->h2g, p);
 
-		drm_puts(p, "\nG2H CTB (all sizes in DW):\n");
+		drm_puts(p, "G2H CTB (all sizes in DW):\n");
 		guc_ctb_snapshot_print(&snapshot->g2h, p);
-
 		drm_printf(p, "\tg2h outstanding: %d\n",
 			   snapshot->g2h_outstanding);
+
+		if (snapshot->ctb)
+			xe_print_blob_ascii85(p, "CTB data", snapshot->ctb, 0, snapshot->ctb_size);
 	} else {
 		drm_puts(p, "CT disabled\n");
 	}
@@ -1562,8 +1718,7 @@ void xe_guc_ct_snapshot_free(struct xe_guc_ct_snapshot *snapshot)
 	if (!snapshot)
 		return;
 
-	guc_ctb_snapshot_free(&snapshot->h2g);
-	guc_ctb_snapshot_free(&snapshot->g2h);
+	kfree(snapshot->ctb);
 	kfree(snapshot);
 }
 
@@ -1571,16 +1726,121 @@ void xe_guc_ct_snapshot_free(struct xe_guc_ct_snapshot *snapshot)
  * xe_guc_ct_print - GuC CT Print.
  * @ct: GuC CT.
  * @p: drm_printer where it will be printed out.
- * @atomic: Boolean to indicate if this is called from atomic context like
- * reset or CTB handler or from some regular path like debugfs.
+ * @want_ctb: Should the full CTB content be dumped (vs just the headers)
  *
- * This function quickly capture a snapshot and immediately print it out.
+ * This function will quickly capture a snapshot of the CT state
+ * and immediately print it out.
  */
-void xe_guc_ct_print(struct xe_guc_ct *ct, struct drm_printer *p, bool atomic)
+void xe_guc_ct_print(struct xe_guc_ct *ct, struct drm_printer *p, bool want_ctb)
 {
 	struct xe_guc_ct_snapshot *snapshot;
 
-	snapshot = xe_guc_ct_snapshot_capture(ct, atomic);
+	snapshot = guc_ct_snapshot_capture(ct, false, want_ctb);
 	xe_guc_ct_snapshot_print(snapshot, p);
 	xe_guc_ct_snapshot_free(snapshot);
 }
+
+#if IS_ENABLED(CONFIG_DRM_XE_DEBUG)
+static void ct_dead_capture(struct xe_guc_ct *ct, struct guc_ctb *ctb, u32 reason_code)
+{
+	struct xe_guc_log_snapshot *snapshot_log;
+	struct xe_guc_ct_snapshot *snapshot_ct;
+	struct xe_guc *guc = ct_to_guc(ct);
+	unsigned long flags;
+	bool have_capture;
+
+	if (ctb)
+		ctb->info.broken = true;
+
+	/* Ignore further errors after the first dump until a reset */
+	if (ct->dead.reported)
+		return;
+
+	spin_lock_irqsave(&ct->dead.lock, flags);
+
+	/* And only capture one dump at a time */
+	have_capture = ct->dead.reason & (1 << CT_DEAD_STATE_CAPTURE);
+	ct->dead.reason |= (1 << reason_code) |
+			   (1 << CT_DEAD_STATE_CAPTURE);
+
+	spin_unlock_irqrestore(&ct->dead.lock, flags);
+
+	if (have_capture)
+		return;
+
+	snapshot_log = xe_guc_log_snapshot_capture(&guc->log, true);
+	snapshot_ct = xe_guc_ct_snapshot_capture((ct));
+
+	spin_lock_irqsave(&ct->dead.lock, flags);
+
+	if (ct->dead.snapshot_log || ct->dead.snapshot_ct) {
+		xe_gt_err(ct_to_gt(ct), "Got unexpected dead CT capture!\n");
+		xe_guc_log_snapshot_free(snapshot_log);
+		xe_guc_ct_snapshot_free(snapshot_ct);
+	} else {
+		ct->dead.snapshot_log = snapshot_log;
+		ct->dead.snapshot_ct = snapshot_ct;
+	}
+
+	spin_unlock_irqrestore(&ct->dead.lock, flags);
+
+	queue_work(system_unbound_wq, &(ct)->dead.worker);
+}
+
+static void ct_dead_print(struct xe_dead_ct *dead)
+{
+	struct xe_guc_ct *ct = container_of(dead, struct xe_guc_ct, dead);
+	struct xe_device *xe = ct_to_xe(ct);
+	struct xe_gt *gt = ct_to_gt(ct);
+	static int g_count;
+	struct drm_printer ip = xe_gt_info_printer(gt);
+	struct drm_printer lp = drm_line_printer(&ip, "Capture", ++g_count);
+
+	if (!dead->reason) {
+		xe_gt_err(gt, "CTB is dead for no reason!?\n");
+		return;
+	}
+
+	drm_printf(&lp, "CTB is dead - reason=0x%X\n", dead->reason);
+
+	/* Can't generate a genuine core dump at this point, so just do the good bits */
+	drm_puts(&lp, "**** Xe Device Coredump ****\n");
+	xe_device_snapshot_print(xe, &lp);
+
+	drm_printf(&lp, "**** GT #%d ****\n", gt->info.id);
+	drm_printf(&lp, "\tTile: %d\n", gt->tile->id);
+
+	drm_puts(&lp, "**** GuC Log ****\n");
+	xe_guc_log_snapshot_print(dead->snapshot_log, &lp);
+
+	drm_puts(&lp, "**** GuC CT ****\n");
+	xe_guc_ct_snapshot_print(dead->snapshot_ct, &lp);
+
+	drm_puts(&lp, "Done.\n");
+}
+
+static void ct_dead_worker_func(struct work_struct *w)
+{
+	struct xe_guc_ct *ct = container_of(w, struct xe_guc_ct, dead.worker);
+
+	if (!ct->dead.reported) {
+		ct->dead.reported = true;
+		ct_dead_print(&ct->dead);
+	}
+
+	spin_lock_irq(&ct->dead.lock);
+
+	xe_guc_log_snapshot_free(ct->dead.snapshot_log);
+	ct->dead.snapshot_log = NULL;
+	xe_guc_ct_snapshot_free(ct->dead.snapshot_ct);
+	ct->dead.snapshot_ct = NULL;
+
+	if (ct->dead.reason & (1 << CT_DEAD_STATE_REARM)) {
+		/* A reset has occurred so re-arm the error reporting */
+		ct->dead.reason = 0;
+		ct->dead.reported = false;
+	}
+
+	spin_unlock_irq(&ct->dead.lock);
+}
+#endif