summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNiranjana Vishwanathapura <niranjana.vishwanathapura@intel.com>2025-12-11 04:03:01 +0300
committerNiranjana Vishwanathapura <niranjana.vishwanathapura@intel.com>2025-12-12 06:21:53 +0300
commit8b81c76885e8f61681cf4c7d6d0ce816809e3b2f (patch)
treeb14275aee662c0cbdc0c9c288144ef51453553c2
parentbb9343f122add786c57a6e8865209a9c6671bc9b (diff)
downloadlinux-8b81c76885e8f61681cf4c7d6d0ce816809e3b2f.tar.xz
drm/xe/multi_queue: Teardown group upon job timeout
Upon a job timeout, teardown the multi-queue group by triggering TDR on all queues of the multi-queue group and by skipping timeout checks in them. v5: Ban the group while triggering TDR for the guc reported errors Add FIXME in TDR to take multi-queue group off HW (Matt Brost) v6: Trigger cleanup of group only for multi-queue case Signed-off-by: Niranjana Vishwanathapura <niranjana.vishwanathapura@intel.com> Reviewed-by: Matthew Brost <matthew.brost@intel.com> Link: https://patch.msgid.link/20251211010249.1647839-32-niranjana.vishwanathapura@intel.com
-rw-r--r--drivers/gpu/drm/xe/xe_exec_queue_types.h2
-rw-r--r--drivers/gpu/drm/xe/xe_guc_submit.c23
2 files changed, 24 insertions, 1 deletion
diff --git a/drivers/gpu/drm/xe/xe_exec_queue_types.h b/drivers/gpu/drm/xe/xe_exec_queue_types.h
index 8a954ee62505..5fc516b0bb77 100644
--- a/drivers/gpu/drm/xe/xe_exec_queue_types.h
+++ b/drivers/gpu/drm/xe/xe_exec_queue_types.h
@@ -64,6 +64,8 @@ struct xe_exec_queue_group {
struct mutex list_lock;
/** @sync_pending: CGP_SYNC_DONE g2h response pending */
bool sync_pending;
+ /** @banned: Group banned */
+ bool banned;
};
/**
diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
index e8bde976e4c8..f678b806acaa 100644
--- a/drivers/gpu/drm/xe/xe_guc_submit.c
+++ b/drivers/gpu/drm/xe/xe_guc_submit.c
@@ -602,6 +602,8 @@ static void xe_guc_exec_queue_group_trigger_cleanup(struct xe_exec_queue *q)
xe_gt_assert(guc_to_gt(exec_queue_to_guc(q)),
xe_exec_queue_is_multi_queue(q));
+ /* Group banned, skip timeout check in TDR */
+ WRITE_ONCE(group->banned, true);
xe_guc_exec_queue_trigger_cleanup(primary);
mutex_lock(&group->list_lock);
@@ -617,6 +619,9 @@ static void xe_guc_exec_queue_reset_trigger_cleanup(struct xe_exec_queue *q)
struct xe_exec_queue_group *group = q->multi_queue.group;
struct xe_exec_queue *eq;
+ /* Group banned, skip timeout check in TDR */
+ WRITE_ONCE(group->banned, true);
+
set_exec_queue_reset(primary);
if (!exec_queue_banned(primary) && !exec_queue_check_timeout(primary))
xe_guc_exec_queue_trigger_cleanup(primary);
@@ -1487,6 +1492,19 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
exec_queue_killed_or_banned_or_wedged(q) ||
exec_queue_destroyed(q);
+ /* Skip timeout check if multi-queue group is banned */
+ if (xe_exec_queue_is_multi_queue(q) &&
+ READ_ONCE(q->multi_queue.group->banned))
+ skip_timeout_check = true;
+
+ /*
+ * FIXME: In multi-queue scenario, the TDR must ensure that the whole
+ * multi-queue group is off the HW before signaling the fences to avoid
+ * possible memory corruptions. This means disabling scheduling on the
+ * primary queue before or during the secondary queue's TDR. Need to
+ * implement this in least obtrusive way.
+ */
+
/*
* If devcoredump not captured and GuC capture for the job is not ready
* do manual capture first and decide later if we need to use it
@@ -1639,7 +1657,10 @@ trigger_reset:
xe_sched_add_pending_job(sched, job);
xe_sched_submission_start(sched);
- xe_guc_exec_queue_trigger_cleanup(q);
+ if (xe_exec_queue_is_multi_queue(q))
+ xe_guc_exec_queue_group_trigger_cleanup(q);
+ else
+ xe_guc_exec_queue_trigger_cleanup(q);
/* Mark all outstanding jobs as bad, thus completing them */
spin_lock(&sched->base.job_list_lock);