summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNiranjana Vishwanathapura <niranjana.vishwanathapura@intel.com>2025-12-11 04:03:01 +0300
committerNiranjana Vishwanathapura <niranjana.vishwanathapura@intel.com>2025-12-12 06:21:53 +0300
commit8b81c76885e8f61681cf4c7d6d0ce816809e3b2f (patch)
treeb14275aee662c0cbdc0c9c288144ef51453553c2
parentbb9343f122add786c57a6e8865209a9c6671bc9b (diff)
downloadlinux-8b81c76885e8f61681cf4c7d6d0ce816809e3b2f.tar.xz
drm/xe/multi_queue: Teardown group upon job timeout
Upon a job timeout, teardown the multi-queue group by triggering TDR on all queues of the multi-queue group and by skipping timeout checks in them. v5: Ban the group while triggering TDR for the guc reported errors Add FIXME in TDR to take multi-queue group off HW (Matt Brost) v6: Trigger cleanup of group only for multi-queue case Signed-off-by: Niranjana Vishwanathapura <niranjana.vishwanathapura@intel.com> Reviewed-by: Matthew Brost <matthew.brost@intel.com> Link: https://patch.msgid.link/20251211010249.1647839-32-niranjana.vishwanathapura@intel.com
-rw-r--r--drivers/gpu/drm/xe/xe_exec_queue_types.h2
-rw-r--r--drivers/gpu/drm/xe/xe_guc_submit.c23
2 files changed, 24 insertions, 1 deletion
diff --git a/drivers/gpu/drm/xe/xe_exec_queue_types.h b/drivers/gpu/drm/xe/xe_exec_queue_types.h
index 8a954ee62505..5fc516b0bb77 100644
--- a/drivers/gpu/drm/xe/xe_exec_queue_types.h
+++ b/drivers/gpu/drm/xe/xe_exec_queue_types.h
@@ -64,6 +64,8 @@ struct xe_exec_queue_group {
struct mutex list_lock;
/** @sync_pending: CGP_SYNC_DONE g2h response pending */
bool sync_pending;
+ /** @banned: Group banned */
+ bool banned;
};
/**
diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
index e8bde976e4c8..f678b806acaa 100644
--- a/drivers/gpu/drm/xe/xe_guc_submit.c
+++ b/drivers/gpu/drm/xe/xe_guc_submit.c
@@ -602,6 +602,8 @@ static void xe_guc_exec_queue_group_trigger_cleanup(struct xe_exec_queue *q)
xe_gt_assert(guc_to_gt(exec_queue_to_guc(q)),
xe_exec_queue_is_multi_queue(q));
+ /* Group banned, skip timeout check in TDR */
+ WRITE_ONCE(group->banned, true);
xe_guc_exec_queue_trigger_cleanup(primary);
mutex_lock(&group->list_lock);
@@ -617,6 +619,9 @@ static void xe_guc_exec_queue_reset_trigger_cleanup(struct xe_exec_queue *q)
struct xe_exec_queue_group *group = q->multi_queue.group;
struct xe_exec_queue *eq;
+ /* Group banned, skip timeout check in TDR */
+ WRITE_ONCE(group->banned, true);
+
set_exec_queue_reset(primary);
if (!exec_queue_banned(primary) && !exec_queue_check_timeout(primary))
xe_guc_exec_queue_trigger_cleanup(primary);
@@ -1487,6 +1492,19 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
exec_queue_killed_or_banned_or_wedged(q) ||
exec_queue_destroyed(q);
+ /* Skip timeout check if multi-queue group is banned */
+ if (xe_exec_queue_is_multi_queue(q) &&
+ READ_ONCE(q->multi_queue.group->banned))
+ skip_timeout_check = true;
+
+ /*
+ * FIXME: In multi-queue scenario, the TDR must ensure that the whole
+ * multi-queue group is off the HW before signaling the fences to avoid
+ * possible memory corruptions. This means disabling scheduling on the
+ * primary queue before or during the secondary queue's TDR. Need to
+ * implement this in least obtrusive way.
+ */
+
/*
* If devcoredump not captured and GuC capture for the job is not ready
* do manual capture first and decide later if we need to use it
@@ -1639,7 +1657,10 @@ trigger_reset:
xe_sched_add_pending_job(sched, job);
xe_sched_submission_start(sched);
- xe_guc_exec_queue_trigger_cleanup(q);
+ if (xe_exec_queue_is_multi_queue(q))
+ xe_guc_exec_queue_group_trigger_cleanup(q);
+ else
+ xe_guc_exec_queue_trigger_cleanup(q);
/* Mark all outstanding jobs as bad, thus completing them */
spin_lock(&sched->base.job_list_lock);