diff options
| -rw-r--r-- | drivers/gpu/drm/xe/xe_guc_submit.c | 49 |
1 files changed, 35 insertions, 14 deletions
diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c index a4a8f0d41fe8..42110e01b7d0 100644 --- a/drivers/gpu/drm/xe/xe_guc_submit.c +++ b/drivers/gpu/drm/xe/xe_guc_submit.c @@ -157,6 +157,11 @@ static void set_exec_queue_banned(struct xe_exec_queue *q) atomic_or(EXEC_QUEUE_STATE_BANNED, &q->guc->state); } +static void clear_exec_queue_banned(struct xe_exec_queue *q) +{ + atomic_andnot(EXEC_QUEUE_STATE_BANNED, &q->guc->state); +} + static bool exec_queue_suspended(struct xe_exec_queue *q) { return atomic_read(&q->guc->state) & EXEC_QUEUE_STATE_SUSPENDED; @@ -1361,7 +1366,8 @@ static bool check_timeout(struct xe_exec_queue *q, struct xe_sched_job *job) xe_sched_job_seqno(job), xe_sched_job_lrc_seqno(job), q->guc->id); - return xe_sched_invalidate_job(job, 2); + /* GuC never scheduled this job - let the caller trigger a GT reset. */ + return true; } ctx_timestamp = lower_32_bits(xe_lrc_timestamp(q->lrc[0])); @@ -1458,6 +1464,21 @@ static void disable_scheduling(struct xe_exec_queue *q, bool immediate) G2H_LEN_DW_SCHED_CONTEXT_MODE_SET, 1); } +/* + * Recover via GT reset for a kernel queue, or for a GuC scheduling failure (job + * never started) on a queue that was not already killed or banned. An already + * banned queue must stay banned, so its unstarted jobs do not clear the ban or + * trigger a reset. + */ +static bool timeout_needs_gt_reset(struct xe_exec_queue *q, struct xe_sched_job *job, + bool skip_timeout_check) +{ + if (q->flags & EXEC_QUEUE_FLAG_KERNEL) + return true; + + return !skip_timeout_check && !xe_sched_job_started(job); +} + static enum drm_gpu_sched_stat guc_exec_queue_timedout_job(struct drm_sched_job *drm_job) { @@ -1606,19 +1627,19 @@ trigger_reset: xe_sched_job_seqno(job), xe_sched_job_lrc_seqno(job), q->guc->id, q->flags); - /* - * Kernel jobs should never fail, nor should VM jobs if they do - * somethings has gone wrong and the GT needs a reset - */ - xe_gt_WARN(q->gt, q->flags & EXEC_QUEUE_FLAG_KERNEL, - "Kernel-submitted job timed out\n"); - xe_gt_WARN(q->gt, q->flags & EXEC_QUEUE_FLAG_VM && !exec_queue_killed(q), - "VM job timed out on non-killed execqueue\n"); - if (!wedged && (q->flags & EXEC_QUEUE_FLAG_KERNEL || - (q->flags & EXEC_QUEUE_FLAG_VM && !exec_queue_killed(q)))) { - if (!xe_sched_invalidate_job(job, 2)) { - xe_gt_reset_async(q->gt); - goto rearm; + if (!wedged) { + if (timeout_needs_gt_reset(q, job, skip_timeout_check)) { + if (!xe_sched_invalidate_job(job, 2)) { + clear_exec_queue_banned(q); + xe_gt_reset_async(q->gt); + goto rearm; + } + if (q->flags & EXEC_QUEUE_FLAG_KERNEL) { + xe_gt_WARN(q->gt, true, "Kernel-submitted job timed out\n"); + xe_device_declare_wedged(gt_to_xe(q->gt)); + } + } else if (q->flags & EXEC_QUEUE_FLAG_VM && !exec_queue_killed(q)) { + xe_gt_WARN(q->gt, true, "VM job timed out on non-killed execqueue\n"); } } |
