summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/scheduler/sched_main.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/gpu/drm/scheduler/sched_main.c')
-rw-r--r--drivers/gpu/drm/scheduler/sched_main.c219
1 files changed, 129 insertions, 90 deletions
diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index dbb69063b3d5..19fc601c9eeb 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -60,8 +60,6 @@
static void drm_sched_process_job(struct dma_fence *f, struct dma_fence_cb *cb);
-static void drm_sched_expel_job_unlocked(struct drm_sched_job *s_job);
-
/**
* drm_sched_rq_init - initialize a given run queue struct
*
@@ -286,8 +284,6 @@ static void drm_sched_job_finish(struct work_struct *work)
cancel_delayed_work_sync(&sched->work_tdr);
spin_lock_irqsave(&sched->job_list_lock, flags);
- /* remove job from ring_mirror_list */
- list_del_init(&s_job->node);
/* queue TDR for next job */
drm_sched_start_timeout(sched);
spin_unlock_irqrestore(&sched->job_list_lock, flags);
@@ -295,22 +291,11 @@ static void drm_sched_job_finish(struct work_struct *work)
sched->ops->free_job(s_job);
}
-static void drm_sched_job_finish_cb(struct dma_fence *f,
- struct dma_fence_cb *cb)
-{
- struct drm_sched_job *job = container_of(cb, struct drm_sched_job,
- finish_cb);
- schedule_work(&job->finish_work);
-}
-
static void drm_sched_job_begin(struct drm_sched_job *s_job)
{
struct drm_gpu_scheduler *sched = s_job->sched;
unsigned long flags;
- dma_fence_add_callback(&s_job->s_fence->finished, &s_job->finish_cb,
- drm_sched_job_finish_cb);
-
spin_lock_irqsave(&sched->job_list_lock, flags);
list_add_tail(&s_job->node, &sched->ring_mirror_list);
drm_sched_start_timeout(sched);
@@ -335,6 +320,51 @@ static void drm_sched_job_timedout(struct work_struct *work)
spin_unlock_irqrestore(&sched->job_list_lock, flags);
}
+ /**
+ * drm_sched_increase_karma - Update sched_entity guilty flag
+ *
+ * @bad: The job guilty of time out
+ *
+ * Increment on every hang caused by the 'bad' job. If this exceeds the hang
+ * limit of the scheduler then the respective sched entity is marked guilty and
+ * jobs from it will not be scheduled further
+ */
+void drm_sched_increase_karma(struct drm_sched_job *bad)
+{
+ int i;
+ struct drm_sched_entity *tmp;
+ struct drm_sched_entity *entity;
+ struct drm_gpu_scheduler *sched = bad->sched;
+
+ /* don't increase @bad's karma if it's from KERNEL RQ,
+ * because sometimes GPU hang would cause kernel jobs (like VM updating jobs)
+ * corrupt but keep in mind that kernel jobs always considered good.
+ */
+ if (bad->s_priority != DRM_SCHED_PRIORITY_KERNEL) {
+ atomic_inc(&bad->karma);
+ for (i = DRM_SCHED_PRIORITY_MIN; i < DRM_SCHED_PRIORITY_KERNEL;
+ i++) {
+ struct drm_sched_rq *rq = &sched->sched_rq[i];
+
+ spin_lock(&rq->lock);
+ list_for_each_entry_safe(entity, tmp, &rq->entities, list) {
+ if (bad->s_fence->scheduled.context ==
+ entity->fence_context) {
+ if (atomic_read(&bad->karma) >
+ bad->sched->hang_limit)
+ if (entity->guilty)
+ atomic_set(entity->guilty, 1);
+ break;
+ }
+ }
+ spin_unlock(&rq->lock);
+ if (&entity->list != &rq->entities)
+ break;
+ }
+ }
+}
+EXPORT_SYMBOL(drm_sched_increase_karma);
+
/**
* drm_sched_hw_job_reset - stop the scheduler if it contains the bad job
*
@@ -342,50 +372,42 @@ static void drm_sched_job_timedout(struct work_struct *work)
* @bad: bad scheduler job
*
*/
-void drm_sched_hw_job_reset(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
+void drm_sched_stop(struct drm_gpu_scheduler *sched)
{
struct drm_sched_job *s_job;
- struct drm_sched_entity *entity, *tmp;
unsigned long flags;
- int i;
+ struct dma_fence *last_fence = NULL;
+ kthread_park(sched->thread);
+
+ /*
+ * Verify all the signaled jobs in mirror list are removed from the ring
+ * by waiting for the latest job to enter the list. This should insure that
+ * also all the previous jobs that were in flight also already singaled
+ * and removed from the list.
+ */
spin_lock_irqsave(&sched->job_list_lock, flags);
list_for_each_entry_reverse(s_job, &sched->ring_mirror_list, node) {
if (s_job->s_fence->parent &&
dma_fence_remove_callback(s_job->s_fence->parent,
- &s_job->s_fence->cb)) {
+ &s_job->cb)) {
dma_fence_put(s_job->s_fence->parent);
s_job->s_fence->parent = NULL;
atomic_dec(&sched->hw_rq_count);
+ } else {
+ last_fence = dma_fence_get(&s_job->s_fence->finished);
+ break;
}
}
spin_unlock_irqrestore(&sched->job_list_lock, flags);
- if (bad && bad->s_priority != DRM_SCHED_PRIORITY_KERNEL) {
- atomic_inc(&bad->karma);
- /* don't increase @bad's karma if it's from KERNEL RQ,
- * becuase sometimes GPU hang would cause kernel jobs (like VM updating jobs)
- * corrupt but keep in mind that kernel jobs always considered good.
- */
- for (i = DRM_SCHED_PRIORITY_MIN; i < DRM_SCHED_PRIORITY_KERNEL; i++ ) {
- struct drm_sched_rq *rq = &sched->sched_rq[i];
-
- spin_lock(&rq->lock);
- list_for_each_entry_safe(entity, tmp, &rq->entities, list) {
- if (bad->s_fence->scheduled.context == entity->fence_context) {
- if (atomic_read(&bad->karma) > bad->sched->hang_limit)
- if (entity->guilty)
- atomic_set(entity->guilty, 1);
- break;
- }
- }
- spin_unlock(&rq->lock);
- if (&entity->list != &rq->entities)
- break;
- }
+ if (last_fence) {
+ dma_fence_wait(last_fence, false);
+ dma_fence_put(last_fence);
}
}
-EXPORT_SYMBOL(drm_sched_hw_job_reset);
+
+EXPORT_SYMBOL(drm_sched_stop);
/**
* drm_sched_job_recovery - recover jobs after a reset
@@ -393,18 +415,58 @@ EXPORT_SYMBOL(drm_sched_hw_job_reset);
* @sched: scheduler instance
*
*/
-void drm_sched_job_recovery(struct drm_gpu_scheduler *sched)
+void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery)
{
struct drm_sched_job *s_job, *tmp;
- bool found_guilty = false;
- unsigned long flags;
int r;
- spin_lock_irqsave(&sched->job_list_lock, flags);
+ if (!full_recovery)
+ goto unpark;
+
+ /*
+ * Locking the list is not required here as the sched thread is parked
+ * so no new jobs are being pushed in to HW and in drm_sched_stop we
+ * flushed all the jobs who were still in mirror list but who already
+ * signaled and removed them self from the list. Also concurrent
+ * GPU recovers can't run in parallel.
+ */
+ list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
+ struct dma_fence *fence = s_job->s_fence->parent;
+
+ if (fence) {
+ r = dma_fence_add_callback(fence, &s_job->cb,
+ drm_sched_process_job);
+ if (r == -ENOENT)
+ drm_sched_process_job(fence, &s_job->cb);
+ else if (r)
+ DRM_ERROR("fence add callback failed (%d)\n",
+ r);
+ } else
+ drm_sched_process_job(NULL, &s_job->cb);
+ }
+
+ drm_sched_start_timeout(sched);
+
+unpark:
+ kthread_unpark(sched->thread);
+}
+EXPORT_SYMBOL(drm_sched_start);
+
+/**
+ * drm_sched_resubmit_jobs - helper to relunch job from mirror ring list
+ *
+ * @sched: scheduler instance
+ *
+ */
+void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched)
+{
+ struct drm_sched_job *s_job, *tmp;
+ uint64_t guilty_context;
+ bool found_guilty = false;
+
+ /*TODO DO we need spinlock here ? */
list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
struct drm_sched_fence *s_fence = s_job->s_fence;
- struct dma_fence *fence;
- uint64_t guilty_context;
if (!found_guilty && atomic_read(&s_job->karma) > sched->hang_limit) {
found_guilty = true;
@@ -414,31 +476,11 @@ void drm_sched_job_recovery(struct drm_gpu_scheduler *sched)
if (found_guilty && s_job->s_fence->scheduled.context == guilty_context)
dma_fence_set_error(&s_fence->finished, -ECANCELED);
- spin_unlock_irqrestore(&sched->job_list_lock, flags);
- fence = sched->ops->run_job(s_job);
+ s_job->s_fence->parent = sched->ops->run_job(s_job);
atomic_inc(&sched->hw_rq_count);
-
- if (fence) {
- s_fence->parent = dma_fence_get(fence);
- r = dma_fence_add_callback(fence, &s_fence->cb,
- drm_sched_process_job);
- if (r == -ENOENT)
- drm_sched_process_job(fence, &s_fence->cb);
- else if (r)
- DRM_ERROR("fence add callback failed (%d)\n",
- r);
- dma_fence_put(fence);
- } else {
- if (s_fence->finished.error < 0)
- drm_sched_expel_job_unlocked(s_job);
- drm_sched_process_job(NULL, &s_fence->cb);
- }
- spin_lock_irqsave(&sched->job_list_lock, flags);
}
- drm_sched_start_timeout(sched);
- spin_unlock_irqrestore(&sched->job_list_lock, flags);
}
-EXPORT_SYMBOL(drm_sched_job_recovery);
+EXPORT_SYMBOL(drm_sched_resubmit_jobs);
/**
* drm_sched_job_init - init a scheduler job
@@ -552,18 +594,27 @@ drm_sched_select_entity(struct drm_gpu_scheduler *sched)
*/
static void drm_sched_process_job(struct dma_fence *f, struct dma_fence_cb *cb)
{
- struct drm_sched_fence *s_fence =
- container_of(cb, struct drm_sched_fence, cb);
+ struct drm_sched_job *s_job = container_of(cb, struct drm_sched_job, cb);
+ struct drm_sched_fence *s_fence = s_job->s_fence;
struct drm_gpu_scheduler *sched = s_fence->sched;
+ unsigned long flags;
+
+ cancel_delayed_work(&sched->work_tdr);
- dma_fence_get(&s_fence->finished);
atomic_dec(&sched->hw_rq_count);
atomic_dec(&sched->num_jobs);
+
+ spin_lock_irqsave(&sched->job_list_lock, flags);
+ /* remove job from ring_mirror_list */
+ list_del_init(&s_job->node);
+ spin_unlock_irqrestore(&sched->job_list_lock, flags);
+
drm_sched_fence_finished(s_fence);
trace_drm_sched_process_job(s_fence);
- dma_fence_put(&s_fence->finished);
wake_up_interruptible(&sched->wake_up_worker);
+
+ schedule_work(&s_job->finish_work);
}
/**
@@ -626,34 +677,22 @@ static int drm_sched_main(void *param)
if (fence) {
s_fence->parent = dma_fence_get(fence);
- r = dma_fence_add_callback(fence, &s_fence->cb,
+ r = dma_fence_add_callback(fence, &sched_job->cb,
drm_sched_process_job);
if (r == -ENOENT)
- drm_sched_process_job(fence, &s_fence->cb);
+ drm_sched_process_job(fence, &sched_job->cb);
else if (r)
DRM_ERROR("fence add callback failed (%d)\n",
r);
dma_fence_put(fence);
- } else {
- if (s_fence->finished.error < 0)
- drm_sched_expel_job_unlocked(sched_job);
- drm_sched_process_job(NULL, &s_fence->cb);
- }
+ } else
+ drm_sched_process_job(NULL, &sched_job->cb);
wake_up(&sched->job_scheduled);
}
return 0;
}
-static void drm_sched_expel_job_unlocked(struct drm_sched_job *s_job)
-{
- struct drm_gpu_scheduler *sched = s_job->sched;
-
- spin_lock(&sched->job_list_lock);
- list_del_init(&s_job->node);
- spin_unlock(&sched->job_list_lock);
-}
-
/**
* drm_sched_init - Init a gpu scheduler instance
*