diff options
Diffstat (limited to 'drivers/gpu/drm/i915/selftests/intel_hangcheck.c')
-rw-r--r-- | drivers/gpu/drm/i915/selftests/intel_hangcheck.c | 414 |
1 files changed, 257 insertions, 157 deletions
diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c index df7898c8edcb..438e0b045a2c 100644 --- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c +++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c @@ -25,10 +25,14 @@ #include <linux/kthread.h> #include "../i915_selftest.h" +#include "i915_random.h" +#include "igt_flush_test.h" #include "mock_context.h" #include "mock_drm.h" +#define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */ + struct hang { struct drm_i915_private *i915; struct drm_i915_gem_object *hws; @@ -250,58 +254,6 @@ static u32 hws_seqno(const struct hang *h, const struct i915_request *rq) return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]); } -struct wedge_me { - struct delayed_work work; - struct drm_i915_private *i915; - const void *symbol; -}; - -static void wedge_me(struct work_struct *work) -{ - struct wedge_me *w = container_of(work, typeof(*w), work.work); - - pr_err("%pS timed out, cancelling all further testing.\n", - w->symbol); - i915_gem_set_wedged(w->i915); -} - -static void __init_wedge(struct wedge_me *w, - struct drm_i915_private *i915, - long timeout, - const void *symbol) -{ - w->i915 = i915; - w->symbol = symbol; - - INIT_DELAYED_WORK_ONSTACK(&w->work, wedge_me); - schedule_delayed_work(&w->work, timeout); -} - -static void __fini_wedge(struct wedge_me *w) -{ - cancel_delayed_work_sync(&w->work); - destroy_delayed_work_on_stack(&w->work); - w->i915 = NULL; -} - -#define wedge_on_timeout(W, DEV, TIMEOUT) \ - for (__init_wedge((W), (DEV), (TIMEOUT), __builtin_return_address(0)); \ - (W)->i915; \ - __fini_wedge((W))) - -static noinline int -flush_test(struct drm_i915_private *i915, unsigned int flags) -{ - struct wedge_me w; - - cond_resched(); - - wedge_on_timeout(&w, i915, HZ) - i915_gem_wait_for_idle(i915, flags); - - return i915_terminally_wedged(&i915->gpu_error) ? -EIO : 0; -} - static void hang_fini(struct hang *h) { *h->batch = MI_BATCH_BUFFER_END; @@ -315,10 +267,10 @@ static void hang_fini(struct hang *h) kernel_context_close(h->ctx); - flush_test(h->i915, I915_WAIT_LOCKED); + igt_flush_test(h->i915, I915_WAIT_LOCKED); } -static bool wait_for_hang(struct hang *h, struct i915_request *rq) +static bool wait_until_running(struct hang *h, struct i915_request *rq) { return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq), rq->fence.seqno), @@ -433,7 +385,7 @@ static int igt_global_reset(void *arg) mutex_lock(&i915->drm.struct_mutex); reset_count = i915_reset_count(&i915->gpu_error); - i915_reset(i915, I915_RESET_QUIET); + i915_reset(i915, ALL_ENGINES, NULL); if (i915_reset_count(&i915->gpu_error) == reset_count) { pr_err("No GPU reset recorded!\n"); @@ -450,6 +402,11 @@ static int igt_global_reset(void *arg) return err; } +static bool wait_for_idle(struct intel_engine_cs *engine) +{ + return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0; +} + static int __igt_reset_engine(struct drm_i915_private *i915, bool active) { struct intel_engine_cs *engine; @@ -477,12 +434,21 @@ static int __igt_reset_engine(struct drm_i915_private *i915, bool active) if (active && !intel_engine_can_store_dword(engine)) continue; + if (!wait_for_idle(engine)) { + pr_err("%s failed to idle before reset\n", + engine->name); + err = -EIO; + break; + } + reset_count = i915_reset_count(&i915->gpu_error); reset_engine_count = i915_reset_engine_count(&i915->gpu_error, engine); set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags); do { + u32 seqno = intel_engine_get_seqno(engine); + if (active) { struct i915_request *rq; @@ -498,7 +464,7 @@ static int __igt_reset_engine(struct drm_i915_private *i915, bool active) __i915_request_add(rq, true); mutex_unlock(&i915->drm.struct_mutex); - if (!wait_for_hang(&h, rq)) { + if (!wait_until_running(&h, rq)) { struct drm_printer p = drm_info_printer(i915->drm.dev); pr_err("%s: Failed to start request %x, at %x\n", @@ -511,14 +477,12 @@ static int __igt_reset_engine(struct drm_i915_private *i915, bool active) break; } + GEM_BUG_ON(!rq->global_seqno); + seqno = rq->global_seqno - 1; i915_request_put(rq); } - engine->hangcheck.stalled = true; - engine->hangcheck.seqno = - intel_engine_get_seqno(engine); - - err = i915_reset_engine(engine, I915_RESET_QUIET); + err = i915_reset_engine(engine, NULL); if (err) { pr_err("i915_reset_engine failed\n"); break; @@ -539,14 +503,25 @@ static int __igt_reset_engine(struct drm_i915_private *i915, bool active) break; } - engine->hangcheck.stalled = false; + if (!wait_for_idle(engine)) { + struct drm_printer p = + drm_info_printer(i915->drm.dev); + + pr_err("%s failed to idle after reset\n", + engine->name); + intel_engine_dump(engine, &p, + "%s\n", engine->name); + + err = -EIO; + break; + } } while (time_before(jiffies, end_time)); clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags); if (err) break; - err = flush_test(i915, 0); + err = igt_flush_test(i915, 0); if (err) break; } @@ -573,11 +548,25 @@ static int igt_reset_active_engine(void *arg) return __igt_reset_engine(arg, true); } +struct active_engine { + struct task_struct *task; + struct intel_engine_cs *engine; + unsigned long resets; + unsigned int flags; +}; + +#define TEST_ACTIVE BIT(0) +#define TEST_OTHERS BIT(1) +#define TEST_SELF BIT(2) +#define TEST_PRIORITY BIT(3) + static int active_engine(void *data) { - struct intel_engine_cs *engine = data; - struct i915_request *rq[2] = {}; - struct i915_gem_context *ctx[2]; + I915_RND_STATE(prng); + struct active_engine *arg = data; + struct intel_engine_cs *engine = arg->engine; + struct i915_request *rq[8] = {}; + struct i915_gem_context *ctx[ARRAY_SIZE(rq)]; struct drm_file *file; unsigned long count = 0; int err = 0; @@ -586,25 +575,20 @@ static int active_engine(void *data) if (IS_ERR(file)) return PTR_ERR(file); - mutex_lock(&engine->i915->drm.struct_mutex); - ctx[0] = live_context(engine->i915, file); - mutex_unlock(&engine->i915->drm.struct_mutex); - if (IS_ERR(ctx[0])) { - err = PTR_ERR(ctx[0]); - goto err_file; - } - - mutex_lock(&engine->i915->drm.struct_mutex); - ctx[1] = live_context(engine->i915, file); - mutex_unlock(&engine->i915->drm.struct_mutex); - if (IS_ERR(ctx[1])) { - err = PTR_ERR(ctx[1]); - i915_gem_context_put(ctx[0]); - goto err_file; + for (count = 0; count < ARRAY_SIZE(ctx); count++) { + mutex_lock(&engine->i915->drm.struct_mutex); + ctx[count] = live_context(engine->i915, file); + mutex_unlock(&engine->i915->drm.struct_mutex); + if (IS_ERR(ctx[count])) { + err = PTR_ERR(ctx[count]); + while (--count) + i915_gem_context_put(ctx[count]); + goto err_file; + } } while (!kthread_should_stop()) { - unsigned int idx = count++ & 1; + unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1); struct i915_request *old = rq[idx]; struct i915_request *new; @@ -616,14 +600,28 @@ static int active_engine(void *data) break; } + if (arg->flags & TEST_PRIORITY) + ctx[idx]->sched.priority = + i915_prandom_u32_max_state(512, &prng); + rq[idx] = i915_request_get(new); i915_request_add(new); mutex_unlock(&engine->i915->drm.struct_mutex); if (old) { - i915_request_wait(old, 0, MAX_SCHEDULE_TIMEOUT); + if (i915_request_wait(old, 0, HZ) < 0) { + GEM_TRACE("%s timed out.\n", engine->name); + GEM_TRACE_DUMP(); + + i915_gem_set_wedged(engine->i915); + i915_request_put(old); + err = -EIO; + break; + } i915_request_put(old); } + + cond_resched(); } for (count = 0; count < ARRAY_SIZE(rq); count++) @@ -634,8 +632,9 @@ err_file: return err; } -static int __igt_reset_engine_others(struct drm_i915_private *i915, - bool active) +static int __igt_reset_engines(struct drm_i915_private *i915, + const char *test_name, + unsigned int flags) { struct intel_engine_cs *engine, *other; enum intel_engine_id id, tmp; @@ -649,50 +648,68 @@ static int __igt_reset_engine_others(struct drm_i915_private *i915, if (!intel_has_reset_engine(i915)) return 0; - if (active) { + if (flags & TEST_ACTIVE) { mutex_lock(&i915->drm.struct_mutex); err = hang_init(&h, i915); mutex_unlock(&i915->drm.struct_mutex); if (err) return err; + + if (flags & TEST_PRIORITY) + h.ctx->sched.priority = 1024; } for_each_engine(engine, i915, id) { - struct task_struct *threads[I915_NUM_ENGINES] = {}; - unsigned long resets[I915_NUM_ENGINES]; + struct active_engine threads[I915_NUM_ENGINES] = {}; unsigned long global = i915_reset_count(&i915->gpu_error); - unsigned long count = 0; + unsigned long count = 0, reported; IGT_TIMEOUT(end_time); - if (active && !intel_engine_can_store_dword(engine)) + if (flags & TEST_ACTIVE && + !intel_engine_can_store_dword(engine)) continue; + if (!wait_for_idle(engine)) { + pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n", + engine->name, test_name); + err = -EIO; + break; + } + memset(threads, 0, sizeof(threads)); for_each_engine(other, i915, tmp) { struct task_struct *tsk; - resets[tmp] = i915_reset_engine_count(&i915->gpu_error, - other); + threads[tmp].resets = + i915_reset_engine_count(&i915->gpu_error, + other); + + if (!(flags & TEST_OTHERS)) + continue; - if (other == engine) + if (other == engine && !(flags & TEST_SELF)) continue; - tsk = kthread_run(active_engine, other, + threads[tmp].engine = other; + threads[tmp].flags = flags; + + tsk = kthread_run(active_engine, &threads[tmp], "igt/%s", other->name); if (IS_ERR(tsk)) { err = PTR_ERR(tsk); goto unwind; } - threads[tmp] = tsk; + threads[tmp].task = tsk; get_task_struct(tsk); } set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags); do { - if (active) { - struct i915_request *rq; + u32 seqno = intel_engine_get_seqno(engine); + struct i915_request *rq = NULL; + if (flags & TEST_ACTIVE) { mutex_lock(&i915->drm.struct_mutex); rq = hang_create_request(&h, engine); if (IS_ERR(rq)) { @@ -705,7 +722,7 @@ static int __igt_reset_engine_others(struct drm_i915_private *i915, __i915_request_add(rq, true); mutex_unlock(&i915->drm.struct_mutex); - if (!wait_for_hang(&h, rq)) { + if (!wait_until_running(&h, rq)) { struct drm_printer p = drm_info_printer(i915->drm.dev); pr_err("%s: Failed to start request %x, at %x\n", @@ -718,33 +735,48 @@ static int __igt_reset_engine_others(struct drm_i915_private *i915, break; } - i915_request_put(rq); + GEM_BUG_ON(!rq->global_seqno); + seqno = rq->global_seqno - 1; } - engine->hangcheck.stalled = true; - engine->hangcheck.seqno = - intel_engine_get_seqno(engine); - - err = i915_reset_engine(engine, I915_RESET_QUIET); + err = i915_reset_engine(engine, NULL); if (err) { - pr_err("i915_reset_engine(%s:%s) failed, err=%d\n", - engine->name, active ? "active" : "idle", err); + pr_err("i915_reset_engine(%s:%s): failed, err=%d\n", + engine->name, test_name, err); break; } - engine->hangcheck.stalled = false; count++; + + if (rq) { + i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT); + i915_request_put(rq); + } + + if (!(flags & TEST_SELF) && !wait_for_idle(engine)) { + struct drm_printer p = + drm_info_printer(i915->drm.dev); + + pr_err("i915_reset_engine(%s:%s):" + " failed to idle after reset\n", + engine->name, test_name); + intel_engine_dump(engine, &p, + "%s\n", engine->name); + + err = -EIO; + break; + } } while (time_before(jiffies, end_time)); clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags); pr_info("i915_reset_engine(%s:%s): %lu resets\n", - engine->name, active ? "active" : "idle", count); - - if (i915_reset_engine_count(&i915->gpu_error, engine) - - resets[engine->id] != (active ? count : 0)) { - pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n", - engine->name, active ? "active" : "idle", count, - i915_reset_engine_count(&i915->gpu_error, - engine) - resets[engine->id]); + engine->name, test_name, count); + + reported = i915_reset_engine_count(&i915->gpu_error, engine); + reported -= threads[engine->id].resets; + if (reported != (flags & TEST_ACTIVE ? count : 0)) { + pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu, expected %lu reported\n", + engine->name, test_name, count, reported, + (flags & TEST_ACTIVE ? count : 0)); if (!err) err = -EINVAL; } @@ -753,24 +785,26 @@ unwind: for_each_engine(other, i915, tmp) { int ret; - if (!threads[tmp]) + if (!threads[tmp].task) continue; - ret = kthread_stop(threads[tmp]); + ret = kthread_stop(threads[tmp].task); if (ret) { pr_err("kthread for other engine %s failed, err=%d\n", other->name, ret); if (!err) err = ret; } - put_task_struct(threads[tmp]); + put_task_struct(threads[tmp].task); - if (resets[tmp] != i915_reset_engine_count(&i915->gpu_error, - other)) { + if (other != engine && + threads[tmp].resets != + i915_reset_engine_count(&i915->gpu_error, other)) { pr_err("Innocent engine %s was reset (count=%ld)\n", other->name, i915_reset_engine_count(&i915->gpu_error, - other) - resets[tmp]); + other) - + threads[tmp].resets); if (!err) err = -EINVAL; } @@ -786,7 +820,7 @@ unwind: if (err) break; - err = flush_test(i915, 0); + err = igt_flush_test(i915, 0); if (err) break; } @@ -794,7 +828,7 @@ unwind: if (i915_terminally_wedged(&i915->gpu_error)) err = -EIO; - if (active) { + if (flags & TEST_ACTIVE) { mutex_lock(&i915->drm.struct_mutex); hang_fini(&h); mutex_unlock(&i915->drm.struct_mutex); @@ -803,27 +837,56 @@ unwind: return err; } -static int igt_reset_idle_engine_others(void *arg) +static int igt_reset_engines(void *arg) { - return __igt_reset_engine_others(arg, false); -} + static const struct { + const char *name; + unsigned int flags; + } phases[] = { + { "idle", 0 }, + { "active", TEST_ACTIVE }, + { "others-idle", TEST_OTHERS }, + { "others-active", TEST_OTHERS | TEST_ACTIVE }, + { + "others-priority", + TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY + }, + { + "self-priority", + TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY | TEST_SELF, + }, + { } + }; + struct drm_i915_private *i915 = arg; + typeof(*phases) *p; + int err; -static int igt_reset_active_engine_others(void *arg) -{ - return __igt_reset_engine_others(arg, true); + for (p = phases; p->name; p++) { + if (p->flags & TEST_PRIORITY) { + if (!(i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY)) + continue; + } + + err = __igt_reset_engines(arg, p->name, p->flags); + if (err) + return err; + } + + return 0; } -static u32 fake_hangcheck(struct i915_request *rq) +static u32 fake_hangcheck(struct i915_request *rq, u32 mask) { - u32 reset_count; + struct i915_gpu_error *error = &rq->i915->gpu_error; + u32 reset_count = i915_reset_count(error); - rq->engine->hangcheck.stalled = true; - rq->engine->hangcheck.seqno = intel_engine_get_seqno(rq->engine); + error->stalled_mask = mask; - reset_count = i915_reset_count(&rq->i915->gpu_error); + /* set_bit() must be after we have setup the backchannel (mask) */ + smp_mb__before_atomic(); + set_bit(I915_RESET_HANDOFF, &error->flags); - set_bit(I915_RESET_HANDOFF, &rq->i915->gpu_error.flags); - wake_up_all(&rq->i915->gpu_error.wait_queue); + wake_up_all(&error->wait_queue); return reset_count; } @@ -858,21 +921,20 @@ static int igt_wait_reset(void *arg) i915_request_get(rq); __i915_request_add(rq, true); - if (!wait_for_hang(&h, rq)) { + if (!wait_until_running(&h, rq)) { struct drm_printer p = drm_info_printer(i915->drm.dev); pr_err("%s: Failed to start request %x, at %x\n", __func__, rq->fence.seqno, hws_seqno(&h, rq)); intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name); - i915_reset(i915, 0); i915_gem_set_wedged(i915); err = -EIO; goto out_rq; } - reset_count = fake_hangcheck(rq); + reset_count = fake_hangcheck(rq, ALL_ENGINES); timeout = i915_request_wait(rq, I915_WAIT_LOCKED, 10); if (timeout < 0) { @@ -903,6 +965,23 @@ unlock: return err; } +static int wait_for_others(struct drm_i915_private *i915, + struct intel_engine_cs *exclude) +{ + struct intel_engine_cs *engine; + enum intel_engine_id id; + + for_each_engine(engine, i915, id) { + if (engine == exclude) + continue; + + if (!wait_for_idle(engine)) + return -EIO; + } + + return 0; +} + static int igt_reset_queue(void *arg) { struct drm_i915_private *i915 = arg; @@ -951,27 +1030,49 @@ static int igt_reset_queue(void *arg) i915_request_get(rq); __i915_request_add(rq, true); - if (!wait_for_hang(&h, prev)) { + /* + * XXX We don't handle resetting the kernel context + * very well. If we trigger a device reset twice in + * quick succession while the kernel context is + * executing, we may end up skipping the breadcrumb. + * This is really only a problem for the selftest as + * normally there is a large interlude between resets + * (hangcheck), or we focus on resetting just one + * engine and so avoid repeatedly resetting innocents. + */ + err = wait_for_others(i915, engine); + if (err) { + pr_err("%s(%s): Failed to idle other inactive engines after device reset\n", + __func__, engine->name); + i915_request_put(rq); + i915_request_put(prev); + + GEM_TRACE_DUMP(); + i915_gem_set_wedged(i915); + goto fini; + } + + if (!wait_until_running(&h, prev)) { struct drm_printer p = drm_info_printer(i915->drm.dev); - pr_err("%s: Failed to start request %x, at %x\n", - __func__, prev->fence.seqno, hws_seqno(&h, prev)); - intel_engine_dump(prev->engine, &p, - "%s\n", prev->engine->name); + pr_err("%s(%s): Failed to start request %x, at %x\n", + __func__, engine->name, + prev->fence.seqno, hws_seqno(&h, prev)); + intel_engine_dump(engine, &p, + "%s\n", engine->name); i915_request_put(rq); i915_request_put(prev); - i915_reset(i915, 0); i915_gem_set_wedged(i915); err = -EIO; goto fini; } - reset_count = fake_hangcheck(prev); + reset_count = fake_hangcheck(prev, ENGINE_MASK(id)); - i915_reset(i915, I915_RESET_QUIET); + i915_reset(i915, ENGINE_MASK(id), NULL); GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags)); @@ -1013,7 +1114,7 @@ static int igt_reset_queue(void *arg) i915_request_put(prev); - err = flush_test(i915, I915_WAIT_LOCKED); + err = igt_flush_test(i915, I915_WAIT_LOCKED); if (err) break; } @@ -1044,7 +1145,7 @@ static int igt_handle_error(void *arg) if (!intel_has_reset_engine(i915)) return 0; - if (!intel_engine_can_store_dword(i915->engine[RCS])) + if (!engine || !intel_engine_can_store_dword(engine)) return 0; mutex_lock(&i915->drm.struct_mutex); @@ -1062,14 +1163,13 @@ static int igt_handle_error(void *arg) i915_request_get(rq); __i915_request_add(rq, true); - if (!wait_for_hang(&h, rq)) { + if (!wait_until_running(&h, rq)) { struct drm_printer p = drm_info_printer(i915->drm.dev); pr_err("%s: Failed to start request %x, at %x\n", __func__, rq->fence.seqno, hws_seqno(&h, rq)); intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name); - i915_reset(i915, 0); i915_gem_set_wedged(i915); err = -EIO; @@ -1081,10 +1181,7 @@ static int igt_handle_error(void *arg) /* Temporarily disable error capture */ error = xchg(&i915->gpu_error.first_error, (void *)-1); - engine->hangcheck.stalled = true; - engine->hangcheck.seqno = intel_engine_get_seqno(engine); - - i915_handle_error(i915, intel_engine_flag(engine), "%s", __func__); + i915_handle_error(i915, ENGINE_MASK(engine->id), 0, NULL); xchg(&i915->gpu_error.first_error, error); @@ -1112,8 +1209,7 @@ int intel_hangcheck_live_selftests(struct drm_i915_private *i915) SUBTEST(igt_hang_sanitycheck), SUBTEST(igt_reset_idle_engine), SUBTEST(igt_reset_active_engine), - SUBTEST(igt_reset_idle_engine_others), - SUBTEST(igt_reset_active_engine_others), + SUBTEST(igt_reset_engines), SUBTEST(igt_wait_reset), SUBTEST(igt_reset_queue), SUBTEST(igt_handle_error), @@ -1129,6 +1225,10 @@ int intel_hangcheck_live_selftests(struct drm_i915_private *i915) err = i915_subtests(tests, i915); + mutex_lock(&i915->drm.struct_mutex); + igt_flush_test(i915, I915_WAIT_LOCKED); + mutex_unlock(&i915->drm.struct_mutex); + i915_modparams.enable_hangcheck = saved_hangcheck; intel_runtime_pm_put(i915); |