diff options
author | Chris Wilson <chris@chris-wilson.co.uk> | 2017-06-22 13:56:25 +0300 |
---|---|---|
committer | Chris Wilson <chris@chris-wilson.co.uk> | 2017-06-23 15:41:55 +0300 |
commit | 36703e79a982c8ce5a8e43833291f2719e92d0d1 (patch) | |
tree | 0118684ecb324be39f64eab500a0e98de98f1ba0 /drivers/gpu/drm/i915/i915_gem.c | |
parent | ae25eceab616d16a07bcaa434b84463d58a3bdc3 (diff) | |
download | linux-36703e79a982c8ce5a8e43833291f2719e92d0d1.tar.xz |
drm/i915: Break modeset deadlocks on reset
Trying to do a modeset from within a reset is fraught with danger. We
can fall into a cyclic deadlock where the modeset is waiting on a
previous modeset that is waiting on a request, and since the GPU hung
that request completion is waiting on the reset. As modesetting doesn't
allow its locks to be broken and restarted, or for its *own* reset
mechanism to take over the display, we have to do something very
evil instead. If we detect that we are stuck waiting to prepare the
display reset (by using a very simple timeout), resort to cancelling all
in-flight requests and throwing the user data into /dev/null, which is
marginally better than the driver locking up and keeping that data to
itself.
This is not a fix; this is just a workaround that unbreaks machines
until we can resolve the deadlock in a way that doesn't lose data!
v2: Move the retirement from set-wegded to the i915_reset() error path,
after which we no longer any delayed worker cleanup for
i915_handle_error()
v3: C abuse for syntactic sugar
v4: Cover all waits with the timeout to catch more driver breakage
References: https://bugs.freedesktop.org/show_bug.cgi?id=99093
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/20170622105625.16952-1-chris@chris-wilson.co.uk
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Diffstat (limited to 'drivers/gpu/drm/i915/i915_gem.c')
-rw-r--r-- | drivers/gpu/drm/i915/i915_gem.c | 18 |
1 files changed, 4 insertions, 14 deletions
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index ae3ce1314bd1..36d838677982 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -3062,7 +3062,8 @@ static void engine_set_wedged(struct intel_engine_cs *engine) /* Mark all executing requests as skipped */ spin_lock_irqsave(&engine->timeline->lock, flags); list_for_each_entry(request, &engine->timeline->requests, link) - dma_fence_set_error(&request->fence, -EIO); + if (!i915_gem_request_completed(request)) + dma_fence_set_error(&request->fence, -EIO); spin_unlock_irqrestore(&engine->timeline->lock, flags); /* Mark all pending requests as complete so that any concurrent @@ -3108,6 +3109,7 @@ static int __i915_gem_set_wedged_BKL(void *data) struct intel_engine_cs *engine; enum intel_engine_id id; + set_bit(I915_WEDGED, &i915->gpu_error.flags); for_each_engine(engine, i915, id) engine_set_wedged(engine); @@ -3116,20 +3118,7 @@ static int __i915_gem_set_wedged_BKL(void *data) void i915_gem_set_wedged(struct drm_i915_private *dev_priv) { - lockdep_assert_held(&dev_priv->drm.struct_mutex); - set_bit(I915_WEDGED, &dev_priv->gpu_error.flags); - - /* Retire completed requests first so the list of inflight/incomplete - * requests is accurate and we don't try and mark successful requests - * as in error during __i915_gem_set_wedged_BKL(). - */ - i915_gem_retire_requests(dev_priv); - stop_machine(__i915_gem_set_wedged_BKL, dev_priv, NULL); - - i915_gem_contexts_lost(dev_priv); - - mod_delayed_work(dev_priv->wq, &dev_priv->gt.idle_work, 0); } bool i915_gem_unset_wedged(struct drm_i915_private *i915) @@ -3184,6 +3173,7 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915) * context and do not require stop_machine(). */ intel_engines_reset_default_submission(i915); + i915_gem_contexts_lost(i915); smp_mb__before_atomic(); /* complete takeover before enabling execbuf */ clear_bit(I915_WEDGED, &i915->gpu_error.flags); |