summaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/i915/i915_irq.c
diff options
context:
space:
mode:
authorChris Wilson <chris@chris-wilson.co.uk>2017-06-22 13:56:25 +0300
committerChris Wilson <chris@chris-wilson.co.uk>2017-06-23 15:41:55 +0300
commit36703e79a982c8ce5a8e43833291f2719e92d0d1 (patch)
tree0118684ecb324be39f64eab500a0e98de98f1ba0 /drivers/gpu/drm/i915/i915_irq.c
parentae25eceab616d16a07bcaa434b84463d58a3bdc3 (diff)
downloadlinux-36703e79a982c8ce5a8e43833291f2719e92d0d1.tar.xz
drm/i915: Break modeset deadlocks on reset
Trying to do a modeset from within a reset is fraught with danger. We can fall into a cyclic deadlock where the modeset is waiting on a previous modeset that is waiting on a request, and since the GPU hung that request completion is waiting on the reset. As modesetting doesn't allow its locks to be broken and restarted, or for its *own* reset mechanism to take over the display, we have to do something very evil instead. If we detect that we are stuck waiting to prepare the display reset (by using a very simple timeout), resort to cancelling all in-flight requests and throwing the user data into /dev/null, which is marginally better than the driver locking up and keeping that data to itself. This is not a fix; this is just a workaround that unbreaks machines until we can resolve the deadlock in a way that doesn't lose data! v2: Move the retirement from set-wegded to the i915_reset() error path, after which we no longer any delayed worker cleanup for i915_handle_error() v3: C abuse for syntactic sugar v4: Cover all waits with the timeout to catch more driver breakage References: https://bugs.freedesktop.org/show_bug.cgi?id=99093 Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com> Cc: Mika Kuoppala <mika.kuoppala@intel.com> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Link: http://patchwork.freedesktop.org/patch/msgid/20170622105625.16952-1-chris@chris-wilson.co.uk Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Diffstat (limited to 'drivers/gpu/drm/i915/i915_irq.c')
-rw-r--r--drivers/gpu/drm/i915/i915_irq.c80
1 files changed, 60 insertions, 20 deletions
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index f25e73fe567c..e4934d5adc9e 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2599,6 +2599,46 @@ static irqreturn_t gen8_irq_handler(int irq, void *arg)
return ret;
}
+struct wedge_me {
+ struct delayed_work work;
+ struct drm_i915_private *i915;
+ const char *name;
+};
+
+static void wedge_me(struct work_struct *work)
+{
+ struct wedge_me *w = container_of(work, typeof(*w), work.work);
+
+ dev_err(w->i915->drm.dev,
+ "%s timed out, cancelling all in-flight rendering.\n",
+ w->name);
+ i915_gem_set_wedged(w->i915);
+}
+
+static void __init_wedge(struct wedge_me *w,
+ struct drm_i915_private *i915,
+ long timeout,
+ const char *name)
+{
+ w->i915 = i915;
+ w->name = name;
+
+ INIT_DELAYED_WORK_ONSTACK(&w->work, wedge_me);
+ schedule_delayed_work(&w->work, timeout);
+}
+
+static void __fini_wedge(struct wedge_me *w)
+{
+ cancel_delayed_work_sync(&w->work);
+ destroy_delayed_work_on_stack(&w->work);
+ w->i915 = NULL;
+}
+
+#define i915_wedge_on_timeout(W, DEV, TIMEOUT) \
+ for (__init_wedge((W), (DEV), (TIMEOUT), __func__); \
+ (W)->i915; \
+ __fini_wedge((W)))
+
/**
* i915_reset_device - do process context error handling work
* @dev_priv: i915 device private
@@ -2612,36 +2652,36 @@ static void i915_reset_device(struct drm_i915_private *dev_priv)
char *error_event[] = { I915_ERROR_UEVENT "=1", NULL };
char *reset_event[] = { I915_RESET_UEVENT "=1", NULL };
char *reset_done_event[] = { I915_ERROR_UEVENT "=0", NULL };
+ struct wedge_me w;
kobject_uevent_env(kobj, KOBJ_CHANGE, error_event);
DRM_DEBUG_DRIVER("resetting chip\n");
kobject_uevent_env(kobj, KOBJ_CHANGE, reset_event);
- intel_prepare_reset(dev_priv);
+ /* Use a watchdog to ensure that our reset completes */
+ i915_wedge_on_timeout(&w, dev_priv, 5*HZ) {
+ intel_prepare_reset(dev_priv);
- set_bit(I915_RESET_HANDOFF, &dev_priv->gpu_error.flags);
- wake_up_all(&dev_priv->gpu_error.wait_queue);
+ /* Signal that locked waiters should reset the GPU */
+ set_bit(I915_RESET_HANDOFF, &dev_priv->gpu_error.flags);
+ wake_up_all(&dev_priv->gpu_error.wait_queue);
- do {
- /*
- * All state reset _must_ be completed before we update the
- * reset counter, for otherwise waiters might miss the reset
- * pending state and not properly drop locks, resulting in
- * deadlocks with the reset work.
+ /* Wait for anyone holding the lock to wakeup, without
+ * blocking indefinitely on struct_mutex.
*/
- if (mutex_trylock(&dev_priv->drm.struct_mutex)) {
- i915_reset(dev_priv);
- mutex_unlock(&dev_priv->drm.struct_mutex);
- }
-
- /* We need to wait for anyone holding the lock to wakeup */
- } while (wait_on_bit_timeout(&dev_priv->gpu_error.flags,
- I915_RESET_HANDOFF,
- TASK_UNINTERRUPTIBLE,
- HZ));
+ do {
+ if (mutex_trylock(&dev_priv->drm.struct_mutex)) {
+ i915_reset(dev_priv);
+ mutex_unlock(&dev_priv->drm.struct_mutex);
+ }
+ } while (wait_on_bit_timeout(&dev_priv->gpu_error.flags,
+ I915_RESET_HANDOFF,
+ TASK_UNINTERRUPTIBLE,
+ 1));
- intel_finish_reset(dev_priv);
+ intel_finish_reset(dev_priv);
+ }
if (!test_bit(I915_WEDGED, &dev_priv->gpu_error.flags))
kobject_uevent_env(kobj,