Diffstat (limited to 'drivers/gpu/drm/i915/gt')
35 files changed, 20122 insertions, 0 deletions
diff --git a/drivers/gpu/drm/i915/gt/Makefile b/drivers/gpu/drm/i915/gt/Makefile new file mode 100644 index 000000000000..1c75b5c9790c --- /dev/null +++ b/drivers/gpu/drm/i915/gt/Makefile @@ -0,0 +1,2 @@ +# Extra header tests +include $(src)/Makefile.header-test diff --git a/drivers/gpu/drm/i915/gt/Makefile.header-test b/drivers/gpu/drm/i915/gt/Makefile.header-test new file mode 100644 index 000000000000..61e06cbb4b32 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/Makefile.header-test @@ -0,0 +1,16 @@ +# SPDX-License-Identifier: MIT +# Copyright © 2019 Intel Corporation + +# Test the headers are compilable as standalone units +header_test := $(notdir $(wildcard $(src)/*.h)) + +quiet_cmd_header_test = HDRTEST $@ + cmd_header_test = echo "\#include \"$(<F)\"" > $@ + +header_test_%.c: %.h + $(call cmd,header_test) + +extra-$(CONFIG_DRM_I915_WERROR) += \ + $(foreach h,$(header_test),$(patsubst %.h,header_test_%.o,$(h))) + +clean-files += $(foreach h,$(header_test),$(patsubst %.h,header_test_%.c,$(h))) diff --git a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c new file mode 100644 index 000000000000..c092bdf5f0bf --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c @@ -0,0 +1,392 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ * + */ + +#include <linux/kthread.h> +#include <trace/events/dma_fence.h> +#include <uapi/linux/sched/types.h> + +#include "i915_drv.h" + +static void irq_enable(struct intel_engine_cs *engine) +{ + if (!engine->irq_enable) + return; + + /* Caller disables interrupts */ + spin_lock(&engine->i915->irq_lock); + engine->irq_enable(engine); + spin_unlock(&engine->i915->irq_lock); +} + +static void irq_disable(struct intel_engine_cs *engine) +{ + if (!engine->irq_disable) + return; + + /* Caller disables interrupts */ + spin_lock(&engine->i915->irq_lock); + engine->irq_disable(engine); + spin_unlock(&engine->i915->irq_lock); +} + +static void __intel_breadcrumbs_disarm_irq(struct intel_breadcrumbs *b) +{ + lockdep_assert_held(&b->irq_lock); + + GEM_BUG_ON(!b->irq_enabled); + if (!--b->irq_enabled) + irq_disable(container_of(b, + struct intel_engine_cs, + breadcrumbs)); + + b->irq_armed = false; +} + +void intel_engine_disarm_breadcrumbs(struct intel_engine_cs *engine) +{ + struct intel_breadcrumbs *b = &engine->breadcrumbs; + + if (!b->irq_armed) + return; + + spin_lock_irq(&b->irq_lock); + if (b->irq_armed) + __intel_breadcrumbs_disarm_irq(b); + spin_unlock_irq(&b->irq_lock); +} + +static inline bool __request_completed(const struct i915_request *rq) +{ + return i915_seqno_passed(__hwsp_seqno(rq), rq->fence.seqno); +} + +__maybe_unused static bool +check_signal_order(struct intel_context *ce, struct i915_request *rq) +{ + if (!list_is_last(&rq->signal_link, &ce->signals) && + i915_seqno_passed(rq->fence.seqno, + list_next_entry(rq, signal_link)->fence.seqno)) + return false; + + if (!list_is_first(&rq->signal_link, &ce->signals) && + i915_seqno_passed(list_prev_entry(rq, signal_link)->fence.seqno, + rq->fence.seqno)) + return false; + + return true; +} + +static bool +__dma_fence_signal(struct dma_fence *fence) +{ + return !test_and_set_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags); +} + +static void +__dma_fence_signal__timestamp(struct dma_fence *fence, ktime_t timestamp) +{ + fence->timestamp = timestamp; + set_bit(DMA_FENCE_FLAG_TIMESTAMP_BIT, &fence->flags); + trace_dma_fence_signaled(fence); +} + +static void +__dma_fence_signal__notify(struct dma_fence *fence) +{ + struct dma_fence_cb *cur, *tmp; + + lockdep_assert_held(fence->lock); + lockdep_assert_irqs_disabled(); + + list_for_each_entry_safe(cur, tmp, &fence->cb_list, node) { + INIT_LIST_HEAD(&cur->node); + cur->func(fence, cur); + } + INIT_LIST_HEAD(&fence->cb_list); +} + +void intel_engine_breadcrumbs_irq(struct intel_engine_cs *engine) +{ + struct intel_breadcrumbs *b = &engine->breadcrumbs; + const ktime_t timestamp = ktime_get(); + struct intel_context *ce, *cn; + struct list_head *pos, *next; + LIST_HEAD(signal); + + spin_lock(&b->irq_lock); + + if (b->irq_armed && list_empty(&b->signalers)) + __intel_breadcrumbs_disarm_irq(b); + + list_for_each_entry_safe(ce, cn, &b->signalers, signal_link) { + GEM_BUG_ON(list_empty(&ce->signals)); + + list_for_each_safe(pos, next, &ce->signals) { + struct i915_request *rq = + list_entry(pos, typeof(*rq), signal_link); + + GEM_BUG_ON(!check_signal_order(ce, rq)); + + if (!__request_completed(rq)) + break; + + GEM_BUG_ON(!test_bit(I915_FENCE_FLAG_SIGNAL, + &rq->fence.flags)); + clear_bit(I915_FENCE_FLAG_SIGNAL, &rq->fence.flags); + + if (!__dma_fence_signal(&rq->fence)) + continue; + + /* + * Queue for execution after dropping the signaling + * spinlock as the callback chain may end up adding + * more signalers to the same context or engine. 
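[Editor's note] __request_completed() above compares the breadcrumb value read from the HWSP against rq->fence.seqno via i915_seqno_passed(), which is declared elsewhere in i915 and not part of this diff. The standalone sketch below shows the standard wraparound-safe comparison idiom that helper is built on; names and values are illustrative only.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/*
 * Wraparound-safe "a is at or after b" test for 32-bit seqnos: the signed
 * difference stays positive as long as the two values are less than 2^31
 * apart, even across the 0xffffffff -> 0 rollover.
 */
static bool seqno_passed(uint32_t a, uint32_t b)
{
        return (int32_t)(a - b) >= 0;
}

int main(void)
{
        assert(seqno_passed(5, 5));                   /* equal counts as passed */
        assert(seqno_passed(6, 5));
        assert(!seqno_passed(5, 6));
        assert(seqno_passed(0x00000002, 0xfffffffe)); /* across wraparound */
        return 0;
}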
+ */ + i915_request_get(rq); + list_add_tail(&rq->signal_link, &signal); + } + + /* + * We process the list deletion in bulk, only using a list_add + * (not list_move) above but keeping the status of + * rq->signal_link known with the I915_FENCE_FLAG_SIGNAL bit. + */ + if (!list_is_first(pos, &ce->signals)) { + /* Advance the list to the first incomplete request */ + __list_del_many(&ce->signals, pos); + if (&ce->signals == pos) /* now empty */ + list_del_init(&ce->signal_link); + } + } + + spin_unlock(&b->irq_lock); + + list_for_each_safe(pos, next, &signal) { + struct i915_request *rq = + list_entry(pos, typeof(*rq), signal_link); + + __dma_fence_signal__timestamp(&rq->fence, timestamp); + + spin_lock(&rq->lock); + __dma_fence_signal__notify(&rq->fence); + spin_unlock(&rq->lock); + + i915_request_put(rq); + } +} + +void intel_engine_signal_breadcrumbs(struct intel_engine_cs *engine) +{ + local_irq_disable(); + intel_engine_breadcrumbs_irq(engine); + local_irq_enable(); +} + +static void signal_irq_work(struct irq_work *work) +{ + struct intel_engine_cs *engine = + container_of(work, typeof(*engine), breadcrumbs.irq_work); + + intel_engine_breadcrumbs_irq(engine); +} + +void intel_engine_pin_breadcrumbs_irq(struct intel_engine_cs *engine) +{ + struct intel_breadcrumbs *b = &engine->breadcrumbs; + + spin_lock_irq(&b->irq_lock); + if (!b->irq_enabled++) + irq_enable(engine); + GEM_BUG_ON(!b->irq_enabled); /* no overflow! */ + spin_unlock_irq(&b->irq_lock); +} + +void intel_engine_unpin_breadcrumbs_irq(struct intel_engine_cs *engine) +{ + struct intel_breadcrumbs *b = &engine->breadcrumbs; + + spin_lock_irq(&b->irq_lock); + GEM_BUG_ON(!b->irq_enabled); /* no underflow! */ + if (!--b->irq_enabled) + irq_disable(engine); + spin_unlock_irq(&b->irq_lock); +} + +static void __intel_breadcrumbs_arm_irq(struct intel_breadcrumbs *b) +{ + struct intel_engine_cs *engine = + container_of(b, struct intel_engine_cs, breadcrumbs); + + lockdep_assert_held(&b->irq_lock); + if (b->irq_armed) + return; + + /* + * The breadcrumb irq will be disarmed on the interrupt after the + * waiters are signaled. This gives us a single interrupt window in + * which we can add a new waiter and avoid the cost of re-enabling + * the irq. + */ + b->irq_armed = true; + + /* + * Since we are waiting on a request, the GPU should be busy + * and should have its own rpm reference. This is tracked + * by i915->gt.awake, we can forgo holding our own wakref + * for the interrupt as before i915->gt.awake is released (when + * the driver is idle) we disarm the breadcrumbs. 
+ */ + + if (!b->irq_enabled++) + irq_enable(engine); +} + +void intel_engine_init_breadcrumbs(struct intel_engine_cs *engine) +{ + struct intel_breadcrumbs *b = &engine->breadcrumbs; + + spin_lock_init(&b->irq_lock); + INIT_LIST_HEAD(&b->signalers); + + init_irq_work(&b->irq_work, signal_irq_work); +} + +void intel_engine_reset_breadcrumbs(struct intel_engine_cs *engine) +{ + struct intel_breadcrumbs *b = &engine->breadcrumbs; + unsigned long flags; + + spin_lock_irqsave(&b->irq_lock, flags); + + if (b->irq_enabled) + irq_enable(engine); + else + irq_disable(engine); + + spin_unlock_irqrestore(&b->irq_lock, flags); +} + +void intel_engine_fini_breadcrumbs(struct intel_engine_cs *engine) +{ +} + +bool i915_request_enable_breadcrumb(struct i915_request *rq) +{ + lockdep_assert_held(&rq->lock); + lockdep_assert_irqs_disabled(); + + if (test_bit(I915_FENCE_FLAG_ACTIVE, &rq->fence.flags)) { + struct intel_breadcrumbs *b = &rq->engine->breadcrumbs; + struct intel_context *ce = rq->hw_context; + struct list_head *pos; + + spin_lock(&b->irq_lock); + GEM_BUG_ON(test_bit(I915_FENCE_FLAG_SIGNAL, &rq->fence.flags)); + + __intel_breadcrumbs_arm_irq(b); + + /* + * We keep the seqno in retirement order, so we can break + * inside intel_engine_breadcrumbs_irq as soon as we've passed + * the last completed request (or seen a request that hasn't + * event started). We could iterate the timeline->requests list, + * but keeping a separate signalers_list has the advantage of + * hopefully being much smaller than the full list and so + * provides faster iteration and detection when there are no + * more interrupts required for this context. + * + * We typically expect to add new signalers in order, so we + * start looking for our insertion point from the tail of + * the list. + */ + list_for_each_prev(pos, &ce->signals) { + struct i915_request *it = + list_entry(pos, typeof(*it), signal_link); + + if (i915_seqno_passed(rq->fence.seqno, it->fence.seqno)) + break; + } + list_add(&rq->signal_link, pos); + if (pos == &ce->signals) /* catch transitions from empty list */ + list_move_tail(&ce->signal_link, &b->signalers); + GEM_BUG_ON(!check_signal_order(ce, rq)); + + set_bit(I915_FENCE_FLAG_SIGNAL, &rq->fence.flags); + spin_unlock(&b->irq_lock); + } + + return !__request_completed(rq); +} + +void i915_request_cancel_breadcrumb(struct i915_request *rq) +{ + struct intel_breadcrumbs *b = &rq->engine->breadcrumbs; + + lockdep_assert_held(&rq->lock); + lockdep_assert_irqs_disabled(); + + /* + * We must wait for b->irq_lock so that we know the interrupt handler + * has released its reference to the intel_context and has completed + * the DMA_FENCE_FLAG_SIGNALED_BIT/I915_FENCE_FLAG_SIGNAL dance (if + * required). 
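[Editor's note] i915_request_enable_breadcrumb() above keeps ce->signals sorted in retirement order and, because new signalers usually arrive already in order, searches for the insertion point backwards from the tail with list_for_each_prev(). A minimal array-based sketch of that scan (purely illustrative; the driver uses the kernel's intrusive list_head, not an array):

#include <assert.h>
#include <stdint.h>
#include <string.h>

/*
 * Walk backwards from the tail until we find an entry the new seqno has
 * already passed; when requests are appended in order, the very first
 * comparison succeeds and the common case is O(1).
 */
static size_t insert_pos(const uint32_t *seqnos, size_t count, uint32_t seqno)
{
        size_t i = count;

        while (i > 0 && (int32_t)(seqno - seqnos[i - 1]) < 0)
                i--;

        return i;
}

int main(void)
{
        uint32_t signals[8] = { 10, 20, 30 };
        size_t count = 3;
        uint32_t seqno = 25;
        size_t pos = insert_pos(signals, count, seqno);

        memmove(&signals[pos + 1], &signals[pos],
                (count - pos) * sizeof(*signals));
        signals[pos] = seqno;
        count++;

        assert(signals[2] == 25 && signals[3] == 30);

        /* In-order append hits the fast path: nothing needs to move. */
        assert(insert_pos(signals, count, 40) == count);
        return 0;
}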
+ */ + spin_lock(&b->irq_lock); + if (test_bit(I915_FENCE_FLAG_SIGNAL, &rq->fence.flags)) { + struct intel_context *ce = rq->hw_context; + + list_del(&rq->signal_link); + if (list_empty(&ce->signals)) + list_del_init(&ce->signal_link); + + clear_bit(I915_FENCE_FLAG_SIGNAL, &rq->fence.flags); + } + spin_unlock(&b->irq_lock); +} + +void intel_engine_print_breadcrumbs(struct intel_engine_cs *engine, + struct drm_printer *p) +{ + struct intel_breadcrumbs *b = &engine->breadcrumbs; + struct intel_context *ce; + struct i915_request *rq; + + if (list_empty(&b->signalers)) + return; + + drm_printf(p, "Signals:\n"); + + spin_lock_irq(&b->irq_lock); + list_for_each_entry(ce, &b->signalers, signal_link) { + list_for_each_entry(rq, &ce->signals, signal_link) { + drm_printf(p, "\t[%llx:%llx%s] @ %dms\n", + rq->fence.context, rq->fence.seqno, + i915_request_completed(rq) ? "!" : + i915_request_started(rq) ? "*" : + "", + jiffies_to_msecs(jiffies - rq->emitted_jiffies)); + } + } + spin_unlock_irq(&b->irq_lock); +} diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c new file mode 100644 index 000000000000..2c454f227c2e --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_context.c @@ -0,0 +1,241 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2019 Intel Corporation + */ + +#include "gem/i915_gem_context.h" +#include "gem/i915_gem_pm.h" + +#include "i915_drv.h" +#include "i915_globals.h" + +#include "intel_context.h" +#include "intel_engine.h" +#include "intel_engine_pm.h" + +static struct i915_global_context { + struct i915_global base; + struct kmem_cache *slab_ce; +} global; + +static struct intel_context *intel_context_alloc(void) +{ + return kmem_cache_zalloc(global.slab_ce, GFP_KERNEL); +} + +void intel_context_free(struct intel_context *ce) +{ + kmem_cache_free(global.slab_ce, ce); +} + +struct intel_context * +intel_context_create(struct i915_gem_context *ctx, + struct intel_engine_cs *engine) +{ + struct intel_context *ce; + + ce = intel_context_alloc(); + if (!ce) + return ERR_PTR(-ENOMEM); + + intel_context_init(ce, ctx, engine); + return ce; +} + +int __intel_context_do_pin(struct intel_context *ce) +{ + int err; + + if (mutex_lock_interruptible(&ce->pin_mutex)) + return -EINTR; + + if (likely(!atomic_read(&ce->pin_count))) { + intel_wakeref_t wakeref; + + err = 0; + with_intel_runtime_pm(&ce->engine->i915->runtime_pm, wakeref) + err = ce->ops->pin(ce); + if (err) + goto err; + + i915_gem_context_get(ce->gem_context); /* for ctx->ppgtt */ + + smp_mb__before_atomic(); /* flush pin before it is visible */ + } + + atomic_inc(&ce->pin_count); + GEM_BUG_ON(!intel_context_is_pinned(ce)); /* no overflow! 
*/ + + mutex_unlock(&ce->pin_mutex); + return 0; + +err: + mutex_unlock(&ce->pin_mutex); + return err; +} + +void intel_context_unpin(struct intel_context *ce) +{ + if (likely(atomic_add_unless(&ce->pin_count, -1, 1))) + return; + + /* We may be called from inside intel_context_pin() to evict another */ + intel_context_get(ce); + mutex_lock_nested(&ce->pin_mutex, SINGLE_DEPTH_NESTING); + + if (likely(atomic_dec_and_test(&ce->pin_count))) { + ce->ops->unpin(ce); + + i915_gem_context_put(ce->gem_context); + intel_context_active_release(ce); + } + + mutex_unlock(&ce->pin_mutex); + intel_context_put(ce); +} + +static int __context_pin_state(struct i915_vma *vma, unsigned long flags) +{ + int err; + + err = i915_vma_pin(vma, 0, 0, flags | PIN_GLOBAL); + if (err) + return err; + + /* + * And mark it as a globally pinned object to let the shrinker know + * it cannot reclaim the object until we release it. + */ + vma->obj->pin_global++; + vma->obj->mm.dirty = true; + + return 0; +} + +static void __context_unpin_state(struct i915_vma *vma) +{ + vma->obj->pin_global--; + __i915_vma_unpin(vma); +} + +static void intel_context_retire(struct i915_active *active) +{ + struct intel_context *ce = container_of(active, typeof(*ce), active); + + if (ce->state) + __context_unpin_state(ce->state); + + intel_context_put(ce); +} + +void +intel_context_init(struct intel_context *ce, + struct i915_gem_context *ctx, + struct intel_engine_cs *engine) +{ + GEM_BUG_ON(!engine->cops); + + kref_init(&ce->ref); + + ce->gem_context = ctx; + ce->engine = engine; + ce->ops = engine->cops; + ce->sseu = engine->sseu; + + INIT_LIST_HEAD(&ce->signal_link); + INIT_LIST_HEAD(&ce->signals); + + mutex_init(&ce->pin_mutex); + + i915_active_init(ctx->i915, &ce->active, intel_context_retire); +} + +int intel_context_active_acquire(struct intel_context *ce, unsigned long flags) +{ + int err; + + if (!i915_active_acquire(&ce->active)) + return 0; + + intel_context_get(ce); + + if (!ce->state) + return 0; + + err = __context_pin_state(ce->state, flags); + if (err) { + i915_active_cancel(&ce->active); + intel_context_put(ce); + return err; + } + + /* Preallocate tracking nodes */ + if (!i915_gem_context_is_kernel(ce->gem_context)) { + err = i915_active_acquire_preallocate_barrier(&ce->active, + ce->engine); + if (err) { + i915_active_release(&ce->active); + return err; + } + } + + return 0; +} + +void intel_context_active_release(struct intel_context *ce) +{ + /* Nodes preallocated in intel_context_active() */ + i915_active_acquire_barrier(&ce->active); + i915_active_release(&ce->active); +} + +static void i915_global_context_shrink(void) +{ + kmem_cache_shrink(global.slab_ce); +} + +static void i915_global_context_exit(void) +{ + kmem_cache_destroy(global.slab_ce); +} + +static struct i915_global_context global = { { + .shrink = i915_global_context_shrink, + .exit = i915_global_context_exit, +} }; + +int __init i915_global_context_init(void) +{ + global.slab_ce = KMEM_CACHE(intel_context, SLAB_HWCACHE_ALIGN); + if (!global.slab_ce) + return -ENOMEM; + + i915_global_register(&global.base); + return 0; +} + +void intel_context_enter_engine(struct intel_context *ce) +{ + intel_engine_pm_get(ce->engine); +} + +void intel_context_exit_engine(struct intel_context *ce) +{ + intel_engine_pm_put(ce->engine); +} + +struct i915_request *intel_context_create_request(struct intel_context *ce) +{ + struct i915_request *rq; + int err; + + err = intel_context_pin(ce); + if (unlikely(err)) + return ERR_PTR(err); + + rq = i915_request_create(ce); + 
intel_context_unpin(ce); + + return rq; +} diff --git a/drivers/gpu/drm/i915/gt/intel_context.h b/drivers/gpu/drm/i915/gt/intel_context.h new file mode 100644 index 000000000000..a47275bc4f01 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_context.h @@ -0,0 +1,134 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2019 Intel Corporation + */ + +#ifndef __INTEL_CONTEXT_H__ +#define __INTEL_CONTEXT_H__ + +#include <linux/lockdep.h> + +#include "intel_context_types.h" +#include "intel_engine_types.h" + +void intel_context_init(struct intel_context *ce, + struct i915_gem_context *ctx, + struct intel_engine_cs *engine); + +struct intel_context * +intel_context_create(struct i915_gem_context *ctx, + struct intel_engine_cs *engine); + +void intel_context_free(struct intel_context *ce); + +/** + * intel_context_lock_pinned - Stablises the 'pinned' status of the HW context + * @ce - the context + * + * Acquire a lock on the pinned status of the HW context, such that the context + * can neither be bound to the GPU or unbound whilst the lock is held, i.e. + * intel_context_is_pinned() remains stable. + */ +static inline int intel_context_lock_pinned(struct intel_context *ce) + __acquires(ce->pin_mutex) +{ + return mutex_lock_interruptible(&ce->pin_mutex); +} + +/** + * intel_context_is_pinned - Reports the 'pinned' status + * @ce - the context + * + * While in use by the GPU, the context, along with its ring and page + * tables is pinned into memory and the GTT. + * + * Returns: true if the context is currently pinned for use by the GPU. + */ +static inline bool +intel_context_is_pinned(struct intel_context *ce) +{ + return atomic_read(&ce->pin_count); +} + +/** + * intel_context_unlock_pinned - Releases the earlier locking of 'pinned' status + * @ce - the context + * + * Releases the lock earlier acquired by intel_context_unlock_pinned(). 
+ */ +static inline void intel_context_unlock_pinned(struct intel_context *ce) + __releases(ce->pin_mutex) +{ + mutex_unlock(&ce->pin_mutex); +} + +int __intel_context_do_pin(struct intel_context *ce); + +static inline int intel_context_pin(struct intel_context *ce) +{ + if (likely(atomic_inc_not_zero(&ce->pin_count))) + return 0; + + return __intel_context_do_pin(ce); +} + +static inline void __intel_context_pin(struct intel_context *ce) +{ + GEM_BUG_ON(!intel_context_is_pinned(ce)); + atomic_inc(&ce->pin_count); +} + +void intel_context_unpin(struct intel_context *ce); + +void intel_context_enter_engine(struct intel_context *ce); +void intel_context_exit_engine(struct intel_context *ce); + +static inline void intel_context_enter(struct intel_context *ce) +{ + if (!ce->active_count++) + ce->ops->enter(ce); +} + +static inline void intel_context_mark_active(struct intel_context *ce) +{ + ++ce->active_count; +} + +static inline void intel_context_exit(struct intel_context *ce) +{ + GEM_BUG_ON(!ce->active_count); + if (!--ce->active_count) + ce->ops->exit(ce); +} + +int intel_context_active_acquire(struct intel_context *ce, unsigned long flags); +void intel_context_active_release(struct intel_context *ce); + +static inline struct intel_context *intel_context_get(struct intel_context *ce) +{ + kref_get(&ce->ref); + return ce; +} + +static inline void intel_context_put(struct intel_context *ce) +{ + kref_put(&ce->ref, ce->ops->destroy); +} + +static inline int __must_check +intel_context_timeline_lock(struct intel_context *ce) + __acquires(&ce->ring->timeline->mutex) +{ + return mutex_lock_interruptible(&ce->ring->timeline->mutex); +} + +static inline void intel_context_timeline_unlock(struct intel_context *ce) + __releases(&ce->ring->timeline->mutex) +{ + mutex_unlock(&ce->ring->timeline->mutex); +} + +struct i915_request *intel_context_create_request(struct intel_context *ce); + +#endif /* __INTEL_CONTEXT_H__ */ diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h b/drivers/gpu/drm/i915/gt/intel_context_types.h new file mode 100644 index 000000000000..08049ee91cee --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_context_types.h @@ -0,0 +1,68 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2019 Intel Corporation + */ + +#ifndef __INTEL_CONTEXT_TYPES__ +#define __INTEL_CONTEXT_TYPES__ + +#include <linux/kref.h> +#include <linux/list.h> +#include <linux/mutex.h> +#include <linux/types.h> + +#include "i915_active_types.h" +#include "intel_engine_types.h" +#include "intel_sseu.h" + +struct i915_gem_context; +struct i915_vma; +struct intel_context; +struct intel_ring; + +struct intel_context_ops { + int (*pin)(struct intel_context *ce); + void (*unpin)(struct intel_context *ce); + + void (*enter)(struct intel_context *ce); + void (*exit)(struct intel_context *ce); + + void (*reset)(struct intel_context *ce); + void (*destroy)(struct kref *kref); +}; + +struct intel_context { + struct kref ref; + + struct i915_gem_context *gem_context; + struct intel_engine_cs *engine; + struct intel_engine_cs *inflight; + + struct list_head signal_link; + struct list_head signals; + + struct i915_vma *state; + struct intel_ring *ring; + + u32 *lrc_reg_state; + u64 lrc_desc; + + unsigned int active_count; /* notionally protected by timeline->mutex */ + + atomic_t pin_count; + struct mutex pin_mutex; /* guards pinning and associated on-gpuing */ + + /** + * active: Active tracker for the rq activity (inc. external) on this + * intel_context object. 
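[Editor's note] intel_context_pin() above is split into a lock-free fast path (atomic_inc_not_zero on ce->pin_count) and __intel_context_do_pin(), which takes pin_mutex and performs the expensive backend pin only on the 0 -> 1 transition. A standalone sketch of that pattern using C11 atomics and a pthread mutex; the unpin side is simplified to always take the mutex, whereas the driver additionally short-circuits with atomic_add_unless():

#include <assert.h>
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

struct ctx {
        atomic_int pin_count;
        pthread_mutex_t pin_mutex;
        bool hw_pinned;        /* stands in for ce->ops->pin() having run */
};

/* atomic_inc_not_zero(): bump the count only if setup was already done. */
static bool inc_not_zero(atomic_int *v)
{
        int old = atomic_load(v);

        while (old != 0) {
                if (atomic_compare_exchange_weak(v, &old, old + 1))
                        return true;
        }
        return false;
}

static int ctx_pin(struct ctx *ce)
{
        if (inc_not_zero(&ce->pin_count))       /* fast path, lock free */
                return 0;

        pthread_mutex_lock(&ce->pin_mutex);     /* slow path */
        if (atomic_load(&ce->pin_count) == 0)
                ce->hw_pinned = true;           /* "pin" backing state once */
        atomic_fetch_add(&ce->pin_count, 1);
        pthread_mutex_unlock(&ce->pin_mutex);
        return 0;
}

static void ctx_unpin(struct ctx *ce)
{
        pthread_mutex_lock(&ce->pin_mutex);
        if (atomic_fetch_sub(&ce->pin_count, 1) == 1)
                ce->hw_pinned = false;          /* last unpin releases state */
        pthread_mutex_unlock(&ce->pin_mutex);
}

int main(void)
{
        struct ctx ce = { .pin_mutex = PTHREAD_MUTEX_INITIALIZER };

        ctx_pin(&ce);           /* first pin takes the slow path */
        ctx_pin(&ce);           /* second pin is the lock-free fast path */
        ctx_unpin(&ce);
        assert(ce.hw_pinned);
        ctx_unpin(&ce);
        assert(!ce.hw_pinned);
        return 0;
}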
+ */ + struct i915_active active; + + const struct intel_context_ops *ops; + + /** sseu: Control eu/slice partitioning */ + struct intel_sseu sseu; +}; + +#endif /* __INTEL_CONTEXT_TYPES__ */ diff --git a/drivers/gpu/drm/i915/gt/intel_engine.h b/drivers/gpu/drm/i915/gt/intel_engine.h new file mode 100644 index 000000000000..2f1c6871ee95 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_engine.h @@ -0,0 +1,574 @@ +/* SPDX-License-Identifier: MIT */ +#ifndef _INTEL_RINGBUFFER_H_ +#define _INTEL_RINGBUFFER_H_ + +#include <drm/drm_util.h> + +#include <linux/hashtable.h> +#include <linux/irq_work.h> +#include <linux/random.h> +#include <linux/seqlock.h> + +#include "i915_gem_batch_pool.h" +#include "i915_pmu.h" +#include "i915_reg.h" +#include "i915_request.h" +#include "i915_selftest.h" +#include "i915_timeline.h" +#include "intel_engine_types.h" +#include "intel_gpu_commands.h" +#include "intel_workarounds.h" + +struct drm_printer; + +/* Early gen2 devices have a cacheline of just 32 bytes, using 64 is overkill, + * but keeps the logic simple. Indeed, the whole purpose of this macro is just + * to give some inclination as to some of the magic values used in the various + * workarounds! + */ +#define CACHELINE_BYTES 64 +#define CACHELINE_DWORDS (CACHELINE_BYTES / sizeof(u32)) + +/* + * The register defines to be used with the following macros need to accept a + * base param, e.g: + * + * REG_FOO(base) _MMIO((base) + <relative offset>) + * ENGINE_READ(engine, REG_FOO); + * + * register arrays are to be defined and accessed as follows: + * + * REG_BAR(base, i) _MMIO((base) + <relative offset> + (i) * <shift>) + * ENGINE_READ_IDX(engine, REG_BAR, i) + */ + +#define __ENGINE_REG_OP(op__, engine__, ...) \ + intel_uncore_##op__((engine__)->uncore, __VA_ARGS__) + +#define __ENGINE_READ_OP(op__, engine__, reg__) \ + __ENGINE_REG_OP(op__, (engine__), reg__((engine__)->mmio_base)) + +#define ENGINE_READ16(...) __ENGINE_READ_OP(read16, __VA_ARGS__) +#define ENGINE_READ(...) __ENGINE_READ_OP(read, __VA_ARGS__) +#define ENGINE_READ_FW(...) __ENGINE_READ_OP(read_fw, __VA_ARGS__) +#define ENGINE_POSTING_READ(...) __ENGINE_READ_OP(posting_read, __VA_ARGS__) +#define ENGINE_POSTING_READ16(...) __ENGINE_READ_OP(posting_read16, __VA_ARGS__) + +#define ENGINE_READ64(engine__, lower_reg__, upper_reg__) \ + __ENGINE_REG_OP(read64_2x32, (engine__), \ + lower_reg__((engine__)->mmio_base), \ + upper_reg__((engine__)->mmio_base)) + +#define ENGINE_READ_IDX(engine__, reg__, idx__) \ + __ENGINE_REG_OP(read, (engine__), reg__((engine__)->mmio_base, (idx__))) + +#define __ENGINE_WRITE_OP(op__, engine__, reg__, val__) \ + __ENGINE_REG_OP(op__, (engine__), reg__((engine__)->mmio_base), (val__)) + +#define ENGINE_WRITE16(...) __ENGINE_WRITE_OP(write16, __VA_ARGS__) +#define ENGINE_WRITE(...) __ENGINE_WRITE_OP(write, __VA_ARGS__) +#define ENGINE_WRITE_FW(...) 
__ENGINE_WRITE_OP(write_fw, __VA_ARGS__) + +#define GEN6_RING_FAULT_REG_READ(engine__) \ + intel_uncore_read((engine__)->uncore, RING_FAULT_REG(engine__)) + +#define GEN6_RING_FAULT_REG_POSTING_READ(engine__) \ + intel_uncore_posting_read((engine__)->uncore, RING_FAULT_REG(engine__)) + +#define GEN6_RING_FAULT_REG_RMW(engine__, clear__, set__) \ +({ \ + u32 __val; \ +\ + __val = intel_uncore_read((engine__)->uncore, \ + RING_FAULT_REG(engine__)); \ + __val &= ~(clear__); \ + __val |= (set__); \ + intel_uncore_write((engine__)->uncore, RING_FAULT_REG(engine__), \ + __val); \ +}) + +/* seqno size is actually only a uint32, but since we plan to use MI_FLUSH_DW to + * do the writes, and that must have qw aligned offsets, simply pretend it's 8b. + */ +enum intel_engine_hangcheck_action { + ENGINE_IDLE = 0, + ENGINE_WAIT, + ENGINE_ACTIVE_SEQNO, + ENGINE_ACTIVE_HEAD, + ENGINE_ACTIVE_SUBUNITS, + ENGINE_WAIT_KICK, + ENGINE_DEAD, +}; + +static inline const char * +hangcheck_action_to_str(const enum intel_engine_hangcheck_action a) +{ + switch (a) { + case ENGINE_IDLE: + return "idle"; + case ENGINE_WAIT: + return "wait"; + case ENGINE_ACTIVE_SEQNO: + return "active seqno"; + case ENGINE_ACTIVE_HEAD: + return "active head"; + case ENGINE_ACTIVE_SUBUNITS: + return "active subunits"; + case ENGINE_WAIT_KICK: + return "wait kick"; + case ENGINE_DEAD: + return "dead"; + } + + return "unknown"; +} + +void intel_engines_set_scheduler_caps(struct drm_i915_private *i915); + +static inline void +execlists_set_active(struct intel_engine_execlists *execlists, + unsigned int bit) +{ + __set_bit(bit, (unsigned long *)&execlists->active); +} + +static inline bool +execlists_set_active_once(struct intel_engine_execlists *execlists, + unsigned int bit) +{ + return !__test_and_set_bit(bit, (unsigned long *)&execlists->active); +} + +static inline void +execlists_clear_active(struct intel_engine_execlists *execlists, + unsigned int bit) +{ + __clear_bit(bit, (unsigned long *)&execlists->active); +} + +static inline void +execlists_clear_all_active(struct intel_engine_execlists *execlists) +{ + execlists->active = 0; +} + +static inline bool +execlists_is_active(const struct intel_engine_execlists *execlists, + unsigned int bit) +{ + return test_bit(bit, (unsigned long *)&execlists->active); +} + +void execlists_user_begin(struct intel_engine_execlists *execlists, + const struct execlist_port *port); +void execlists_user_end(struct intel_engine_execlists *execlists); + +void +execlists_cancel_port_requests(struct intel_engine_execlists * const execlists); + +struct i915_request * +execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists); + +static inline unsigned int +execlists_num_ports(const struct intel_engine_execlists * const execlists) +{ + return execlists->port_mask + 1; +} + +static inline struct execlist_port * +execlists_port_complete(struct intel_engine_execlists * const execlists, + struct execlist_port * const port) +{ + const unsigned int m = execlists->port_mask; + + GEM_BUG_ON(port_index(port, execlists) != 0); + GEM_BUG_ON(!execlists_is_active(execlists, EXECLISTS_ACTIVE_USER)); + + memmove(port, port + 1, m * sizeof(struct execlist_port)); + memset(port + m, 0, sizeof(struct execlist_port)); + + return port; +} + +static inline u32 +intel_read_status_page(const struct intel_engine_cs *engine, int reg) +{ + /* Ensure that the compiler doesn't optimize away the load. 
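[Editor's note] The ENGINE_READ()/ENGINE_WRITE() family above relies on register defines that take a base parameter, so one macro serves every engine by substituting engine->mmio_base. A self-contained sketch of that two-level expansion with stubbed-out types; the register offset and base address used here are illustrative stand-ins, not the driver's definitions:

#include <stdint.h>
#include <stdio.h>

/* Stand-ins for the real driver types and accessors. */
struct intel_uncore { const char *name; };
struct engine { struct intel_uncore *uncore; uint32_t mmio_base; };

typedef struct { uint32_t reg; } i915_reg_t;
#define _MMIO(r) ((i915_reg_t){ (r) })

static uint32_t intel_uncore_read(struct intel_uncore *uncore, i915_reg_t reg)
{
        printf("read %s @ 0x%05x\n", uncore->name, (unsigned int)reg.reg);
        return 0;
}

/* Register define taking a base, exactly the shape ENGINE_READ() expects. */
#define RING_TAIL(base) _MMIO((base) + 0x30)

/* The same two-level expansion as in intel_engine.h. */
#define __ENGINE_REG_OP(op__, engine__, ...) \
        intel_uncore_##op__((engine__)->uncore, __VA_ARGS__)
#define __ENGINE_READ_OP(op__, engine__, reg__) \
        __ENGINE_REG_OP(op__, (engine__), reg__((engine__)->mmio_base))
#define ENGINE_READ(...) __ENGINE_READ_OP(read, __VA_ARGS__)

int main(void)
{
        struct intel_uncore uncore = { "rcs0" };
        struct engine engine = { &uncore, 0x02000 };

        /* Expands to intel_uncore_read(engine.uncore, RING_TAIL(0x02000)). */
        ENGINE_READ(&engine, RING_TAIL);
        return 0;
}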
*/ + return READ_ONCE(engine->status_page.addr[reg]); +} + +static inline void +intel_write_status_page(struct intel_engine_cs *engine, int reg, u32 value) +{ + /* Writing into the status page should be done sparingly. Since + * we do when we are uncertain of the device state, we take a bit + * of extra paranoia to try and ensure that the HWS takes the value + * we give and that it doesn't end up trapped inside the CPU! + */ + if (static_cpu_has(X86_FEATURE_CLFLUSH)) { + mb(); + clflush(&engine->status_page.addr[reg]); + engine->status_page.addr[reg] = value; + clflush(&engine->status_page.addr[reg]); + mb(); + } else { + WRITE_ONCE(engine->status_page.addr[reg], value); + } +} + +/* + * Reads a dword out of the status page, which is written to from the command + * queue by automatic updates, MI_REPORT_HEAD, MI_STORE_DATA_INDEX, or + * MI_STORE_DATA_IMM. + * + * The following dwords have a reserved meaning: + * 0x00: ISR copy, updated when an ISR bit not set in the HWSTAM changes. + * 0x04: ring 0 head pointer + * 0x05: ring 1 head pointer (915-class) + * 0x06: ring 2 head pointer (915-class) + * 0x10-0x1b: Context status DWords (GM45) + * 0x1f: Last written status offset. (GM45) + * 0x20-0x2f: Reserved (Gen6+) + * + * The area from dword 0x30 to 0x3ff is available for driver usage. + */ +#define I915_GEM_HWS_PREEMPT 0x32 +#define I915_GEM_HWS_PREEMPT_ADDR (I915_GEM_HWS_PREEMPT * sizeof(u32)) +#define I915_GEM_HWS_SEQNO 0x40 +#define I915_GEM_HWS_SEQNO_ADDR (I915_GEM_HWS_SEQNO * sizeof(u32)) +#define I915_GEM_HWS_SCRATCH 0x80 +#define I915_GEM_HWS_SCRATCH_ADDR (I915_GEM_HWS_SCRATCH * sizeof(u32)) + +#define I915_HWS_CSB_BUF0_INDEX 0x10 +#define I915_HWS_CSB_WRITE_INDEX 0x1f +#define CNL_HWS_CSB_WRITE_INDEX 0x2f + +struct intel_ring * +intel_engine_create_ring(struct intel_engine_cs *engine, + struct i915_timeline *timeline, + int size); +int intel_ring_pin(struct intel_ring *ring); +void intel_ring_reset(struct intel_ring *ring, u32 tail); +unsigned int intel_ring_update_space(struct intel_ring *ring); +void intel_ring_unpin(struct intel_ring *ring); +void intel_ring_free(struct kref *ref); + +static inline struct intel_ring *intel_ring_get(struct intel_ring *ring) +{ + kref_get(&ring->ref); + return ring; +} + +static inline void intel_ring_put(struct intel_ring *ring) +{ + kref_put(&ring->ref, intel_ring_free); +} + +void intel_engine_stop(struct intel_engine_cs *engine); +void intel_engine_cleanup(struct intel_engine_cs *engine); + +int __must_check intel_ring_cacheline_align(struct i915_request *rq); + +u32 __must_check *intel_ring_begin(struct i915_request *rq, unsigned int n); + +static inline void intel_ring_advance(struct i915_request *rq, u32 *cs) +{ + /* Dummy function. + * + * This serves as a placeholder in the code so that the reader + * can compare against the preceding intel_ring_begin() and + * check that the number of dwords emitted matches the space + * reserved for the command packet (i.e. the value passed to + * intel_ring_begin()). 
+ */ + GEM_BUG_ON((rq->ring->vaddr + rq->ring->emit) != cs); +} + +static inline u32 intel_ring_wrap(const struct intel_ring *ring, u32 pos) +{ + return pos & (ring->size - 1); +} + +static inline bool +intel_ring_offset_valid(const struct intel_ring *ring, + unsigned int pos) +{ + if (pos & -ring->size) /* must be strictly within the ring */ + return false; + + if (!IS_ALIGNED(pos, 8)) /* must be qword aligned */ + return false; + + return true; +} + +static inline u32 intel_ring_offset(const struct i915_request *rq, void *addr) +{ + /* Don't write ring->size (equivalent to 0) as that hangs some GPUs. */ + u32 offset = addr - rq->ring->vaddr; + GEM_BUG_ON(offset > rq->ring->size); + return intel_ring_wrap(rq->ring, offset); +} + +static inline void +assert_ring_tail_valid(const struct intel_ring *ring, unsigned int tail) +{ + GEM_BUG_ON(!intel_ring_offset_valid(ring, tail)); + + /* + * "Ring Buffer Use" + * Gen2 BSpec "1. Programming Environment" / 1.4.4.6 + * Gen3 BSpec "1c Memory Interface Functions" / 2.3.4.5 + * Gen4+ BSpec "1c Memory Interface and Command Stream" / 5.3.4.5 + * "If the Ring Buffer Head Pointer and the Tail Pointer are on the + * same cacheline, the Head Pointer must not be greater than the Tail + * Pointer." + * + * We use ring->head as the last known location of the actual RING_HEAD, + * it may have advanced but in the worst case it is equally the same + * as ring->head and so we should never program RING_TAIL to advance + * into the same cacheline as ring->head. + */ +#define cacheline(a) round_down(a, CACHELINE_BYTES) + GEM_BUG_ON(cacheline(tail) == cacheline(ring->head) && + tail < ring->head); +#undef cacheline +} + +static inline unsigned int +intel_ring_set_tail(struct intel_ring *ring, unsigned int tail) +{ + /* Whilst writes to the tail are strictly order, there is no + * serialisation between readers and the writers. The tail may be + * read by i915_request_retire() just as it is being updated + * by execlists, as although the breadcrumb is complete, the context + * switch hasn't been seen. + */ + assert_ring_tail_valid(ring, tail); + ring->tail = tail; + return tail; +} + +static inline unsigned int +__intel_ring_space(unsigned int head, unsigned int tail, unsigned int size) +{ + /* + * "If the Ring Buffer Head Pointer and the Tail Pointer are on the + * same cacheline, the Head Pointer must not be greater than the Tail + * Pointer." 
+ */ + GEM_BUG_ON(!is_power_of_2(size)); + return (head - tail - CACHELINE_BYTES) & (size - 1); +} + +int intel_engines_init_mmio(struct drm_i915_private *i915); +int intel_engines_setup(struct drm_i915_private *i915); +int intel_engines_init(struct drm_i915_private *i915); +void intel_engines_cleanup(struct drm_i915_private *i915); + +int intel_engine_init_common(struct intel_engine_cs *engine); +void intel_engine_cleanup_common(struct intel_engine_cs *engine); + +int intel_ring_submission_setup(struct intel_engine_cs *engine); +int intel_ring_submission_init(struct intel_engine_cs *engine); + +int intel_engine_stop_cs(struct intel_engine_cs *engine); +void intel_engine_cancel_stop_cs(struct intel_engine_cs *engine); + +void intel_engine_set_hwsp_writemask(struct intel_engine_cs *engine, u32 mask); + +u64 intel_engine_get_active_head(const struct intel_engine_cs *engine); +u64 intel_engine_get_last_batch_head(const struct intel_engine_cs *engine); + +void intel_engine_get_instdone(struct intel_engine_cs *engine, + struct intel_instdone *instdone); + +void intel_engine_init_execlists(struct intel_engine_cs *engine); + +void intel_engine_init_breadcrumbs(struct intel_engine_cs *engine); +void intel_engine_fini_breadcrumbs(struct intel_engine_cs *engine); + +void intel_engine_pin_breadcrumbs_irq(struct intel_engine_cs *engine); +void intel_engine_unpin_breadcrumbs_irq(struct intel_engine_cs *engine); + +void intel_engine_signal_breadcrumbs(struct intel_engine_cs *engine); +void intel_engine_disarm_breadcrumbs(struct intel_engine_cs *engine); + +static inline void +intel_engine_queue_breadcrumbs(struct intel_engine_cs *engine) +{ + irq_work_queue(&engine->breadcrumbs.irq_work); +} + +void intel_engine_breadcrumbs_irq(struct intel_engine_cs *engine); + +void intel_engine_reset_breadcrumbs(struct intel_engine_cs *engine); +void intel_engine_fini_breadcrumbs(struct intel_engine_cs *engine); + +void intel_engine_print_breadcrumbs(struct intel_engine_cs *engine, + struct drm_printer *p); + +static inline u32 *gen8_emit_pipe_control(u32 *batch, u32 flags, u32 offset) +{ + memset(batch, 0, 6 * sizeof(u32)); + + batch[0] = GFX_OP_PIPE_CONTROL(6); + batch[1] = flags; + batch[2] = offset; + + return batch + 6; +} + +static inline u32 * +gen8_emit_ggtt_write_rcs(u32 *cs, u32 value, u32 gtt_offset, u32 flags) +{ + /* We're using qword write, offset should be aligned to 8 bytes. */ + GEM_BUG_ON(!IS_ALIGNED(gtt_offset, 8)); + + /* w/a for post sync ops following a GPGPU operation we + * need a prior CS_STALL, which is emitted by the flush + * following the batch. + */ + *cs++ = GFX_OP_PIPE_CONTROL(6); + *cs++ = flags | PIPE_CONTROL_QW_WRITE | PIPE_CONTROL_GLOBAL_GTT_IVB; + *cs++ = gtt_offset; + *cs++ = 0; + *cs++ = value; + /* We're thrashing one dword of HWS. */ + *cs++ = 0; + + return cs; +} + +static inline u32 * +gen8_emit_ggtt_write(u32 *cs, u32 value, u32 gtt_offset, u32 flags) +{ + /* w/a: bit 5 needs to be zero for MI_FLUSH_DW address. 
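[Editor's note] __intel_ring_space() above computes the free bytes in the command ring with plain unsigned arithmetic and a power-of-two mask, while always keeping one cacheline in reserve so RING_TAIL can never be programmed into the same cacheline as the last known RING_HEAD (the BSpec restriction quoted in the comment). A standalone sketch of the same arithmetic:

#include <assert.h>

#define CACHELINE_BYTES 64

/* Free space in a power-of-two ring, mirroring __intel_ring_space(). */
static unsigned int ring_space(unsigned int head, unsigned int tail,
                               unsigned int size)
{
        return (head - tail - CACHELINE_BYTES) & (size - 1);
}

int main(void)
{
        const unsigned int size = 4096;

        /* Empty ring: everything but the reserved cacheline is free. */
        assert(ring_space(0, 0, size) == size - CACHELINE_BYTES);

        /* Tail ahead of head: free space is what remains behind head. */
        assert(ring_space(256, 1024, size) ==
               size - (1024 - 256) - CACHELINE_BYTES);

        /* Head ahead of tail (already wrapped): free space is the gap. */
        assert(ring_space(1024, 256, size) == 1024 - 256 - CACHELINE_BYTES);
        return 0;
}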
*/ + GEM_BUG_ON(gtt_offset & (1 << 5)); + /* Offset should be aligned to 8 bytes for both (QW/DW) write types */ + GEM_BUG_ON(!IS_ALIGNED(gtt_offset, 8)); + + *cs++ = (MI_FLUSH_DW + 1) | MI_FLUSH_DW_OP_STOREDW | flags; + *cs++ = gtt_offset | MI_FLUSH_DW_USE_GTT; + *cs++ = 0; + *cs++ = value; + + return cs; +} + +static inline void intel_engine_reset(struct intel_engine_cs *engine, + bool stalled) +{ + if (engine->reset.reset) + engine->reset.reset(engine, stalled); + engine->serial++; /* contexts lost */ +} + +bool intel_engine_is_idle(struct intel_engine_cs *engine); +bool intel_engines_are_idle(struct drm_i915_private *dev_priv); + +void intel_engines_reset_default_submission(struct drm_i915_private *i915); +unsigned int intel_engines_has_context_isolation(struct drm_i915_private *i915); + +bool intel_engine_can_store_dword(struct intel_engine_cs *engine); + +__printf(3, 4) +void intel_engine_dump(struct intel_engine_cs *engine, + struct drm_printer *m, + const char *header, ...); + +struct intel_engine_cs * +intel_engine_lookup_user(struct drm_i915_private *i915, u8 class, u8 instance); + +static inline void intel_engine_context_in(struct intel_engine_cs *engine) +{ + unsigned long flags; + + if (READ_ONCE(engine->stats.enabled) == 0) + return; + + write_seqlock_irqsave(&engine->stats.lock, flags); + + if (engine->stats.enabled > 0) { + if (engine->stats.active++ == 0) + engine->stats.start = ktime_get(); + GEM_BUG_ON(engine->stats.active == 0); + } + + write_sequnlock_irqrestore(&engine->stats.lock, flags); +} + +static inline void intel_engine_context_out(struct intel_engine_cs *engine) +{ + unsigned long flags; + + if (READ_ONCE(engine->stats.enabled) == 0) + return; + + write_seqlock_irqsave(&engine->stats.lock, flags); + + if (engine->stats.enabled > 0) { + ktime_t last; + + if (engine->stats.active && --engine->stats.active == 0) { + /* + * Decrement the active context count and in case GPU + * is now idle add up to the running total. + */ + last = ktime_sub(ktime_get(), engine->stats.start); + + engine->stats.total = ktime_add(engine->stats.total, + last); + } else if (engine->stats.active == 0) { + /* + * After turning on engine stats, context out might be + * the first event in which case we account from the + * time stats gathering was turned on. 
+ */ + last = ktime_sub(ktime_get(), engine->stats.enabled_at); + + engine->stats.total = ktime_add(engine->stats.total, + last); + } + } + + write_sequnlock_irqrestore(&engine->stats.lock, flags); +} + +int intel_enable_engine_stats(struct intel_engine_cs *engine); +void intel_disable_engine_stats(struct intel_engine_cs *engine); + +ktime_t intel_engine_get_busy_time(struct intel_engine_cs *engine); + +struct i915_request * +intel_engine_find_active_request(struct intel_engine_cs *engine); + +u32 intel_engine_context_size(struct drm_i915_private *i915, u8 class); + +#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) + +static inline bool inject_preempt_hang(struct intel_engine_execlists *execlists) +{ + if (!execlists->preempt_hang.inject_hang) + return false; + + complete(&execlists->preempt_hang.completion); + return true; +} + +#else + +static inline bool inject_preempt_hang(struct intel_engine_execlists *execlists) +{ + return false; +} + +#endif + +void intel_engine_init_active(struct intel_engine_cs *engine, + unsigned int subclass); +#define ENGINE_PHYSICAL 0 +#define ENGINE_MOCK 1 +#define ENGINE_VIRTUAL 2 + +#endif /* _INTEL_RINGBUFFER_H_ */ diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c new file mode 100644 index 000000000000..7fd33e81c2d9 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c @@ -0,0 +1,1710 @@ +/* + * Copyright © 2016 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + */ + +#include <drm/drm_print.h> + +#include "gem/i915_gem_context.h" + +#include "i915_drv.h" + +#include "intel_engine.h" +#include "intel_engine_pm.h" +#include "intel_context.h" +#include "intel_lrc.h" +#include "intel_reset.h" + +/* Haswell does have the CXT_SIZE register however it does not appear to be + * valid. Now, docs explain in dwords what is in the context object. The full + * size is 70720 bytes, however, the power context and execlist context will + * never be saved (power context is stored elsewhere, and execlists don't work + * on HSW) - so the final size, including the extra state required for the + * Resource Streamer, is 66944 bytes, which rounds to 17 pages. 
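[Editor's note] intel_engine_context_in() and intel_engine_context_out() above keep a nested active count and only fold the elapsed time into stats.total on the busy -> idle transition, so overlapping contexts on the same engine are not double counted. A minimal sketch of that bookkeeping with timestamps as plain integers; the seqlock, enabled_at handling and IRQ-save locking of the real code are omitted:

#include <assert.h>
#include <stdint.h>

struct stats {
        unsigned int active;
        uint64_t start;
        uint64_t total;
};

static void context_in(struct stats *s, uint64_t now)
{
        if (s->active++ == 0)
                s->start = now;         /* engine goes busy */
}

static void context_out(struct stats *s, uint64_t now)
{
        if (--s->active == 0)
                s->total += now - s->start;     /* engine goes idle */
}

int main(void)
{
        struct stats s = { 0 };

        context_in(&s, 100);    /* engine goes busy at t=100 */
        context_in(&s, 150);    /* second context merely nests */
        context_out(&s, 300);
        context_out(&s, 400);   /* idle again at t=400 */

        assert(s.total == 300); /* 400 - 100, overlap counted once */
        return 0;
}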
+ */ +#define HSW_CXT_TOTAL_SIZE (17 * PAGE_SIZE) + +#define DEFAULT_LR_CONTEXT_RENDER_SIZE (22 * PAGE_SIZE) +#define GEN8_LR_CONTEXT_RENDER_SIZE (20 * PAGE_SIZE) +#define GEN9_LR_CONTEXT_RENDER_SIZE (22 * PAGE_SIZE) +#define GEN10_LR_CONTEXT_RENDER_SIZE (18 * PAGE_SIZE) +#define GEN11_LR_CONTEXT_RENDER_SIZE (14 * PAGE_SIZE) + +#define GEN8_LR_CONTEXT_OTHER_SIZE ( 2 * PAGE_SIZE) + +struct engine_class_info { + const char *name; + u8 uabi_class; +}; + +static const struct engine_class_info intel_engine_classes[] = { + [RENDER_CLASS] = { + .name = "rcs", + .uabi_class = I915_ENGINE_CLASS_RENDER, + }, + [COPY_ENGINE_CLASS] = { + .name = "bcs", + .uabi_class = I915_ENGINE_CLASS_COPY, + }, + [VIDEO_DECODE_CLASS] = { + .name = "vcs", + .uabi_class = I915_ENGINE_CLASS_VIDEO, + }, + [VIDEO_ENHANCEMENT_CLASS] = { + .name = "vecs", + .uabi_class = I915_ENGINE_CLASS_VIDEO_ENHANCE, + }, +}; + +#define MAX_MMIO_BASES 3 +struct engine_info { + unsigned int hw_id; + u8 class; + u8 instance; + /* mmio bases table *must* be sorted in reverse gen order */ + struct engine_mmio_base { + u32 gen : 8; + u32 base : 24; + } mmio_bases[MAX_MMIO_BASES]; +}; + +static const struct engine_info intel_engines[] = { + [RCS0] = { + .hw_id = RCS0_HW, + .class = RENDER_CLASS, + .instance = 0, + .mmio_bases = { + { .gen = 1, .base = RENDER_RING_BASE } + }, + }, + [BCS0] = { + .hw_id = BCS0_HW, + .class = COPY_ENGINE_CLASS, + .instance = 0, + .mmio_bases = { + { .gen = 6, .base = BLT_RING_BASE } + }, + }, + [VCS0] = { + .hw_id = VCS0_HW, + .class = VIDEO_DECODE_CLASS, + .instance = 0, + .mmio_bases = { + { .gen = 11, .base = GEN11_BSD_RING_BASE }, + { .gen = 6, .base = GEN6_BSD_RING_BASE }, + { .gen = 4, .base = BSD_RING_BASE } + }, + }, + [VCS1] = { + .hw_id = VCS1_HW, + .class = VIDEO_DECODE_CLASS, + .instance = 1, + .mmio_bases = { + { .gen = 11, .base = GEN11_BSD2_RING_BASE }, + { .gen = 8, .base = GEN8_BSD2_RING_BASE } + }, + }, + [VCS2] = { + .hw_id = VCS2_HW, + .class = VIDEO_DECODE_CLASS, + .instance = 2, + .mmio_bases = { + { .gen = 11, .base = GEN11_BSD3_RING_BASE } + }, + }, + [VCS3] = { + .hw_id = VCS3_HW, + .class = VIDEO_DECODE_CLASS, + .instance = 3, + .mmio_bases = { + { .gen = 11, .base = GEN11_BSD4_RING_BASE } + }, + }, + [VECS0] = { + .hw_id = VECS0_HW, + .class = VIDEO_ENHANCEMENT_CLASS, + .instance = 0, + .mmio_bases = { + { .gen = 11, .base = GEN11_VEBOX_RING_BASE }, + { .gen = 7, .base = VEBOX_RING_BASE } + }, + }, + [VECS1] = { + .hw_id = VECS1_HW, + .class = VIDEO_ENHANCEMENT_CLASS, + .instance = 1, + .mmio_bases = { + { .gen = 11, .base = GEN11_VEBOX2_RING_BASE } + }, + }, +}; + +/** + * intel_engine_context_size() - return the size of the context for an engine + * @dev_priv: i915 device private + * @class: engine class + * + * Each engine class may require a different amount of space for a context + * image. + * + * Return: size (in bytes) of an engine class specific context image + * + * Note: this size includes the HWSP, which is part of the context image + * in LRC mode, but does not include the "shared data page" used with + * GuC submission. The caller should account for this if using the GuC. 
+ */ +u32 intel_engine_context_size(struct drm_i915_private *dev_priv, u8 class) +{ + u32 cxt_size; + + BUILD_BUG_ON(I915_GTT_PAGE_SIZE != PAGE_SIZE); + + switch (class) { + case RENDER_CLASS: + switch (INTEL_GEN(dev_priv)) { + default: + MISSING_CASE(INTEL_GEN(dev_priv)); + return DEFAULT_LR_CONTEXT_RENDER_SIZE; + case 11: + return GEN11_LR_CONTEXT_RENDER_SIZE; + case 10: + return GEN10_LR_CONTEXT_RENDER_SIZE; + case 9: + return GEN9_LR_CONTEXT_RENDER_SIZE; + case 8: + return GEN8_LR_CONTEXT_RENDER_SIZE; + case 7: + if (IS_HASWELL(dev_priv)) + return HSW_CXT_TOTAL_SIZE; + + cxt_size = I915_READ(GEN7_CXT_SIZE); + return round_up(GEN7_CXT_TOTAL_SIZE(cxt_size) * 64, + PAGE_SIZE); + case 6: + cxt_size = I915_READ(CXT_SIZE); + return round_up(GEN6_CXT_TOTAL_SIZE(cxt_size) * 64, + PAGE_SIZE); + case 5: + case 4: + /* + * There is a discrepancy here between the size reported + * by the register and the size of the context layout + * in the docs. Both are described as authorative! + * + * The discrepancy is on the order of a few cachelines, + * but the total is under one page (4k), which is our + * minimum allocation anyway so it should all come + * out in the wash. + */ + cxt_size = I915_READ(CXT_SIZE) + 1; + DRM_DEBUG_DRIVER("gen%d CXT_SIZE = %d bytes [0x%08x]\n", + INTEL_GEN(dev_priv), + cxt_size * 64, + cxt_size - 1); + return round_up(cxt_size * 64, PAGE_SIZE); + case 3: + case 2: + /* For the special day when i810 gets merged. */ + case 1: + return 0; + } + break; + default: + MISSING_CASE(class); + /* fall through */ + case VIDEO_DECODE_CLASS: + case VIDEO_ENHANCEMENT_CLASS: + case COPY_ENGINE_CLASS: + if (INTEL_GEN(dev_priv) < 8) + return 0; + return GEN8_LR_CONTEXT_OTHER_SIZE; + } +} + +static u32 __engine_mmio_base(struct drm_i915_private *i915, + const struct engine_mmio_base *bases) +{ + int i; + + for (i = 0; i < MAX_MMIO_BASES; i++) + if (INTEL_GEN(i915) >= bases[i].gen) + break; + + GEM_BUG_ON(i == MAX_MMIO_BASES); + GEM_BUG_ON(!bases[i].base); + + return bases[i].base; +} + +static void __sprint_engine_name(char *name, const struct engine_info *info) +{ + WARN_ON(snprintf(name, INTEL_ENGINE_CS_MAX_NAME, "%s%u", + intel_engine_classes[info->class].name, + info->instance) >= INTEL_ENGINE_CS_MAX_NAME); +} + +void intel_engine_set_hwsp_writemask(struct intel_engine_cs *engine, u32 mask) +{ + /* + * Though they added more rings on g4x/ilk, they did not add + * per-engine HWSTAM until gen6. 
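[Editor's note] __engine_mmio_base() above walks the engine's mmio_bases[] table, which must be sorted by gen in descending order, and returns the first base the running device is new enough to use. A standalone sketch of that lookup, modelled on the VCS0 entry of intel_engines[]; the numeric addresses are stand-ins for the named *_RING_BASE constants:

#include <assert.h>
#include <stdint.h>

#define MAX_MMIO_BASES 3

/* Entries sorted newest-gen first, so the first match wins. */
struct mmio_base {
        uint32_t gen;
        uint32_t base;
};

static uint32_t pick_mmio_base(unsigned int gen,
                               const struct mmio_base bases[MAX_MMIO_BASES])
{
        int i;

        for (i = 0; i < MAX_MMIO_BASES; i++)
                if (gen >= bases[i].gen)
                        break;

        assert(i < MAX_MMIO_BASES);     /* table must cover this device */
        return bases[i].base;
}

int main(void)
{
        const struct mmio_base vcs0[MAX_MMIO_BASES] = {
                { .gen = 11, .base = 0x1c0000 },  /* stand-in: GEN11_BSD_RING_BASE */
                { .gen = 6,  .base = 0x12000 },   /* stand-in: GEN6_BSD_RING_BASE */
                { .gen = 4,  .base = 0x04000 },   /* stand-in: BSD_RING_BASE */
        };

        assert(pick_mmio_base(11, vcs0) == 0x1c0000);
        assert(pick_mmio_base(9, vcs0) == 0x12000);
        assert(pick_mmio_base(5, vcs0) == 0x04000);
        return 0;
}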
+ */ + if (INTEL_GEN(engine->i915) < 6 && engine->class != RENDER_CLASS) + return; + + if (INTEL_GEN(engine->i915) >= 3) + ENGINE_WRITE(engine, RING_HWSTAM, mask); + else + ENGINE_WRITE16(engine, RING_HWSTAM, mask); +} + +static void intel_engine_sanitize_mmio(struct intel_engine_cs *engine) +{ + /* Mask off all writes into the unknown HWSP */ + intel_engine_set_hwsp_writemask(engine, ~0u); +} + +static int +intel_engine_setup(struct drm_i915_private *dev_priv, + enum intel_engine_id id) +{ + const struct engine_info *info = &intel_engines[id]; + struct intel_engine_cs *engine; + + GEM_BUG_ON(info->class >= ARRAY_SIZE(intel_engine_classes)); + + BUILD_BUG_ON(MAX_ENGINE_CLASS >= BIT(GEN11_ENGINE_CLASS_WIDTH)); + BUILD_BUG_ON(MAX_ENGINE_INSTANCE >= BIT(GEN11_ENGINE_INSTANCE_WIDTH)); + + if (GEM_DEBUG_WARN_ON(info->class > MAX_ENGINE_CLASS)) + return -EINVAL; + + if (GEM_DEBUG_WARN_ON(info->instance > MAX_ENGINE_INSTANCE)) + return -EINVAL; + + if (GEM_DEBUG_WARN_ON(dev_priv->engine_class[info->class][info->instance])) + return -EINVAL; + + GEM_BUG_ON(dev_priv->engine[id]); + engine = kzalloc(sizeof(*engine), GFP_KERNEL); + if (!engine) + return -ENOMEM; + + BUILD_BUG_ON(BITS_PER_TYPE(engine->mask) < I915_NUM_ENGINES); + + engine->id = id; + engine->mask = BIT(id); + engine->i915 = dev_priv; + engine->uncore = &dev_priv->uncore; + __sprint_engine_name(engine->name, info); + engine->hw_id = engine->guc_id = info->hw_id; + engine->mmio_base = __engine_mmio_base(dev_priv, info->mmio_bases); + engine->class = info->class; + engine->instance = info->instance; + + /* + * To be overridden by the backend on setup. However to facilitate + * cleanup on error during setup, we always provide the destroy vfunc. + */ + engine->destroy = (typeof(engine->destroy))kfree; + + engine->uabi_class = intel_engine_classes[info->class].uabi_class; + + engine->context_size = intel_engine_context_size(dev_priv, + engine->class); + if (WARN_ON(engine->context_size > BIT(20))) + engine->context_size = 0; + if (engine->context_size) + DRIVER_CAPS(dev_priv)->has_logical_contexts = true; + + /* Nothing to do here, execute in order of dependencies */ + engine->schedule = NULL; + + seqlock_init(&engine->stats.lock); + + ATOMIC_INIT_NOTIFIER_HEAD(&engine->context_status_notifier); + + /* Scrub mmio state on takeover */ + intel_engine_sanitize_mmio(engine); + + dev_priv->engine_class[info->class][info->instance] = engine; + dev_priv->engine[id] = engine; + return 0; +} + +static void __setup_engine_capabilities(struct intel_engine_cs *engine) +{ + struct drm_i915_private *i915 = engine->i915; + + if (engine->class == VIDEO_DECODE_CLASS) { + /* + * HEVC support is present on first engine instance + * before Gen11 and on all instances afterwards. + */ + if (INTEL_GEN(i915) >= 11 || + (INTEL_GEN(i915) >= 9 && engine->instance == 0)) + engine->uabi_capabilities |= + I915_VIDEO_CLASS_CAPABILITY_HEVC; + + /* + * SFC block is present only on even logical engine + * instances. 
+ */ + if ((INTEL_GEN(i915) >= 11 && + RUNTIME_INFO(i915)->vdbox_sfc_access & engine->mask) || + (INTEL_GEN(i915) >= 9 && engine->instance == 0)) + engine->uabi_capabilities |= + I915_VIDEO_AND_ENHANCE_CLASS_CAPABILITY_SFC; + } else if (engine->class == VIDEO_ENHANCEMENT_CLASS) { + if (INTEL_GEN(i915) >= 9) + engine->uabi_capabilities |= + I915_VIDEO_AND_ENHANCE_CLASS_CAPABILITY_SFC; + } +} + +static void intel_setup_engine_capabilities(struct drm_i915_private *i915) +{ + struct intel_engine_cs *engine; + enum intel_engine_id id; + + for_each_engine(engine, i915, id) + __setup_engine_capabilities(engine); +} + +/** + * intel_engines_cleanup() - free the resources allocated for Command Streamers + * @i915: the i915 devic + */ +void intel_engines_cleanup(struct drm_i915_private *i915) +{ + struct intel_engine_cs *engine; + enum intel_engine_id id; + + for_each_engine(engine, i915, id) { + engine->destroy(engine); + i915->engine[id] = NULL; + } +} + +/** + * intel_engines_init_mmio() - allocate and prepare the Engine Command Streamers + * @i915: the i915 device + * + * Return: non-zero if the initialization failed. + */ +int intel_engines_init_mmio(struct drm_i915_private *i915) +{ + struct intel_device_info *device_info = mkwrite_device_info(i915); + const unsigned int engine_mask = INTEL_INFO(i915)->engine_mask; + unsigned int mask = 0; + unsigned int i; + int err; + + WARN_ON(engine_mask == 0); + WARN_ON(engine_mask & + GENMASK(BITS_PER_TYPE(mask) - 1, I915_NUM_ENGINES)); + + if (i915_inject_load_failure()) + return -ENODEV; + + for (i = 0; i < ARRAY_SIZE(intel_engines); i++) { + if (!HAS_ENGINE(i915, i)) + continue; + + err = intel_engine_setup(i915, i); + if (err) + goto cleanup; + + mask |= BIT(i); + } + + /* + * Catch failures to update intel_engines table when the new engines + * are added to the driver by a warning and disabling the forgotten + * engines. + */ + if (WARN_ON(mask != engine_mask)) + device_info->engine_mask = mask; + + /* We always presume we have at least RCS available for later probing */ + if (WARN_ON(!HAS_ENGINE(i915, RCS0))) { + err = -ENODEV; + goto cleanup; + } + + RUNTIME_INFO(i915)->num_engines = hweight32(mask); + + i915_check_and_clear_faults(i915); + + intel_setup_engine_capabilities(i915); + + return 0; + +cleanup: + intel_engines_cleanup(i915); + return err; +} + +/** + * intel_engines_init() - init the Engine Command Streamers + * @i915: i915 device private + * + * Return: non-zero if the initialization failed. 
+ */ +int intel_engines_init(struct drm_i915_private *i915) +{ + int (*init)(struct intel_engine_cs *engine); + struct intel_engine_cs *engine; + enum intel_engine_id id; + int err; + + if (HAS_EXECLISTS(i915)) + init = intel_execlists_submission_init; + else + init = intel_ring_submission_init; + + for_each_engine(engine, i915, id) { + err = init(engine); + if (err) + goto cleanup; + } + + return 0; + +cleanup: + intel_engines_cleanup(i915); + return err; +} + +static void intel_engine_init_batch_pool(struct intel_engine_cs *engine) +{ + i915_gem_batch_pool_init(&engine->batch_pool, engine); +} + +void intel_engine_init_execlists(struct intel_engine_cs *engine) +{ + struct intel_engine_execlists * const execlists = &engine->execlists; + + execlists->port_mask = 1; + GEM_BUG_ON(!is_power_of_2(execlists_num_ports(execlists))); + GEM_BUG_ON(execlists_num_ports(execlists) > EXECLIST_MAX_PORTS); + + execlists->queue_priority_hint = INT_MIN; + execlists->queue = RB_ROOT_CACHED; +} + +static void cleanup_status_page(struct intel_engine_cs *engine) +{ + struct i915_vma *vma; + + /* Prevent writes into HWSP after returning the page to the system */ + intel_engine_set_hwsp_writemask(engine, ~0u); + + vma = fetch_and_zero(&engine->status_page.vma); + if (!vma) + return; + + if (!HWS_NEEDS_PHYSICAL(engine->i915)) + i915_vma_unpin(vma); + + i915_gem_object_unpin_map(vma->obj); + i915_gem_object_put(vma->obj); +} + +static int pin_ggtt_status_page(struct intel_engine_cs *engine, + struct i915_vma *vma) +{ + unsigned int flags; + + flags = PIN_GLOBAL; + if (!HAS_LLC(engine->i915)) + /* + * On g33, we cannot place HWS above 256MiB, so + * restrict its pinning to the low mappable arena. + * Though this restriction is not documented for + * gen4, gen5, or byt, they also behave similarly + * and hang if the HWS is placed at the top of the + * GTT. To generalise, it appears that all !llc + * platforms have issues with us placing the HWS + * above the mappable region (even though we never + * actually map it). + */ + flags |= PIN_MAPPABLE; + else + flags |= PIN_HIGH; + + return i915_vma_pin(vma, 0, 0, flags); +} + +static int init_status_page(struct intel_engine_cs *engine) +{ + struct drm_i915_gem_object *obj; + struct i915_vma *vma; + void *vaddr; + int ret; + + /* + * Though the HWS register does support 36bit addresses, historically + * we have had hangs and corruption reported due to wild writes if + * the HWS is placed above 4G. We only allow objects to be allocated + * in GFP_DMA32 for i965, and no earlier physical address users had + * access to more than 4G. 
+ */ + obj = i915_gem_object_create_internal(engine->i915, PAGE_SIZE); + if (IS_ERR(obj)) { + DRM_ERROR("Failed to allocate status page\n"); + return PTR_ERR(obj); + } + + i915_gem_object_set_cache_coherency(obj, I915_CACHE_LLC); + + vma = i915_vma_instance(obj, &engine->i915->ggtt.vm, NULL); + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); + goto err; + } + + vaddr = i915_gem_object_pin_map(obj, I915_MAP_WB); + if (IS_ERR(vaddr)) { + ret = PTR_ERR(vaddr); + goto err; + } + + engine->status_page.addr = memset(vaddr, 0, PAGE_SIZE); + engine->status_page.vma = vma; + + if (!HWS_NEEDS_PHYSICAL(engine->i915)) { + ret = pin_ggtt_status_page(engine, vma); + if (ret) + goto err_unpin; + } + + return 0; + +err_unpin: + i915_gem_object_unpin_map(obj); +err: + i915_gem_object_put(obj); + return ret; +} + +static int intel_engine_setup_common(struct intel_engine_cs *engine) +{ + int err; + + init_llist_head(&engine->barrier_tasks); + + err = init_status_page(engine); + if (err) + return err; + + intel_engine_init_active(engine, ENGINE_PHYSICAL); + intel_engine_init_breadcrumbs(engine); + intel_engine_init_execlists(engine); + intel_engine_init_hangcheck(engine); + intel_engine_init_batch_pool(engine); + intel_engine_init_cmd_parser(engine); + intel_engine_init__pm(engine); + + /* Use the whole device by default */ + engine->sseu = + intel_sseu_from_device_info(&RUNTIME_INFO(engine->i915)->sseu); + + return 0; +} + +/** + * intel_engines_setup() - setup engine state not requiring hw access + * @i915: Device to setup. + * + * Initializes engine structure members shared between legacy and execlists + * submission modes which do not require hardware access. + * + * Typically done early in the submission mode specific engine setup stage. + */ +int intel_engines_setup(struct drm_i915_private *i915) +{ + int (*setup)(struct intel_engine_cs *engine); + struct intel_engine_cs *engine; + enum intel_engine_id id; + int err; + + if (HAS_EXECLISTS(i915)) + setup = intel_execlists_submission_setup; + else + setup = intel_ring_submission_setup; + + for_each_engine(engine, i915, id) { + err = intel_engine_setup_common(engine); + if (err) + goto cleanup; + + err = setup(engine); + if (err) + goto cleanup; + + /* We expect the backend to take control over its state */ + GEM_BUG_ON(engine->destroy == (typeof(engine->destroy))kfree); + + GEM_BUG_ON(!engine->cops); + } + + return 0; + +cleanup: + intel_engines_cleanup(i915); + return err; +} + +void intel_engines_set_scheduler_caps(struct drm_i915_private *i915) +{ + static const struct { + u8 engine; + u8 sched; + } map[] = { +#define MAP(x, y) { ilog2(I915_ENGINE_HAS_##x), ilog2(I915_SCHEDULER_CAP_##y) } + MAP(PREEMPTION, PREEMPTION), + MAP(SEMAPHORES, SEMAPHORES), +#undef MAP + }; + struct intel_engine_cs *engine; + enum intel_engine_id id; + u32 enabled, disabled; + + enabled = 0; + disabled = 0; + for_each_engine(engine, i915, id) { /* all engines must agree!
*/ + int i; + + if (engine->schedule) + enabled |= (I915_SCHEDULER_CAP_ENABLED | + I915_SCHEDULER_CAP_PRIORITY); + else + disabled |= (I915_SCHEDULER_CAP_ENABLED | + I915_SCHEDULER_CAP_PRIORITY); + + for (i = 0; i < ARRAY_SIZE(map); i++) { + if (engine->flags & BIT(map[i].engine)) + enabled |= BIT(map[i].sched); + else + disabled |= BIT(map[i].sched); + } + } + + i915->caps.scheduler = enabled & ~disabled; + if (!(i915->caps.scheduler & I915_SCHEDULER_CAP_ENABLED)) + i915->caps.scheduler = 0; +} + +struct measure_breadcrumb { + struct i915_request rq; + struct i915_timeline timeline; + struct intel_ring ring; + u32 cs[1024]; +}; + +static int measure_breadcrumb_dw(struct intel_engine_cs *engine) +{ + struct measure_breadcrumb *frame; + int dw = -ENOMEM; + + GEM_BUG_ON(!engine->i915->gt.scratch); + + frame = kzalloc(sizeof(*frame), GFP_KERNEL); + if (!frame) + return -ENOMEM; + + if (i915_timeline_init(engine->i915, + &frame->timeline, + engine->status_page.vma)) + goto out_frame; + + INIT_LIST_HEAD(&frame->ring.request_list); + frame->ring.timeline = &frame->timeline; + frame->ring.vaddr = frame->cs; + frame->ring.size = sizeof(frame->cs); + frame->ring.effective_size = frame->ring.size; + intel_ring_update_space(&frame->ring); + + frame->rq.i915 = engine->i915; + frame->rq.engine = engine; + frame->rq.ring = &frame->ring; + frame->rq.timeline = &frame->timeline; + + dw = i915_timeline_pin(&frame->timeline); + if (dw < 0) + goto out_timeline; + + dw = engine->emit_fini_breadcrumb(&frame->rq, frame->cs) - frame->cs; + GEM_BUG_ON(dw & 1); /* RING_TAIL must be qword aligned */ + + i915_timeline_unpin(&frame->timeline); + +out_timeline: + i915_timeline_fini(&frame->timeline); +out_frame: + kfree(frame); + return dw; +} + +static int pin_context(struct i915_gem_context *ctx, + struct intel_engine_cs *engine, + struct intel_context **out) +{ + struct intel_context *ce; + int err; + + ce = i915_gem_context_get_engine(ctx, engine->id); + if (IS_ERR(ce)) + return PTR_ERR(ce); + + err = intel_context_pin(ce); + intel_context_put(ce); + if (err) + return err; + + *out = ce; + return 0; +} + +void +intel_engine_init_active(struct intel_engine_cs *engine, unsigned int subclass) +{ + INIT_LIST_HEAD(&engine->active.requests); + + spin_lock_init(&engine->active.lock); + lockdep_set_subclass(&engine->active.lock, subclass); + + /* + * Due to an interesting quirk in lockdep's internal debug tracking, + * after setting a subclass we must ensure the lock is used. Otherwise, + * nr_unused_locks is incremented once too often. + */ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + local_irq_disable(); + lock_map_acquire(&engine->active.lock.dep_map); + lock_map_release(&engine->active.lock.dep_map); + local_irq_enable(); +#endif +} + +/** + * intel_engine_init_common() - initialize engine state which might require hw access + * @engine: Engine to initialize. + * + * Initializes @engine structure members shared between legacy and execlists + * submission modes which do require hardware access. + * + * Typically done at later stages of submission mode specific engine setup. + * + * Returns zero on success or an error code on failure. + */ +int intel_engine_init_common(struct intel_engine_cs *engine) +{ + struct drm_i915_private *i915 = engine->i915; + int ret; + + /* We may need to do things with the shrinker which + * require us to immediately switch back to the default + * context. This can cause a problem as pinning the + * default context also requires GTT space which may not + * be available.
To avoid this we always pin the default + * context. + */ + ret = pin_context(i915->kernel_context, engine, + &engine->kernel_context); + if (ret) + return ret; + + /* + * Similarly the preempt context must always be available so that + * we can interrupt the engine at any time. However, as preemption + * is optional, we allow it to fail. + */ + if (i915->preempt_context) + pin_context(i915->preempt_context, engine, + &engine->preempt_context); + + ret = measure_breadcrumb_dw(engine); + if (ret < 0) + goto err_unpin; + + engine->emit_fini_breadcrumb_dw = ret; + + engine->set_default_submission(engine); + + return 0; + +err_unpin: + if (engine->preempt_context) + intel_context_unpin(engine->preempt_context); + intel_context_unpin(engine->kernel_context); + return ret; +} + +/** + * intel_engine_cleanup_common() - cleans up the engine state created by + * the common initializers. + * @engine: Engine to cleanup. + * + * This cleans up everything created by the common helpers. + */ +void intel_engine_cleanup_common(struct intel_engine_cs *engine) +{ + GEM_BUG_ON(!list_empty(&engine->active.requests)); + + cleanup_status_page(engine); + + intel_engine_fini_breadcrumbs(engine); + intel_engine_cleanup_cmd_parser(engine); + i915_gem_batch_pool_fini(&engine->batch_pool); + + if (engine->default_state) + i915_gem_object_put(engine->default_state); + + if (engine->preempt_context) + intel_context_unpin(engine->preempt_context); + intel_context_unpin(engine->kernel_context); + GEM_BUG_ON(!llist_empty(&engine->barrier_tasks)); + + intel_wa_list_free(&engine->ctx_wa_list); + intel_wa_list_free(&engine->wa_list); + intel_wa_list_free(&engine->whitelist); +} + +u64 intel_engine_get_active_head(const struct intel_engine_cs *engine) +{ + struct drm_i915_private *i915 = engine->i915; + + u64 acthd; + + if (INTEL_GEN(i915) >= 8) + acthd = ENGINE_READ64(engine, RING_ACTHD, RING_ACTHD_UDW); + else if (INTEL_GEN(i915) >= 4) + acthd = ENGINE_READ(engine, RING_ACTHD); + else + acthd = ENGINE_READ(engine, ACTHD); + + return acthd; +} + +u64 intel_engine_get_last_batch_head(const struct intel_engine_cs *engine) +{ + u64 bbaddr; + + if (INTEL_GEN(engine->i915) >= 8) + bbaddr = ENGINE_READ64(engine, RING_BBADDR, RING_BBADDR_UDW); + else + bbaddr = ENGINE_READ(engine, RING_BBADDR); + + return bbaddr; +} + +int intel_engine_stop_cs(struct intel_engine_cs *engine) +{ + struct intel_uncore *uncore = engine->uncore; + const u32 base = engine->mmio_base; + const i915_reg_t mode = RING_MI_MODE(base); + int err; + + if (INTEL_GEN(engine->i915) < 3) + return -ENODEV; + + GEM_TRACE("%s\n", engine->name); + + intel_uncore_write_fw(uncore, mode, _MASKED_BIT_ENABLE(STOP_RING)); + + err = 0; + if (__intel_wait_for_register_fw(uncore, + mode, MODE_IDLE, MODE_IDLE, + 1000, 0, + NULL)) { + GEM_TRACE("%s: timed out on STOP_RING -> IDLE\n", engine->name); + err = -ETIMEDOUT; + } + + /* A final mmio read to let GPU writes be hopefully flushed to memory */ + intel_uncore_posting_read_fw(uncore, mode); + + return err; +} + +void intel_engine_cancel_stop_cs(struct intel_engine_cs *engine) +{ + GEM_TRACE("%s\n", engine->name); + + ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING)); +} + +const char *i915_cache_level_str(struct drm_i915_private *i915, int type) +{ + switch (type) { + case I915_CACHE_NONE: return " uncached"; + case I915_CACHE_LLC: return HAS_LLC(i915) ?
" LLC" : " snooped"; + case I915_CACHE_L3_LLC: return " L3+LLC"; + case I915_CACHE_WT: return " WT"; + default: return ""; + } +} + +u32 intel_calculate_mcr_s_ss_select(struct drm_i915_private *dev_priv) +{ + const struct sseu_dev_info *sseu = &RUNTIME_INFO(dev_priv)->sseu; + u32 mcr_s_ss_select; + u32 slice = fls(sseu->slice_mask); + u32 subslice = fls(sseu->subslice_mask[slice]); + + if (IS_GEN(dev_priv, 10)) + mcr_s_ss_select = GEN8_MCR_SLICE(slice) | + GEN8_MCR_SUBSLICE(subslice); + else if (INTEL_GEN(dev_priv) >= 11) + mcr_s_ss_select = GEN11_MCR_SLICE(slice) | + GEN11_MCR_SUBSLICE(subslice); + else + mcr_s_ss_select = 0; + + return mcr_s_ss_select; +} + +static u32 +read_subslice_reg(struct intel_engine_cs *engine, int slice, int subslice, + i915_reg_t reg) +{ + struct drm_i915_private *i915 = engine->i915; + struct intel_uncore *uncore = engine->uncore; + u32 mcr_slice_subslice_mask; + u32 mcr_slice_subslice_select; + u32 default_mcr_s_ss_select; + u32 mcr; + u32 ret; + enum forcewake_domains fw_domains; + + if (INTEL_GEN(i915) >= 11) { + mcr_slice_subslice_mask = GEN11_MCR_SLICE_MASK | + GEN11_MCR_SUBSLICE_MASK; + mcr_slice_subslice_select = GEN11_MCR_SLICE(slice) | + GEN11_MCR_SUBSLICE(subslice); + } else { + mcr_slice_subslice_mask = GEN8_MCR_SLICE_MASK | + GEN8_MCR_SUBSLICE_MASK; + mcr_slice_subslice_select = GEN8_MCR_SLICE(slice) | + GEN8_MCR_SUBSLICE(subslice); + } + + default_mcr_s_ss_select = intel_calculate_mcr_s_ss_select(i915); + + fw_domains = intel_uncore_forcewake_for_reg(uncore, reg, + FW_REG_READ); + fw_domains |= intel_uncore_forcewake_for_reg(uncore, + GEN8_MCR_SELECTOR, + FW_REG_READ | FW_REG_WRITE); + + spin_lock_irq(&uncore->lock); + intel_uncore_forcewake_get__locked(uncore, fw_domains); + + mcr = intel_uncore_read_fw(uncore, GEN8_MCR_SELECTOR); + + WARN_ON_ONCE((mcr & mcr_slice_subslice_mask) != + default_mcr_s_ss_select); + + mcr &= ~mcr_slice_subslice_mask; + mcr |= mcr_slice_subslice_select; + intel_uncore_write_fw(uncore, GEN8_MCR_SELECTOR, mcr); + + ret = intel_uncore_read_fw(uncore, reg); + + mcr &= ~mcr_slice_subslice_mask; + mcr |= default_mcr_s_ss_select; + + intel_uncore_write_fw(uncore, GEN8_MCR_SELECTOR, mcr); + + intel_uncore_forcewake_put__locked(uncore, fw_domains); + spin_unlock_irq(&uncore->lock); + + return ret; +} + +/* NB: please notice the memset */ +void intel_engine_get_instdone(struct intel_engine_cs *engine, + struct intel_instdone *instdone) +{ + struct drm_i915_private *i915 = engine->i915; + struct intel_uncore *uncore = engine->uncore; + u32 mmio_base = engine->mmio_base; + int slice; + int subslice; + + memset(instdone, 0, sizeof(*instdone)); + + switch (INTEL_GEN(i915)) { + default: + instdone->instdone = + intel_uncore_read(uncore, RING_INSTDONE(mmio_base)); + + if (engine->id != RCS0) + break; + + instdone->slice_common = + intel_uncore_read(uncore, GEN7_SC_INSTDONE); + for_each_instdone_slice_subslice(i915, slice, subslice) { + instdone->sampler[slice][subslice] = + read_subslice_reg(engine, slice, subslice, + GEN7_SAMPLER_INSTDONE); + instdone->row[slice][subslice] = + read_subslice_reg(engine, slice, subslice, + GEN7_ROW_INSTDONE); + } + break; + case 7: + instdone->instdone = + intel_uncore_read(uncore, RING_INSTDONE(mmio_base)); + + if (engine->id != RCS0) + break; + + instdone->slice_common = + intel_uncore_read(uncore, GEN7_SC_INSTDONE); + instdone->sampler[0][0] = + intel_uncore_read(uncore, GEN7_SAMPLER_INSTDONE); + instdone->row[0][0] = + intel_uncore_read(uncore, GEN7_ROW_INSTDONE); + + break; + case 6: + case 5: + 
case 4: + instdone->instdone = + intel_uncore_read(uncore, RING_INSTDONE(mmio_base)); + if (engine->id == RCS0) + /* HACK: Using the wrong struct member */ + instdone->slice_common = + intel_uncore_read(uncore, GEN4_INSTDONE1); + break; + case 3: + case 2: + instdone->instdone = intel_uncore_read(uncore, GEN2_INSTDONE); + break; + } +} + +static bool ring_is_idle(struct intel_engine_cs *engine) +{ + struct drm_i915_private *dev_priv = engine->i915; + intel_wakeref_t wakeref; + bool idle = true; + + if (I915_SELFTEST_ONLY(!engine->mmio_base)) + return true; + + /* If the whole device is asleep, the engine must be idle */ + wakeref = intel_runtime_pm_get_if_in_use(&dev_priv->runtime_pm); + if (!wakeref) + return true; + + /* First check that no commands are left in the ring */ + if ((ENGINE_READ(engine, RING_HEAD) & HEAD_ADDR) != + (ENGINE_READ(engine, RING_TAIL) & TAIL_ADDR)) + idle = false; + + /* No bit for gen2, so assume the CS parser is idle */ + if (INTEL_GEN(dev_priv) > 2 && + !(ENGINE_READ(engine, RING_MI_MODE) & MODE_IDLE)) + idle = false; + + intel_runtime_pm_put(&dev_priv->runtime_pm, wakeref); + + return idle; +} + +/** + * intel_engine_is_idle() - Report if the engine has finished processing all work + * @engine: the intel_engine_cs + * + * Return true if there are no requests pending, nothing left to be submitted + * to hardware, and the engine is idle. + */ +bool intel_engine_is_idle(struct intel_engine_cs *engine) +{ + /* More white lies, if wedged, hw state is inconsistent */ + if (i915_reset_failed(engine->i915)) + return true; + + if (!intel_wakeref_active(&engine->wakeref)) + return true; + + /* Waiting to drain ELSP? */ + if (READ_ONCE(engine->execlists.active)) { + struct tasklet_struct *t = &engine->execlists.tasklet; + + synchronize_hardirq(engine->i915->drm.irq); + + local_bh_disable(); + if (tasklet_trylock(t)) { + /* Must wait for any GPU reset in progress. */ + if (__tasklet_is_enabled(t)) + t->func(t->data); + tasklet_unlock(t); + } + local_bh_enable(); + + /* Otherwise flush the tasklet if it was on another cpu */ + tasklet_unlock_wait(t); + + if (READ_ONCE(engine->execlists.active)) + return false; + } + + /* ELSP is empty, but there are ready requests? E.g. after reset */ + if (!RB_EMPTY_ROOT(&engine->execlists.queue.rb_root)) + return false; + + /* Ring stopped? */ + return ring_is_idle(engine); +} + +bool intel_engines_are_idle(struct drm_i915_private *i915) +{ + struct intel_engine_cs *engine; + enum intel_engine_id id; + + /* + * If the driver is wedged, HW state may be very inconsistent and + * report that it is still busy, even though we have stopped using it.
+ */ + if (i915_reset_failed(i915)) + return true; + + /* Already parked (and passed an idleness test); must still be idle */ + if (!READ_ONCE(i915->gt.awake)) + return true; + + for_each_engine(engine, i915, id) { + if (!intel_engine_is_idle(engine)) + return false; + } + + return true; +} + +void intel_engines_reset_default_submission(struct drm_i915_private *i915) +{ + struct intel_engine_cs *engine; + enum intel_engine_id id; + + for_each_engine(engine, i915, id) + engine->set_default_submission(engine); +} + +bool intel_engine_can_store_dword(struct intel_engine_cs *engine) +{ + switch (INTEL_GEN(engine->i915)) { + case 2: + return false; /* uses physical not virtual addresses */ + case 3: + /* maybe only uses physical not virtual addresses */ + return !(IS_I915G(engine->i915) || IS_I915GM(engine->i915)); + case 6: + return engine->class != VIDEO_DECODE_CLASS; /* b0rked */ + default: + return true; + } +} + +unsigned int intel_engines_has_context_isolation(struct drm_i915_private *i915) +{ + struct intel_engine_cs *engine; + enum intel_engine_id id; + unsigned int which; + + which = 0; + for_each_engine(engine, i915, id) + if (engine->default_state) + which |= BIT(engine->uabi_class); + + return which; +} + +static int print_sched_attr(struct drm_i915_private *i915, + const struct i915_sched_attr *attr, + char *buf, int x, int len) +{ + if (attr->priority == I915_PRIORITY_INVALID) + return x; + + x += snprintf(buf + x, len - x, + " prio=%d", attr->priority); + + return x; +} + +static void print_request(struct drm_printer *m, + struct i915_request *rq, + const char *prefix) +{ + const char *name = rq->fence.ops->get_timeline_name(&rq->fence); + char buf[80] = ""; + int x = 0; + + x = print_sched_attr(rq->i915, &rq->sched.attr, buf, x, sizeof(buf)); + + drm_printf(m, "%s %llx:%llx%s%s %s @ %dms: %s\n", + prefix, + rq->fence.context, rq->fence.seqno, + i915_request_completed(rq) ? "!" : + i915_request_started(rq) ? "*" : + "", + test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, + &rq->fence.flags) ? "+" : + test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, + &rq->fence.flags) ? 
"-" : + "", + buf, + jiffies_to_msecs(jiffies - rq->emitted_jiffies), + name); +} + +static void hexdump(struct drm_printer *m, const void *buf, size_t len) +{ + const size_t rowsize = 8 * sizeof(u32); + const void *prev = NULL; + bool skip = false; + size_t pos; + + for (pos = 0; pos < len; pos += rowsize) { + char line[128]; + + if (prev && !memcmp(prev, buf + pos, rowsize)) { + if (!skip) { + drm_printf(m, "*\n"); + skip = true; + } + continue; + } + + WARN_ON_ONCE(hex_dump_to_buffer(buf + pos, len - pos, + rowsize, sizeof(u32), + line, sizeof(line), + false) >= sizeof(line)); + drm_printf(m, "[%04zx] %s\n", pos, line); + + prev = buf + pos; + skip = false; + } +} + +static void intel_engine_print_registers(struct intel_engine_cs *engine, + struct drm_printer *m) +{ + struct drm_i915_private *dev_priv = engine->i915; + const struct intel_engine_execlists * const execlists = + &engine->execlists; + unsigned long flags; + u64 addr; + + if (engine->id == RCS0 && IS_GEN_RANGE(dev_priv, 4, 7)) + drm_printf(m, "\tCCID: 0x%08x\n", ENGINE_READ(engine, CCID)); + drm_printf(m, "\tRING_START: 0x%08x\n", + ENGINE_READ(engine, RING_START)); + drm_printf(m, "\tRING_HEAD: 0x%08x\n", + ENGINE_READ(engine, RING_HEAD) & HEAD_ADDR); + drm_printf(m, "\tRING_TAIL: 0x%08x\n", + ENGINE_READ(engine, RING_TAIL) & TAIL_ADDR); + drm_printf(m, "\tRING_CTL: 0x%08x%s\n", + ENGINE_READ(engine, RING_CTL), + ENGINE_READ(engine, RING_CTL) & (RING_WAIT | RING_WAIT_SEMAPHORE) ? " [waiting]" : ""); + if (INTEL_GEN(engine->i915) > 2) { + drm_printf(m, "\tRING_MODE: 0x%08x%s\n", + ENGINE_READ(engine, RING_MI_MODE), + ENGINE_READ(engine, RING_MI_MODE) & (MODE_IDLE) ? " [idle]" : ""); + } + + if (INTEL_GEN(dev_priv) >= 6) { + drm_printf(m, "\tRING_IMR: %08x\n", + ENGINE_READ(engine, RING_IMR)); + } + + addr = intel_engine_get_active_head(engine); + drm_printf(m, "\tACTHD: 0x%08x_%08x\n", + upper_32_bits(addr), lower_32_bits(addr)); + addr = intel_engine_get_last_batch_head(engine); + drm_printf(m, "\tBBADDR: 0x%08x_%08x\n", + upper_32_bits(addr), lower_32_bits(addr)); + if (INTEL_GEN(dev_priv) >= 8) + addr = ENGINE_READ64(engine, RING_DMA_FADD, RING_DMA_FADD_UDW); + else if (INTEL_GEN(dev_priv) >= 4) + addr = ENGINE_READ(engine, RING_DMA_FADD); + else + addr = ENGINE_READ(engine, DMA_FADD_I8XX); + drm_printf(m, "\tDMA_FADDR: 0x%08x_%08x\n", + upper_32_bits(addr), lower_32_bits(addr)); + if (INTEL_GEN(dev_priv) >= 4) { + drm_printf(m, "\tIPEIR: 0x%08x\n", + ENGINE_READ(engine, RING_IPEIR)); + drm_printf(m, "\tIPEHR: 0x%08x\n", + ENGINE_READ(engine, RING_IPEHR)); + } else { + drm_printf(m, "\tIPEIR: 0x%08x\n", ENGINE_READ(engine, IPEIR)); + drm_printf(m, "\tIPEHR: 0x%08x\n", ENGINE_READ(engine, IPEHR)); + } + + if (HAS_EXECLISTS(dev_priv)) { + const u32 *hws = + &engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX]; + const u8 num_entries = execlists->csb_size; + unsigned int idx; + u8 read, write; + + drm_printf(m, "\tExeclist status: 0x%08x %08x, entries %u\n", + ENGINE_READ(engine, RING_EXECLIST_STATUS_LO), + ENGINE_READ(engine, RING_EXECLIST_STATUS_HI), + num_entries); + + read = execlists->csb_head; + write = READ_ONCE(*execlists->csb_write); + + drm_printf(m, "\tExeclist CSB read %d, write %d, tasklet queued? 
%s (%s)\n", + read, write, + yesno(test_bit(TASKLET_STATE_SCHED, + &engine->execlists.tasklet.state)), + enableddisabled(!atomic_read(&engine->execlists.tasklet.count))); + if (read >= num_entries) + read = 0; + if (write >= num_entries) + write = 0; + if (read > write) + write += num_entries; + while (read < write) { + idx = ++read % num_entries; + drm_printf(m, "\tExeclist CSB[%d]: 0x%08x, context: %d\n", + idx, hws[idx * 2], hws[idx * 2 + 1]); + } + + spin_lock_irqsave(&engine->active.lock, flags); + for (idx = 0; idx < execlists_num_ports(execlists); idx++) { + struct i915_request *rq; + unsigned int count; + char hdr[80]; + + rq = port_unpack(&execlists->port[idx], &count); + if (!rq) { + drm_printf(m, "\t\tELSP[%d] idle\n", idx); + } else if (!i915_request_signaled(rq)) { + snprintf(hdr, sizeof(hdr), + "\t\tELSP[%d] count=%d, ring:{start:%08x, hwsp:%08x, seqno:%08x}, rq: ", + idx, count, + i915_ggtt_offset(rq->ring->vma), + rq->timeline->hwsp_offset, + hwsp_seqno(rq)); + print_request(m, rq, hdr); + } else { + print_request(m, rq, "\t\tELSP[%d] rq: "); + } + } + drm_printf(m, "\t\tHW active? 0x%x\n", execlists->active); + spin_unlock_irqrestore(&engine->active.lock, flags); + } else if (INTEL_GEN(dev_priv) > 6) { + drm_printf(m, "\tPP_DIR_BASE: 0x%08x\n", + ENGINE_READ(engine, RING_PP_DIR_BASE)); + drm_printf(m, "\tPP_DIR_BASE_READ: 0x%08x\n", + ENGINE_READ(engine, RING_PP_DIR_BASE_READ)); + drm_printf(m, "\tPP_DIR_DCLV: 0x%08x\n", + ENGINE_READ(engine, RING_PP_DIR_DCLV)); + } +} + +static void print_request_ring(struct drm_printer *m, struct i915_request *rq) +{ + void *ring; + int size; + + drm_printf(m, + "[head %04x, postfix %04x, tail %04x, batch 0x%08x_%08x]:\n", + rq->head, rq->postfix, rq->tail, + rq->batch ? upper_32_bits(rq->batch->node.start) : ~0u, + rq->batch ? lower_32_bits(rq->batch->node.start) : ~0u); + + size = rq->tail - rq->head; + if (rq->tail < rq->head) + size += rq->ring->size; + + ring = kmalloc(size, GFP_ATOMIC); + if (ring) { + const void *vaddr = rq->ring->vaddr; + unsigned int head = rq->head; + unsigned int len = 0; + + if (rq->tail < head) { + len = rq->ring->size - head; + memcpy(ring, vaddr + head, len); + head = 0; + } + memcpy(ring + len, vaddr + head, size - len); + + hexdump(m, ring, size); + kfree(ring); + } +} + +void intel_engine_dump(struct intel_engine_cs *engine, + struct drm_printer *m, + const char *header, ...) +{ + struct i915_gpu_error * const error = &engine->i915->gpu_error; + struct i915_request *rq; + intel_wakeref_t wakeref; + + if (header) { + va_list ap; + + va_start(ap, header); + drm_vprintf(m, header, &ap); + va_end(ap); + } + + if (i915_reset_failed(engine->i915)) + drm_printf(m, "*** WEDGED ***\n"); + + drm_printf(m, "\tAwake? 
%d\n", atomic_read(&engine->wakeref.count)); + drm_printf(m, "\tHangcheck: %d ms ago\n", + jiffies_to_msecs(jiffies - engine->hangcheck.action_timestamp)); + drm_printf(m, "\tReset count: %d (global %d)\n", + i915_reset_engine_count(error, engine), + i915_reset_count(error)); + + rcu_read_lock(); + + drm_printf(m, "\tRequests:\n"); + + rq = intel_engine_find_active_request(engine); + if (rq) { + print_request(m, rq, "\t\tactive "); + + drm_printf(m, "\t\tring->start: 0x%08x\n", + i915_ggtt_offset(rq->ring->vma)); + drm_printf(m, "\t\tring->head: 0x%08x\n", + rq->ring->head); + drm_printf(m, "\t\tring->tail: 0x%08x\n", + rq->ring->tail); + drm_printf(m, "\t\tring->emit: 0x%08x\n", + rq->ring->emit); + drm_printf(m, "\t\tring->space: 0x%08x\n", + rq->ring->space); + drm_printf(m, "\t\tring->hwsp: 0x%08x\n", + rq->timeline->hwsp_offset); + + print_request_ring(m, rq); + } + + rcu_read_unlock(); + + wakeref = intel_runtime_pm_get_if_in_use(&engine->i915->runtime_pm); + if (wakeref) { + intel_engine_print_registers(engine, m); + intel_runtime_pm_put(&engine->i915->runtime_pm, wakeref); + } else { + drm_printf(m, "\tDevice is asleep; skipping register dump\n"); + } + + intel_execlists_show_requests(engine, m, print_request, 8); + + drm_printf(m, "HWSP:\n"); + hexdump(m, engine->status_page.addr, PAGE_SIZE); + + drm_printf(m, "Idle? %s\n", yesno(intel_engine_is_idle(engine))); + + intel_engine_print_breadcrumbs(engine, m); +} + +static u8 user_class_map[] = { + [I915_ENGINE_CLASS_RENDER] = RENDER_CLASS, + [I915_ENGINE_CLASS_COPY] = COPY_ENGINE_CLASS, + [I915_ENGINE_CLASS_VIDEO] = VIDEO_DECODE_CLASS, + [I915_ENGINE_CLASS_VIDEO_ENHANCE] = VIDEO_ENHANCEMENT_CLASS, +}; + +struct intel_engine_cs * +intel_engine_lookup_user(struct drm_i915_private *i915, u8 class, u8 instance) +{ + if (class >= ARRAY_SIZE(user_class_map)) + return NULL; + + class = user_class_map[class]; + + GEM_BUG_ON(class > MAX_ENGINE_CLASS); + + if (instance > MAX_ENGINE_INSTANCE) + return NULL; + + return i915->engine_class[class][instance]; +} + +/** + * intel_enable_engine_stats() - Enable engine busy tracking on engine + * @engine: engine to enable stats collection + * + * Start collecting the engine busyness data for @engine. + * + * Returns 0 on success or a negative error code. + */ +int intel_enable_engine_stats(struct intel_engine_cs *engine) +{ + struct intel_engine_execlists *execlists = &engine->execlists; + unsigned long flags; + int err = 0; + + if (!intel_engine_supports_stats(engine)) + return -ENODEV; + + spin_lock_irqsave(&engine->active.lock, flags); + write_seqlock(&engine->stats.lock); + + if (unlikely(engine->stats.enabled == ~0)) { + err = -EBUSY; + goto unlock; + } + + if (engine->stats.enabled++ == 0) { + const struct execlist_port *port = execlists->port; + unsigned int num_ports = execlists_num_ports(execlists); + + engine->stats.enabled_at = ktime_get(); + + /* XXX submission method oblivious? */ + while (num_ports-- && port_isset(port)) { + engine->stats.active++; + port++; + } + + if (engine->stats.active) + engine->stats.start = engine->stats.enabled_at; + } + +unlock: + write_sequnlock(&engine->stats.lock); + spin_unlock_irqrestore(&engine->active.lock, flags); + + return err; +} + +static ktime_t __intel_engine_get_busy_time(struct intel_engine_cs *engine) +{ + ktime_t total = engine->stats.total; + + /* + * If the engine is executing something at the moment + * add it to the total. 
+ */ + if (engine->stats.active) + total = ktime_add(total, + ktime_sub(ktime_get(), engine->stats.start)); + + return total; +} + +/** + * intel_engine_get_busy_time() - Return current accumulated engine busyness + * @engine: engine to report on + * + * Returns accumulated time @engine was busy since engine stats were enabled. + */ +ktime_t intel_engine_get_busy_time(struct intel_engine_cs *engine) +{ + unsigned int seq; + ktime_t total; + + do { + seq = read_seqbegin(&engine->stats.lock); + total = __intel_engine_get_busy_time(engine); + } while (read_seqretry(&engine->stats.lock, seq)); + + return total; +} + +/** + * intel_disable_engine_stats() - Disable engine busy tracking on engine + * @engine: engine to disable stats collection + * + * Stops collecting the engine busyness data for @engine. + */ +void intel_disable_engine_stats(struct intel_engine_cs *engine) +{ + unsigned long flags; + + if (!intel_engine_supports_stats(engine)) + return; + + write_seqlock_irqsave(&engine->stats.lock, flags); + WARN_ON_ONCE(engine->stats.enabled == 0); + if (--engine->stats.enabled == 0) { + engine->stats.total = __intel_engine_get_busy_time(engine); + engine->stats.active = 0; + } + write_sequnlock_irqrestore(&engine->stats.lock, flags); +} + +static bool match_ring(struct i915_request *rq) +{ + u32 ring = ENGINE_READ(rq->engine, RING_START); + + return ring == i915_ggtt_offset(rq->ring->vma); +} + +struct i915_request * +intel_engine_find_active_request(struct intel_engine_cs *engine) +{ + struct i915_request *request, *active = NULL; + unsigned long flags; + + /* + * We are called by the error capture, reset and to dump engine + * state at random points in time. In particular, note that neither is + * crucially ordered with an interrupt. After a hang, the GPU is dead + * and we assume that no more writes can happen (we waited long enough + * for all writes that were in transaction to be flushed) - adding an + * extra delay for a recent interrupt is pointless. Hence, we do + * not need an engine->irq_seqno_barrier() before the seqno reads. + * At all other times, we must assume the GPU is still running, but + * we only care about the snapshot of this moment. + */ + spin_lock_irqsave(&engine->active.lock, flags); + list_for_each_entry(request, &engine->active.requests, sched.link) { + if (i915_request_completed(request)) + continue; + + if (!i915_request_started(request)) + continue; + + /* More than one preemptible request may match! */ + if (!match_ring(request)) + continue; + + active = request; + break; + } + spin_unlock_irqrestore(&engine->active.lock, flags); + + return active; +} + +#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) +#include "selftest_engine_cs.c" +#endif diff --git a/drivers/gpu/drm/i915/gt/intel_engine_pm.c b/drivers/gpu/drm/i915/gt/intel_engine_pm.c new file mode 100644 index 000000000000..2ce00d3dc42a --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_engine_pm.c @@ -0,0 +1,168 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2019 Intel Corporation + */ + +#include "i915_drv.h" + +#include "intel_engine.h" +#include "intel_engine_pm.h" +#include "intel_gt_pm.h" + +static int __engine_unpark(struct intel_wakeref *wf) +{ + struct intel_engine_cs *engine = + container_of(wf, typeof(*engine), wakeref); + void *map; + + GEM_TRACE("%s\n", engine->name); + + intel_gt_pm_get(engine->i915); + + /* Pin the default state for fast resets from atomic context. 
*/ + map = NULL; + if (engine->default_state) + map = i915_gem_object_pin_map(engine->default_state, + I915_MAP_WB); + if (!IS_ERR_OR_NULL(map)) + engine->pinned_default_state = map; + + if (engine->unpark) + engine->unpark(engine); + + intel_engine_init_hangcheck(engine); + return 0; +} + +void intel_engine_pm_get(struct intel_engine_cs *engine) +{ + intel_wakeref_get(&engine->i915->runtime_pm, &engine->wakeref, __engine_unpark); +} + +void intel_engine_park(struct intel_engine_cs *engine) +{ + /* + * We are committed now to parking this engine, make sure there + * will be no more interrupts arriving later and the engine + * is truly idle. + */ + if (wait_for(intel_engine_is_idle(engine), 10)) { + struct drm_printer p = drm_debug_printer(__func__); + + dev_err(engine->i915->drm.dev, + "%s is not idle before parking\n", + engine->name); + intel_engine_dump(engine, &p, NULL); + } +} + +static bool switch_to_kernel_context(struct intel_engine_cs *engine) +{ + struct i915_request *rq; + + /* Already inside the kernel context, safe to power down. */ + if (engine->wakeref_serial == engine->serial) + return true; + + /* GPU is pointing to the void, as good as in the kernel context. */ + if (i915_reset_failed(engine->i915)) + return true; + + /* + * Note, we do this without taking the timeline->mutex. We cannot + * as we may be called while retiring the kernel context and so + * already underneath the timeline->mutex. Instead we rely on the + * exclusive property of the __engine_park that prevents anyone + * else from creating a request on this engine. This also requires + * that the ring is empty and we avoid any waits while constructing + * the context, as they assume protection by the timeline->mutex. + * This should hold true as we can only park the engine after + * retiring the last request, thus all rings should be empty and + * all timelines idle. + */ + rq = __i915_request_create(engine->kernel_context, GFP_NOWAIT); + if (IS_ERR(rq)) + /* Context switch failed, hope for the best! Maybe reset? */ + return true; + + /* Check again on the next retirement. */ + engine->wakeref_serial = engine->serial + 1; + + i915_request_add_barriers(rq); + __i915_request_commit(rq); + + return false; +} + +static int __engine_park(struct intel_wakeref *wf) +{ + struct intel_engine_cs *engine = + container_of(wf, typeof(*engine), wakeref); + + engine->saturated = 0; + + /* + * If one and only one request is completed between pm events, + * we know that we are inside the kernel context and it is + * safe to power down. (We are paranoid in case that runtime + * suspend causes corruption to the active context image, and + * want to avoid that impacting userspace.) + */ + if (!switch_to_kernel_context(engine)) + return -EBUSY; + + GEM_TRACE("%s\n", engine->name); + + intel_engine_disarm_breadcrumbs(engine); + + /* Must be reset upon idling, or we may miss the busy wakeup. 
*/ + GEM_BUG_ON(engine->execlists.queue_priority_hint != INT_MIN); + + if (engine->park) + engine->park(engine); + + if (engine->pinned_default_state) { + i915_gem_object_unpin_map(engine->default_state); + engine->pinned_default_state = NULL; + } + + engine->execlists.no_priolist = false; + + intel_gt_pm_put(engine->i915); + return 0; +} + +void intel_engine_pm_put(struct intel_engine_cs *engine) +{ + intel_wakeref_put(&engine->i915->runtime_pm, &engine->wakeref, __engine_park); +} + +void intel_engine_init__pm(struct intel_engine_cs *engine) +{ + intel_wakeref_init(&engine->wakeref); +} + +int intel_engines_resume(struct drm_i915_private *i915) +{ + struct intel_engine_cs *engine; + enum intel_engine_id id; + int err = 0; + + intel_gt_pm_get(i915); + for_each_engine(engine, i915, id) { + intel_engine_pm_get(engine); + engine->serial++; /* kernel context lost */ + err = engine->resume(engine); + intel_engine_pm_put(engine); + if (err) { + dev_err(i915->drm.dev, + "Failed to restart %s (%d)\n", + engine->name, err); + break; + } + } + intel_gt_pm_put(i915); + + return err; +} diff --git a/drivers/gpu/drm/i915/gt/intel_engine_pm.h b/drivers/gpu/drm/i915/gt/intel_engine_pm.h new file mode 100644 index 000000000000..b326cd993d60 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_engine_pm.h @@ -0,0 +1,22 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2019 Intel Corporation + */ + +#ifndef INTEL_ENGINE_PM_H +#define INTEL_ENGINE_PM_H + +struct drm_i915_private; +struct intel_engine_cs; + +void intel_engine_pm_get(struct intel_engine_cs *engine); +void intel_engine_pm_put(struct intel_engine_cs *engine); + +void intel_engine_park(struct intel_engine_cs *engine); + +void intel_engine_init__pm(struct intel_engine_cs *engine); + +int intel_engines_resume(struct drm_i915_private *i915); + +#endif /* INTEL_ENGINE_PM_H */ diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h new file mode 100644 index 000000000000..868b220214f8 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h @@ -0,0 +1,568 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2019 Intel Corporation + */ + +#ifndef __INTEL_ENGINE_TYPES__ +#define __INTEL_ENGINE_TYPES__ + +#include <linux/hashtable.h> +#include <linux/irq_work.h> +#include <linux/kref.h> +#include <linux/list.h> +#include <linux/llist.h> +#include <linux/types.h> + +#include "i915_gem.h" +#include "i915_gem_batch_pool.h" +#include "i915_pmu.h" +#include "i915_priolist_types.h" +#include "i915_selftest.h" +#include "i915_timeline_types.h" +#include "intel_sseu.h" +#include "intel_wakeref.h" +#include "intel_workarounds_types.h" + +#define I915_MAX_SLICES 3 +#define I915_MAX_SUBSLICES 8 + +#define I915_CMD_HASH_ORDER 9 + +struct dma_fence; +struct drm_i915_gem_object; +struct drm_i915_reg_table; +struct i915_gem_context; +struct i915_request; +struct i915_sched_attr; +struct intel_uncore; + +typedef u8 intel_engine_mask_t; +#define ALL_ENGINES ((intel_engine_mask_t)~0ul) + +struct intel_hw_status_page { + struct i915_vma *vma; + u32 *addr; +}; + +struct intel_instdone { + u32 instdone; + /* The following exist only in the RCS engine */ + u32 slice_common; + u32 sampler[I915_MAX_SLICES][I915_MAX_SUBSLICES]; + u32 row[I915_MAX_SLICES][I915_MAX_SUBSLICES]; +}; + +struct intel_engine_hangcheck { + u64 acthd; + u32 last_ring; + u32 last_head; + unsigned long action_timestamp; + struct intel_instdone instdone; +}; + +struct intel_ring { + struct kref ref; + struct i915_vma *vma; + void 
*vaddr; + + struct i915_timeline *timeline; + struct list_head request_list; + struct list_head active_link; + + u32 head; + u32 tail; + u32 emit; + + u32 space; + u32 size; + u32 effective_size; +}; + +/* + * we use a single page to load ctx workarounds so all of these + * values are referred in terms of dwords + * + * struct i915_wa_ctx_bb: + * offset: specifies batch starting position, also helpful in case + * if we want to have multiple batches at different offsets based on + * some criteria. It is not a requirement at the moment but provides + * an option for future use. + * size: size of the batch in DWORDS + */ +struct i915_ctx_workarounds { + struct i915_wa_ctx_bb { + u32 offset; + u32 size; + } indirect_ctx, per_ctx; + struct i915_vma *vma; +}; + +#define I915_MAX_VCS 4 +#define I915_MAX_VECS 2 + +/* + * Engine IDs definitions. + * Keep instances of the same type engine together. + */ +enum intel_engine_id { + RCS0 = 0, + BCS0, + VCS0, + VCS1, + VCS2, + VCS3, +#define _VCS(n) (VCS0 + (n)) + VECS0, + VECS1, +#define _VECS(n) (VECS0 + (n)) + I915_NUM_ENGINES +}; + +struct st_preempt_hang { + struct completion completion; + unsigned int count; + bool inject_hang; +}; + +/** + * struct intel_engine_execlists - execlist submission queue and port state + * + * The struct intel_engine_execlists represents the combined logical state of + * driver and the hardware state for execlist mode of submission. + */ +struct intel_engine_execlists { + /** + * @tasklet: softirq tasklet for bottom handler + */ + struct tasklet_struct tasklet; + + /** + * @default_priolist: priority list for I915_PRIORITY_NORMAL + */ + struct i915_priolist default_priolist; + + /** + * @no_priolist: priority lists disabled + */ + bool no_priolist; + + /** + * @submit_reg: gen-specific execlist submission register + * set to the ExecList Submission Port (elsp) register pre-Gen11 and to + * the ExecList Submission Queue Contents register array for Gen11+ + */ + u32 __iomem *submit_reg; + + /** + * @ctrl_reg: the enhanced execlists control register, used to load the + * submit queue on the HW and to request preemptions to idle + */ + u32 __iomem *ctrl_reg; + + /** + * @port: execlist port states + * + * For each hardware ELSP (ExecList Submission Port) we keep + * track of the last request and the number of times we submitted + * that port to hw. We then count the number of times the hw reports + * a context completion or preemption. As only one context can + * be active on hw, we limit resubmission of context to port[0]. This + * is called Lite Restore, of the context. + */ + struct execlist_port { + /** + * @request_count: combined request and submission count + */ + struct i915_request *request_count; +#define EXECLIST_COUNT_BITS 2 +#define port_request(p) ptr_mask_bits((p)->request_count, EXECLIST_COUNT_BITS) +#define port_count(p) ptr_unmask_bits((p)->request_count, EXECLIST_COUNT_BITS) +#define port_pack(rq, count) ptr_pack_bits(rq, count, EXECLIST_COUNT_BITS) +#define port_unpack(p, count) ptr_unpack_bits((p)->request_count, count, EXECLIST_COUNT_BITS) +#define port_set(p, packed) ((p)->request_count = (packed)) +#define port_isset(p) ((p)->request_count) +#define port_index(p, execlists) ((p) - (execlists)->port) + + /** + * @context_id: context ID for port + */ + GEM_DEBUG_DECL(u32 context_id); + +#define EXECLIST_MAX_PORTS 2 + } port[EXECLIST_MAX_PORTS]; + + /** + * @active: is the HW active? 
We consider the HW as active after + * submitting any context for execution and until we have seen the + * last context completion event. After that, we do not expect any + * more events until we submit, and so can park the HW. + * + * As we have a small number of different sources from which we feed + * the HW, we track the state of each inside a single bitfield. + */ + unsigned int active; +#define EXECLISTS_ACTIVE_USER 0 +#define EXECLISTS_ACTIVE_PREEMPT 1 +#define EXECLISTS_ACTIVE_HWACK 2 + + /** + * @port_mask: number of execlist ports - 1 + */ + unsigned int port_mask; + + /** + * @queue_priority_hint: Highest pending priority. + * + * When we add requests into the queue, or adjust the priority of + * executing requests, we compute the maximum priority of those + * pending requests. We can then use this value to determine if + * we need to preempt the executing requests to service the queue. + * However, since the we may have recorded the priority of an inflight + * request we wanted to preempt but since completed, at the time of + * dequeuing the priority hint may no longer may match the highest + * available request priority. + */ + int queue_priority_hint; + + /** + * @queue: queue of requests, in priority lists + */ + struct rb_root_cached queue; + struct rb_root_cached virtual; + + /** + * @csb_write: control register for Context Switch buffer + * + * Note this register may be either mmio or HWSP shadow. + */ + u32 *csb_write; + + /** + * @csb_status: status array for Context Switch buffer + * + * Note these register may be either mmio or HWSP shadow. + */ + u32 *csb_status; + + /** + * @preempt_complete_status: expected CSB upon completing preemption + */ + u32 preempt_complete_status; + + /** + * @csb_size: context status buffer FIFO size + */ + u8 csb_size; + + /** + * @csb_head: context status buffer head + */ + u8 csb_head; + + I915_SELFTEST_DECLARE(struct st_preempt_hang preempt_hang;) +}; + +#define INTEL_ENGINE_CS_MAX_NAME 8 + +struct intel_engine_cs { + struct drm_i915_private *i915; + struct intel_uncore *uncore; + char name[INTEL_ENGINE_CS_MAX_NAME]; + + enum intel_engine_id id; + unsigned int hw_id; + unsigned int guc_id; + intel_engine_mask_t mask; + + u8 uabi_class; + + u8 class; + u8 instance; + u32 context_size; + u32 mmio_base; + + u32 uabi_capabilities; + + struct intel_sseu sseu; + + struct intel_ring *buffer; + + struct { + spinlock_t lock; + struct list_head requests; + } active; + + struct llist_head barrier_tasks; + + struct intel_context *kernel_context; /* pinned */ + struct intel_context *preempt_context; /* pinned; optional */ + + intel_engine_mask_t saturated; /* submitting semaphores too late? */ + + unsigned long serial; + + unsigned long wakeref_serial; + struct intel_wakeref wakeref; + struct drm_i915_gem_object *default_state; + void *pinned_default_state; + + /* Rather than have every client wait upon all user interrupts, + * with the herd waking after every interrupt and each doing the + * heavyweight seqno dance, we delegate the task (of being the + * bottom-half of the user interrupt) to the first client. After + * every interrupt, we wake up one client, who does the heavyweight + * coherent seqno read and either goes back to sleep (if incomplete), + * or wakes up all the completed clients in parallel, before then + * transferring the bottom-half status to the next client in the queue. 
+ * + * Compared to walking the entire list of waiters in a single dedicated + * bottom-half, we reduce the latency of the first waiter by avoiding + * a context switch, but incur additional coherent seqno reads when + * following the chain of request breadcrumbs. Since it is most likely + * that we have a single client waiting on each seqno, then reducing + * the overhead of waking that client is much preferred. + */ + struct intel_breadcrumbs { + spinlock_t irq_lock; + struct list_head signalers; + + struct irq_work irq_work; /* for use from inside irq_lock */ + + unsigned int irq_enabled; + + bool irq_armed; + } breadcrumbs; + + struct intel_engine_pmu { + /** + * @enable: Bitmask of enable sample events on this engine. + * + * Bits correspond to sample event types, for instance + * I915_SAMPLE_QUEUED is bit 0 etc. + */ + u32 enable; + /** + * @enable_count: Reference count for the enabled samplers. + * + * Index number corresponds to @enum drm_i915_pmu_engine_sample. + */ + unsigned int enable_count[I915_ENGINE_SAMPLE_COUNT]; + /** + * @sample: Counter values for sampling events. + * + * Our internal timer stores the current counters in this field. + * + * Index number corresponds to @enum drm_i915_pmu_engine_sample. + */ + struct i915_pmu_sample sample[I915_ENGINE_SAMPLE_COUNT]; + } pmu; + + /* + * A pool of objects to use as shadow copies of client batch buffers + * when the command parser is enabled. Prevents the client from + * modifying the batch contents after software parsing. + */ + struct i915_gem_batch_pool batch_pool; + + struct intel_hw_status_page status_page; + struct i915_ctx_workarounds wa_ctx; + struct i915_wa_list ctx_wa_list; + struct i915_wa_list wa_list; + struct i915_wa_list whitelist; + + u32 irq_keep_mask; /* always keep these interrupts */ + u32 irq_enable_mask; /* bitmask to enable ring interrupt */ + void (*irq_enable)(struct intel_engine_cs *engine); + void (*irq_disable)(struct intel_engine_cs *engine); + + int (*resume)(struct intel_engine_cs *engine); + + struct { + void (*prepare)(struct intel_engine_cs *engine); + void (*reset)(struct intel_engine_cs *engine, bool stalled); + void (*finish)(struct intel_engine_cs *engine); + } reset; + + void (*park)(struct intel_engine_cs *engine); + void (*unpark)(struct intel_engine_cs *engine); + + void (*set_default_submission)(struct intel_engine_cs *engine); + + const struct intel_context_ops *cops; + + int (*request_alloc)(struct i915_request *rq); + int (*init_context)(struct i915_request *rq); + + int (*emit_flush)(struct i915_request *request, u32 mode); +#define EMIT_INVALIDATE BIT(0) +#define EMIT_FLUSH BIT(1) +#define EMIT_BARRIER (EMIT_INVALIDATE | EMIT_FLUSH) + int (*emit_bb_start)(struct i915_request *rq, + u64 offset, u32 length, + unsigned int dispatch_flags); +#define I915_DISPATCH_SECURE BIT(0) +#define I915_DISPATCH_PINNED BIT(1) + int (*emit_init_breadcrumb)(struct i915_request *rq); + u32 *(*emit_fini_breadcrumb)(struct i915_request *rq, + u32 *cs); + unsigned int emit_fini_breadcrumb_dw; + + /* Pass the request to the hardware queue (e.g. directly into + * the legacy ringbuffer or to the end of an execlist). + * + * This is called from an atomic context with irqs disabled; must + * be irq safe. + */ + void (*submit_request)(struct i915_request *rq); + + /* + * Called on signaling of a SUBMIT_FENCE, passing along the signaling + * request down to the bonded pairs. 
+ */ + void (*bond_execute)(struct i915_request *rq, + struct dma_fence *signal); + + /* + * Call when the priority on a request has changed and it and its + * dependencies may need rescheduling. Note the request itself may + * not be ready to run! + */ + void (*schedule)(struct i915_request *request, + const struct i915_sched_attr *attr); + + /* + * Cancel all requests on the hardware, or queued for execution. + * This should only cancel the ready requests that have been + * submitted to the engine (via the engine->submit_request callback). + * This is called when marking the device as wedged. + */ + void (*cancel_requests)(struct intel_engine_cs *engine); + + void (*destroy)(struct intel_engine_cs *engine); + + struct intel_engine_execlists execlists; + + /* status_notifier: list of callbacks for context-switch changes */ + struct atomic_notifier_head context_status_notifier; + + struct intel_engine_hangcheck hangcheck; + +#define I915_ENGINE_NEEDS_CMD_PARSER BIT(0) +#define I915_ENGINE_SUPPORTS_STATS BIT(1) +#define I915_ENGINE_HAS_PREEMPTION BIT(2) +#define I915_ENGINE_HAS_SEMAPHORES BIT(3) +#define I915_ENGINE_NEEDS_BREADCRUMB_TASKLET BIT(4) +#define I915_ENGINE_IS_VIRTUAL BIT(5) + unsigned int flags; + + /* + * Table of commands the command parser needs to know about + * for this engine. + */ + DECLARE_HASHTABLE(cmd_hash, I915_CMD_HASH_ORDER); + + /* + * Table of registers allowed in commands that read/write registers. + */ + const struct drm_i915_reg_table *reg_tables; + int reg_table_count; + + /* + * Returns the bitmask for the length field of the specified command. + * Return 0 for an unrecognized/invalid command. + * + * If the command parser finds an entry for a command in the engine's + * cmd_tables, it gets the command's length based on the table entry. + * If not, it calls this function to determine the per-engine length + * field encoding for the command (i.e. different opcode ranges use + * certain bits to encode the command length in the header). + */ + u32 (*get_cmd_length_mask)(u32 cmd_header); + + struct { + /** + * @lock: Lock protecting the below fields. + */ + seqlock_t lock; + /** + * @enabled: Reference count indicating number of listeners. + */ + unsigned int enabled; + /** + * @active: Number of contexts currently scheduled in. + */ + unsigned int active; + /** + * @enabled_at: Timestamp when busy stats were enabled. + */ + ktime_t enabled_at; + /** + * @start: Timestamp of the last idle to active transition. + * + * Idle is defined as active == 0, active is active > 0. + */ + ktime_t start; + /** + * @total: Total time this engine was busy. + * + * Accumulated time not counting the most recent block in cases + * where engine is currently busy (active > 0). 
+ */ + ktime_t total; + } stats; +}; + +static inline bool +intel_engine_needs_cmd_parser(const struct intel_engine_cs *engine) +{ + return engine->flags & I915_ENGINE_NEEDS_CMD_PARSER; +} + +static inline bool +intel_engine_supports_stats(const struct intel_engine_cs *engine) +{ + return engine->flags & I915_ENGINE_SUPPORTS_STATS; +} + +static inline bool +intel_engine_has_preemption(const struct intel_engine_cs *engine) +{ + return engine->flags & I915_ENGINE_HAS_PREEMPTION; +} + +static inline bool +intel_engine_has_semaphores(const struct intel_engine_cs *engine) +{ + return engine->flags & I915_ENGINE_HAS_SEMAPHORES; +} + +static inline bool +intel_engine_needs_breadcrumb_tasklet(const struct intel_engine_cs *engine) +{ + return engine->flags & I915_ENGINE_NEEDS_BREADCRUMB_TASKLET; +} + +static inline bool +intel_engine_is_virtual(const struct intel_engine_cs *engine) +{ + return engine->flags & I915_ENGINE_IS_VIRTUAL; +} + +#define instdone_slice_mask(dev_priv__) \ + (IS_GEN(dev_priv__, 7) ? \ + 1 : RUNTIME_INFO(dev_priv__)->sseu.slice_mask) + +#define instdone_subslice_mask(dev_priv__) \ + (IS_GEN(dev_priv__, 7) ? \ + 1 : RUNTIME_INFO(dev_priv__)->sseu.subslice_mask[0]) + +#define for_each_instdone_slice_subslice(dev_priv__, slice__, subslice__) \ + for ((slice__) = 0, (subslice__) = 0; \ + (slice__) < I915_MAX_SLICES; \ + (subslice__) = ((subslice__) + 1) < I915_MAX_SUBSLICES ? (subslice__) + 1 : 0, \ + (slice__) += ((subslice__) == 0)) \ + for_each_if((BIT(slice__) & instdone_slice_mask(dev_priv__)) && \ + (BIT(subslice__) & instdone_subslice_mask(dev_priv__))) + +#endif /* __INTEL_ENGINE_TYPES_H__ */ diff --git a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h new file mode 100644 index 000000000000..eec31e36aca7 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h @@ -0,0 +1,279 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright � 2003-2018 Intel Corporation + */ + +#ifndef _INTEL_GPU_COMMANDS_H_ +#define _INTEL_GPU_COMMANDS_H_ + +/* + * Instruction field definitions used by the command parser + */ +#define INSTR_CLIENT_SHIFT 29 +#define INSTR_MI_CLIENT 0x0 +#define INSTR_BC_CLIENT 0x2 +#define INSTR_RC_CLIENT 0x3 +#define INSTR_SUBCLIENT_SHIFT 27 +#define INSTR_SUBCLIENT_MASK 0x18000000 +#define INSTR_MEDIA_SUBCLIENT 0x2 +#define INSTR_26_TO_24_MASK 0x7000000 +#define INSTR_26_TO_24_SHIFT 24 + +/* + * Memory interface instructions used by the kernel + */ +#define MI_INSTR(opcode, flags) (((opcode) << 23) | (flags)) +/* Many MI commands use bit 22 of the header dword for GGTT vs PPGTT */ +#define MI_GLOBAL_GTT (1<<22) + +#define MI_NOOP MI_INSTR(0, 0) +#define MI_USER_INTERRUPT MI_INSTR(0x02, 0) +#define MI_WAIT_FOR_EVENT MI_INSTR(0x03, 0) +#define MI_WAIT_FOR_OVERLAY_FLIP (1<<16) +#define MI_WAIT_FOR_PLANE_B_FLIP (1<<6) +#define MI_WAIT_FOR_PLANE_A_FLIP (1<<2) +#define MI_WAIT_FOR_PLANE_A_SCANLINES (1<<1) +#define MI_FLUSH MI_INSTR(0x04, 0) +#define MI_READ_FLUSH (1 << 0) +#define MI_EXE_FLUSH (1 << 1) +#define MI_NO_WRITE_FLUSH (1 << 2) +#define MI_SCENE_COUNT (1 << 3) /* just increment scene count */ +#define MI_END_SCENE (1 << 4) /* flush binner and incr scene count */ +#define MI_INVALIDATE_ISP (1 << 5) /* invalidate indirect state pointers */ +#define MI_REPORT_HEAD MI_INSTR(0x07, 0) +#define MI_ARB_ON_OFF MI_INSTR(0x08, 0) +#define MI_ARB_ENABLE (1<<0) +#define MI_ARB_DISABLE (0<<0) +#define MI_BATCH_BUFFER_END MI_INSTR(0x0a, 0) +#define MI_SUSPEND_FLUSH MI_INSTR(0x0b, 0) +#define MI_SUSPEND_FLUSH_EN (1<<0) 
+#define MI_SET_APPID MI_INSTR(0x0e, 0) +#define MI_OVERLAY_FLIP MI_INSTR(0x11, 0) +#define MI_OVERLAY_CONTINUE (0x0<<21) +#define MI_OVERLAY_ON (0x1<<21) +#define MI_OVERLAY_OFF (0x2<<21) +#define MI_LOAD_SCAN_LINES_INCL MI_INSTR(0x12, 0) +#define MI_DISPLAY_FLIP MI_INSTR(0x14, 2) +#define MI_DISPLAY_FLIP_I915 MI_INSTR(0x14, 1) +#define MI_DISPLAY_FLIP_PLANE(n) ((n) << 20) +/* IVB has funny definitions for which plane to flip. */ +#define MI_DISPLAY_FLIP_IVB_PLANE_A (0 << 19) +#define MI_DISPLAY_FLIP_IVB_PLANE_B (1 << 19) +#define MI_DISPLAY_FLIP_IVB_SPRITE_A (2 << 19) +#define MI_DISPLAY_FLIP_IVB_SPRITE_B (3 << 19) +#define MI_DISPLAY_FLIP_IVB_PLANE_C (4 << 19) +#define MI_DISPLAY_FLIP_IVB_SPRITE_C (5 << 19) +/* SKL ones */ +#define MI_DISPLAY_FLIP_SKL_PLANE_1_A (0 << 8) +#define MI_DISPLAY_FLIP_SKL_PLANE_1_B (1 << 8) +#define MI_DISPLAY_FLIP_SKL_PLANE_1_C (2 << 8) +#define MI_DISPLAY_FLIP_SKL_PLANE_2_A (4 << 8) +#define MI_DISPLAY_FLIP_SKL_PLANE_2_B (5 << 8) +#define MI_DISPLAY_FLIP_SKL_PLANE_2_C (6 << 8) +#define MI_DISPLAY_FLIP_SKL_PLANE_3_A (7 << 8) +#define MI_DISPLAY_FLIP_SKL_PLANE_3_B (8 << 8) +#define MI_DISPLAY_FLIP_SKL_PLANE_3_C (9 << 8) +#define MI_SEMAPHORE_MBOX MI_INSTR(0x16, 1) /* gen6, gen7 */ +#define MI_SEMAPHORE_GLOBAL_GTT (1<<22) +#define MI_SEMAPHORE_UPDATE (1<<21) +#define MI_SEMAPHORE_COMPARE (1<<20) +#define MI_SEMAPHORE_REGISTER (1<<18) +#define MI_SEMAPHORE_SYNC_VR (0<<16) /* RCS wait for VCS (RVSYNC) */ +#define MI_SEMAPHORE_SYNC_VER (1<<16) /* RCS wait for VECS (RVESYNC) */ +#define MI_SEMAPHORE_SYNC_BR (2<<16) /* RCS wait for BCS (RBSYNC) */ +#define MI_SEMAPHORE_SYNC_BV (0<<16) /* VCS wait for BCS (VBSYNC) */ +#define MI_SEMAPHORE_SYNC_VEV (1<<16) /* VCS wait for VECS (VVESYNC) */ +#define MI_SEMAPHORE_SYNC_RV (2<<16) /* VCS wait for RCS (VRSYNC) */ +#define MI_SEMAPHORE_SYNC_RB (0<<16) /* BCS wait for RCS (BRSYNC) */ +#define MI_SEMAPHORE_SYNC_VEB (1<<16) /* BCS wait for VECS (BVESYNC) */ +#define MI_SEMAPHORE_SYNC_VB (2<<16) /* BCS wait for VCS (BVSYNC) */ +#define MI_SEMAPHORE_SYNC_BVE (0<<16) /* VECS wait for BCS (VEBSYNC) */ +#define MI_SEMAPHORE_SYNC_VVE (1<<16) /* VECS wait for VCS (VEVSYNC) */ +#define MI_SEMAPHORE_SYNC_RVE (2<<16) /* VECS wait for RCS (VERSYNC) */ +#define MI_SEMAPHORE_SYNC_INVALID (3<<16) +#define MI_SEMAPHORE_SYNC_MASK (3<<16) +#define MI_SET_CONTEXT MI_INSTR(0x18, 0) +#define MI_MM_SPACE_GTT (1<<8) +#define MI_MM_SPACE_PHYSICAL (0<<8) +#define MI_SAVE_EXT_STATE_EN (1<<3) +#define MI_RESTORE_EXT_STATE_EN (1<<2) +#define MI_FORCE_RESTORE (1<<1) +#define MI_RESTORE_INHIBIT (1<<0) +#define HSW_MI_RS_SAVE_STATE_EN (1<<3) +#define HSW_MI_RS_RESTORE_STATE_EN (1<<2) +#define MI_SEMAPHORE_SIGNAL MI_INSTR(0x1b, 0) /* GEN8+ */ +#define MI_SEMAPHORE_TARGET(engine) ((engine)<<15) +#define MI_SEMAPHORE_WAIT MI_INSTR(0x1c, 2) /* GEN8+ */ +#define MI_SEMAPHORE_POLL (1 << 15) +#define MI_SEMAPHORE_SAD_GT_SDD (0 << 12) +#define MI_SEMAPHORE_SAD_GTE_SDD (1 << 12) +#define MI_SEMAPHORE_SAD_LT_SDD (2 << 12) +#define MI_SEMAPHORE_SAD_LTE_SDD (3 << 12) +#define MI_SEMAPHORE_SAD_EQ_SDD (4 << 12) +#define MI_SEMAPHORE_SAD_NEQ_SDD (5 << 12) +#define MI_STORE_DWORD_IMM MI_INSTR(0x20, 1) +#define MI_STORE_DWORD_IMM_GEN4 MI_INSTR(0x20, 2) +#define MI_MEM_VIRTUAL (1 << 22) /* 945,g33,965 */ +#define MI_USE_GGTT (1 << 22) /* g4x+ */ +#define MI_STORE_DWORD_INDEX MI_INSTR(0x21, 1) +/* + * Official intel docs are somewhat sloppy concerning MI_LOAD_REGISTER_IMM: + * - Always issue a MI_NOOP _before_ the MI_LOAD_REGISTER_IMM - otherwise hw + * simply ignores the 
register load under certain conditions. + * - One can actually load arbitrary many arbitrary registers: Simply issue x + * address/value pairs. Don't overdue it, though, x <= 2^4 must hold! + */ +#define MI_LOAD_REGISTER_IMM(x) MI_INSTR(0x22, 2*(x)-1) +#define MI_LRI_FORCE_POSTED (1<<12) +#define MI_STORE_REGISTER_MEM MI_INSTR(0x24, 1) +#define MI_STORE_REGISTER_MEM_GEN8 MI_INSTR(0x24, 2) +#define MI_SRM_LRM_GLOBAL_GTT (1<<22) +#define MI_FLUSH_DW MI_INSTR(0x26, 1) /* for GEN6 */ +#define MI_FLUSH_DW_STORE_INDEX (1<<21) +#define MI_INVALIDATE_TLB (1<<18) +#define MI_FLUSH_DW_OP_STOREDW (1<<14) +#define MI_FLUSH_DW_OP_MASK (3<<14) +#define MI_FLUSH_DW_NOTIFY (1<<8) +#define MI_INVALIDATE_BSD (1<<7) +#define MI_FLUSH_DW_USE_GTT (1<<2) +#define MI_FLUSH_DW_USE_PPGTT (0<<2) +#define MI_LOAD_REGISTER_MEM MI_INSTR(0x29, 1) +#define MI_LOAD_REGISTER_MEM_GEN8 MI_INSTR(0x29, 2) +#define MI_BATCH_BUFFER MI_INSTR(0x30, 1) +#define MI_BATCH_NON_SECURE (1) +/* for snb/ivb/vlv this also means "batch in ppgtt" when ppgtt is enabled. */ +#define MI_BATCH_NON_SECURE_I965 (1<<8) +#define MI_BATCH_PPGTT_HSW (1<<8) +#define MI_BATCH_NON_SECURE_HSW (1<<13) +#define MI_BATCH_BUFFER_START MI_INSTR(0x31, 0) +#define MI_BATCH_GTT (2<<6) /* aliased with (1<<7) on gen4 */ +#define MI_BATCH_BUFFER_START_GEN8 MI_INSTR(0x31, 1) +#define MI_BATCH_RESOURCE_STREAMER (1<<10) + +/* + * 3D instructions used by the kernel + */ +#define GFX_INSTR(opcode, flags) ((0x3 << 29) | ((opcode) << 24) | (flags)) + +#define GEN9_MEDIA_POOL_STATE ((0x3 << 29) | (0x2 << 27) | (0x5 << 16) | 4) +#define GEN9_MEDIA_POOL_ENABLE (1 << 31) +#define GFX_OP_RASTER_RULES ((0x3<<29)|(0x7<<24)) +#define GFX_OP_SCISSOR ((0x3<<29)|(0x1c<<24)|(0x10<<19)) +#define SC_UPDATE_SCISSOR (0x1<<1) +#define SC_ENABLE_MASK (0x1<<0) +#define SC_ENABLE (0x1<<0) +#define GFX_OP_LOAD_INDIRECT ((0x3<<29)|(0x1d<<24)|(0x7<<16)) +#define GFX_OP_SCISSOR_INFO ((0x3<<29)|(0x1d<<24)|(0x81<<16)|(0x1)) +#define SCI_YMIN_MASK (0xffff<<16) +#define SCI_XMIN_MASK (0xffff<<0) +#define SCI_YMAX_MASK (0xffff<<16) +#define SCI_XMAX_MASK (0xffff<<0) +#define GFX_OP_SCISSOR_ENABLE ((0x3<<29)|(0x1c<<24)|(0x10<<19)) +#define GFX_OP_SCISSOR_RECT ((0x3<<29)|(0x1d<<24)|(0x81<<16)|1) +#define GFX_OP_COLOR_FACTOR ((0x3<<29)|(0x1d<<24)|(0x1<<16)|0x0) +#define GFX_OP_STIPPLE ((0x3<<29)|(0x1d<<24)|(0x83<<16)) +#define GFX_OP_MAP_INFO ((0x3<<29)|(0x1d<<24)|0x4) +#define GFX_OP_DESTBUFFER_VARS ((0x3<<29)|(0x1d<<24)|(0x85<<16)|0x0) +#define GFX_OP_DESTBUFFER_INFO ((0x3<<29)|(0x1d<<24)|(0x8e<<16)|1) +#define GFX_OP_DRAWRECT_INFO ((0x3<<29)|(0x1d<<24)|(0x80<<16)|(0x3)) +#define GFX_OP_DRAWRECT_INFO_I965 ((0x7900<<16)|0x2) + +#define COLOR_BLT_CMD (2<<29 | 0x40<<22 | (5-2)) +#define XY_COLOR_BLT_CMD (2 << 29 | 0x50 << 22) +#define SRC_COPY_BLT_CMD ((2<<29)|(0x43<<22)|4) +#define XY_SRC_COPY_BLT_CMD ((2<<29)|(0x53<<22)|6) +#define XY_MONO_SRC_COPY_IMM_BLT ((2<<29)|(0x71<<22)|5) +#define BLT_WRITE_A (2<<20) +#define BLT_WRITE_RGB (1<<20) +#define BLT_WRITE_RGBA (BLT_WRITE_RGB | BLT_WRITE_A) +#define BLT_DEPTH_8 (0<<24) +#define BLT_DEPTH_16_565 (1<<24) +#define BLT_DEPTH_16_1555 (2<<24) +#define BLT_DEPTH_32 (3<<24) +#define BLT_ROP_SRC_COPY (0xcc<<16) +#define BLT_ROP_COLOR_COPY (0xf0<<16) +#define XY_SRC_COPY_BLT_SRC_TILED (1<<15) /* 965+ only */ +#define XY_SRC_COPY_BLT_DST_TILED (1<<11) /* 965+ only */ +#define CMD_OP_DISPLAYBUFFER_INFO ((0x0<<29)|(0x14<<23)|2) +#define ASYNC_FLIP (1<<22) +#define DISPLAY_PLANE_A (0<<20) +#define DISPLAY_PLANE_B (1<<20) +#define GFX_OP_PIPE_CONTROL(len) 
((0x3<<29)|(0x3<<27)|(0x2<<24)|((len)-2)) +#define PIPE_CONTROL_FLUSH_L3 (1<<27) +#define PIPE_CONTROL_GLOBAL_GTT_IVB (1<<24) /* gen7+ */ +#define PIPE_CONTROL_MMIO_WRITE (1<<23) +#define PIPE_CONTROL_STORE_DATA_INDEX (1<<21) +#define PIPE_CONTROL_CS_STALL (1<<20) +#define PIPE_CONTROL_TLB_INVALIDATE (1<<18) +#define PIPE_CONTROL_MEDIA_STATE_CLEAR (1<<16) +#define PIPE_CONTROL_QW_WRITE (1<<14) +#define PIPE_CONTROL_POST_SYNC_OP_MASK (3<<14) +#define PIPE_CONTROL_DEPTH_STALL (1<<13) +#define PIPE_CONTROL_WRITE_FLUSH (1<<12) +#define PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH (1<<12) /* gen6+ */ +#define PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE (1<<11) /* MBZ on ILK */ +#define PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE (1<<10) /* GM45+ only */ +#define PIPE_CONTROL_INDIRECT_STATE_DISABLE (1<<9) +#define PIPE_CONTROL_NOTIFY (1<<8) +#define PIPE_CONTROL_FLUSH_ENABLE (1<<7) /* gen7+ */ +#define PIPE_CONTROL_DC_FLUSH_ENABLE (1<<5) +#define PIPE_CONTROL_VF_CACHE_INVALIDATE (1<<4) +#define PIPE_CONTROL_CONST_CACHE_INVALIDATE (1<<3) +#define PIPE_CONTROL_STATE_CACHE_INVALIDATE (1<<2) +#define PIPE_CONTROL_STALL_AT_SCOREBOARD (1<<1) +#define PIPE_CONTROL_DEPTH_CACHE_FLUSH (1<<0) +#define PIPE_CONTROL_GLOBAL_GTT (1<<2) /* in addr dword */ + +/* + * Commands used only by the command parser + */ +#define MI_SET_PREDICATE MI_INSTR(0x01, 0) +#define MI_ARB_CHECK MI_INSTR(0x05, 0) +#define MI_RS_CONTROL MI_INSTR(0x06, 0) +#define MI_URB_ATOMIC_ALLOC MI_INSTR(0x09, 0) +#define MI_PREDICATE MI_INSTR(0x0C, 0) +#define MI_RS_CONTEXT MI_INSTR(0x0F, 0) +#define MI_TOPOLOGY_FILTER MI_INSTR(0x0D, 0) +#define MI_LOAD_SCAN_LINES_EXCL MI_INSTR(0x13, 0) +#define MI_URB_CLEAR MI_INSTR(0x19, 0) +#define MI_UPDATE_GTT MI_INSTR(0x23, 0) +#define MI_CLFLUSH MI_INSTR(0x27, 0) +#define MI_REPORT_PERF_COUNT MI_INSTR(0x28, 0) +#define MI_REPORT_PERF_COUNT_GGTT (1<<0) +#define MI_LOAD_REGISTER_REG MI_INSTR(0x2A, 0) +#define MI_RS_STORE_DATA_IMM MI_INSTR(0x2B, 0) +#define MI_LOAD_URB_MEM MI_INSTR(0x2C, 0) +#define MI_STORE_URB_MEM MI_INSTR(0x2D, 0) +#define MI_CONDITIONAL_BATCH_BUFFER_END MI_INSTR(0x36, 0) + +#define PIPELINE_SELECT ((0x3<<29)|(0x1<<27)|(0x1<<24)|(0x4<<16)) +#define GFX_OP_3DSTATE_VF_STATISTICS ((0x3<<29)|(0x1<<27)|(0x0<<24)|(0xB<<16)) +#define MEDIA_VFE_STATE ((0x3<<29)|(0x2<<27)|(0x0<<24)|(0x0<<16)) +#define MEDIA_VFE_STATE_MMIO_ACCESS_MASK (0x18) +#define GPGPU_OBJECT ((0x3<<29)|(0x2<<27)|(0x1<<24)|(0x4<<16)) +#define GPGPU_WALKER ((0x3<<29)|(0x2<<27)|(0x1<<24)|(0x5<<16)) +#define GFX_OP_3DSTATE_DX9_CONSTANTF_VS \ + ((0x3<<29)|(0x3<<27)|(0x0<<24)|(0x39<<16)) +#define GFX_OP_3DSTATE_DX9_CONSTANTF_PS \ + ((0x3<<29)|(0x3<<27)|(0x0<<24)|(0x3A<<16)) +#define GFX_OP_3DSTATE_SO_DECL_LIST \ + ((0x3<<29)|(0x3<<27)|(0x1<<24)|(0x17<<16)) + +#define GFX_OP_3DSTATE_BINDING_TABLE_EDIT_VS \ + ((0x3<<29)|(0x3<<27)|(0x0<<24)|(0x43<<16)) +#define GFX_OP_3DSTATE_BINDING_TABLE_EDIT_GS \ + ((0x3<<29)|(0x3<<27)|(0x0<<24)|(0x44<<16)) +#define GFX_OP_3DSTATE_BINDING_TABLE_EDIT_HS \ + ((0x3<<29)|(0x3<<27)|(0x0<<24)|(0x45<<16)) +#define GFX_OP_3DSTATE_BINDING_TABLE_EDIT_DS \ + ((0x3<<29)|(0x3<<27)|(0x0<<24)|(0x46<<16)) +#define GFX_OP_3DSTATE_BINDING_TABLE_EDIT_PS \ + ((0x3<<29)|(0x3<<27)|(0x0<<24)|(0x47<<16)) + +#define MFX_WAIT ((0x3<<29)|(0x1<<27)|(0x0<<16)) + +#define COLOR_BLT ((0x2<<29)|(0x40<<22)) +#define SRC_COPY_BLT ((0x2<<29)|(0x43<<22)) + +#endif /* _INTEL_GPU_COMMANDS_H_ */ diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm.c b/drivers/gpu/drm/i915/gt/intel_gt_pm.c new file mode 100644 index 000000000000..7b5967751762 --- 
/dev/null +++ b/drivers/gpu/drm/i915/gt/intel_gt_pm.c @@ -0,0 +1,143 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2019 Intel Corporation + */ + +#include "i915_drv.h" +#include "intel_gt_pm.h" +#include "intel_pm.h" +#include "intel_wakeref.h" + +static void pm_notify(struct drm_i915_private *i915, int state) +{ + blocking_notifier_call_chain(&i915->gt.pm_notifications, state, i915); +} + +static int intel_gt_unpark(struct intel_wakeref *wf) +{ + struct drm_i915_private *i915 = + container_of(wf, typeof(*i915), gt.wakeref); + + GEM_TRACE("\n"); + + /* + * It seems that the DMC likes to transition between the DC states a lot + * when there are no connected displays (no active power domains) during + * command submission. + * + * This activity has negative impact on the performance of the chip with + * huge latencies observed in the interrupt handler and elsewhere. + * + * Work around it by grabbing a GT IRQ power domain whilst there is any + * GT activity, preventing any DC state transitions. + */ + i915->gt.awake = intel_display_power_get(i915, POWER_DOMAIN_GT_IRQ); + GEM_BUG_ON(!i915->gt.awake); + + intel_enable_gt_powersave(i915); + + i915_update_gfx_val(i915); + if (INTEL_GEN(i915) >= 6) + gen6_rps_busy(i915); + + i915_pmu_gt_unparked(i915); + + i915_queue_hangcheck(i915); + + pm_notify(i915, INTEL_GT_UNPARK); + + return 0; +} + +void intel_gt_pm_get(struct drm_i915_private *i915) +{ + intel_wakeref_get(&i915->runtime_pm, &i915->gt.wakeref, intel_gt_unpark); +} + +static int intel_gt_park(struct intel_wakeref *wf) +{ + struct drm_i915_private *i915 = + container_of(wf, typeof(*i915), gt.wakeref); + intel_wakeref_t wakeref = fetch_and_zero(&i915->gt.awake); + + GEM_TRACE("\n"); + + pm_notify(i915, INTEL_GT_PARK); + + i915_pmu_gt_parked(i915); + if (INTEL_GEN(i915) >= 6) + gen6_rps_idle(i915); + + GEM_BUG_ON(!wakeref); + intel_display_power_put(i915, POWER_DOMAIN_GT_IRQ, wakeref); + + return 0; +} + +void intel_gt_pm_put(struct drm_i915_private *i915) +{ + intel_wakeref_put(&i915->runtime_pm, &i915->gt.wakeref, intel_gt_park); +} + +void intel_gt_pm_init(struct drm_i915_private *i915) +{ + intel_wakeref_init(&i915->gt.wakeref); + BLOCKING_INIT_NOTIFIER_HEAD(&i915->gt.pm_notifications); +} + +static bool reset_engines(struct drm_i915_private *i915) +{ + if (INTEL_INFO(i915)->gpu_reset_clobbers_display) + return false; + + return intel_gpu_reset(i915, ALL_ENGINES) == 0; +} + +/** + * intel_gt_sanitize: called after the GPU has lost power + * @i915: the i915 device + * @force: ignore a failed reset and sanitize engine state anyway + * + * Anytime we reset the GPU, either with an explicit GPU reset or through a + * PCI power cycle, the GPU loses state and we must reset our state tracking + * to match. Note that calling intel_gt_sanitize() if the GPU has not + * been reset results in much confusion! + */ +void intel_gt_sanitize(struct drm_i915_private *i915, bool force) +{ + struct intel_engine_cs *engine; + enum intel_engine_id id; + + GEM_TRACE("\n"); + + if (!reset_engines(i915) && !force) + return; + + for_each_engine(engine, i915, id) + intel_engine_reset(engine, false); +} + +void intel_gt_resume(struct drm_i915_private *i915) +{ + struct intel_engine_cs *engine; + enum intel_engine_id id; + + /* + * After resume, we may need to poke into the pinned kernel + * contexts to paper over any damage caused by the sudden suspend. + * Only the kernel contexts should remain pinned over suspend, + * allowing us to fixup the user contexts on their first pin. 
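A brief aside before the resume fix-up loop below: the unpark/park callbacks above are only ever reached through the wakeref pair intel_gt_pm_get()/intel_gt_pm_put(). A minimal usage sketch with a hypothetical caller:

	static void submit_something(struct drm_i915_private *i915)
	{
		/* first get runs intel_gt_unpark(): GT_IRQ power domain, RPS, hangcheck */
		intel_gt_pm_get(i915);

		/* ... build and queue requests; i915->gt.awake is valid here ... */

		/* last put runs intel_gt_park() and releases POWER_DOMAIN_GT_IRQ */
		intel_gt_pm_put(i915);
	}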
+ */ + for_each_engine(engine, i915, id) { + struct intel_context *ce; + + ce = engine->kernel_context; + if (ce) + ce->ops->reset(ce); + + ce = engine->preempt_context; + if (ce) + ce->ops->reset(ce); + } +} diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm.h b/drivers/gpu/drm/i915/gt/intel_gt_pm.h new file mode 100644 index 000000000000..7dd1130a19a4 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_gt_pm.h @@ -0,0 +1,27 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2019 Intel Corporation + */ + +#ifndef INTEL_GT_PM_H +#define INTEL_GT_PM_H + +#include <linux/types.h> + +struct drm_i915_private; + +enum { + INTEL_GT_UNPARK, + INTEL_GT_PARK, +}; + +void intel_gt_pm_get(struct drm_i915_private *i915); +void intel_gt_pm_put(struct drm_i915_private *i915); + +void intel_gt_pm_init(struct drm_i915_private *i915); + +void intel_gt_sanitize(struct drm_i915_private *i915, bool force); +void intel_gt_resume(struct drm_i915_private *i915); + +#endif /* INTEL_GT_PM_H */ diff --git a/drivers/gpu/drm/i915/gt/intel_hangcheck.c b/drivers/gpu/drm/i915/gt/intel_hangcheck.c new file mode 100644 index 000000000000..6bcfa6456c45 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_hangcheck.c @@ -0,0 +1,347 @@ +/* + * Copyright © 2016 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + */ + +#include "intel_reset.h" +#include "i915_drv.h" + +struct hangcheck { + u64 acthd; + u32 ring; + u32 head; + enum intel_engine_hangcheck_action action; + unsigned long action_timestamp; + int deadlock; + struct intel_instdone instdone; + bool wedged:1; + bool stalled:1; +}; + +static bool instdone_unchanged(u32 current_instdone, u32 *old_instdone) +{ + u32 tmp = current_instdone | *old_instdone; + bool unchanged; + + unchanged = tmp == *old_instdone; + *old_instdone |= tmp; + + return unchanged; +} + +static bool subunits_stuck(struct intel_engine_cs *engine) +{ + struct drm_i915_private *dev_priv = engine->i915; + struct intel_instdone instdone; + struct intel_instdone *accu_instdone = &engine->hangcheck.instdone; + bool stuck; + int slice; + int subslice; + + if (engine->id != RCS0) + return true; + + intel_engine_get_instdone(engine, &instdone); + + /* There might be unstable subunit states even when + * actual head is not moving. Filter out the unstable ones by + * accumulating the undone -> done transitions and only + * consider those as progress. 
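A small worked example, with made-up bit values, of the accumulation that instdone_unchanged() above performs for the sampling below:

	*old_instdone = 0x6, current = 0x7
		-> tmp = 0x7 != *old_instdone: a unit made an undone -> done
		   transition, so this sample counts as progress (returns false)
		   and the new bit is folded into the accumulator (*old = 0x7).

	*old_instdone = 0x7, current = 0x5
		-> tmp = 0x7 == *old_instdone: no new completions since the last
		   sample, so it returns true and contributes to declaring the
		   subunits stuck.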
+ */ + stuck = instdone_unchanged(instdone.instdone, + &accu_instdone->instdone); + stuck &= instdone_unchanged(instdone.slice_common, + &accu_instdone->slice_common); + + for_each_instdone_slice_subslice(dev_priv, slice, subslice) { + stuck &= instdone_unchanged(instdone.sampler[slice][subslice], + &accu_instdone->sampler[slice][subslice]); + stuck &= instdone_unchanged(instdone.row[slice][subslice], + &accu_instdone->row[slice][subslice]); + } + + return stuck; +} + +static enum intel_engine_hangcheck_action +head_stuck(struct intel_engine_cs *engine, u64 acthd) +{ + if (acthd != engine->hangcheck.acthd) { + + /* Clear subunit states on head movement */ + memset(&engine->hangcheck.instdone, 0, + sizeof(engine->hangcheck.instdone)); + + return ENGINE_ACTIVE_HEAD; + } + + if (!subunits_stuck(engine)) + return ENGINE_ACTIVE_SUBUNITS; + + return ENGINE_DEAD; +} + +static enum intel_engine_hangcheck_action +engine_stuck(struct intel_engine_cs *engine, u64 acthd) +{ + struct drm_i915_private *dev_priv = engine->i915; + enum intel_engine_hangcheck_action ha; + u32 tmp; + + ha = head_stuck(engine, acthd); + if (ha != ENGINE_DEAD) + return ha; + + if (IS_GEN(dev_priv, 2)) + return ENGINE_DEAD; + + /* Is the chip hanging on a WAIT_FOR_EVENT? + * If so we can simply poke the RB_WAIT bit + * and break the hang. This should work on + * all but the second generation chipsets. + */ + tmp = ENGINE_READ(engine, RING_CTL); + if (tmp & RING_WAIT) { + i915_handle_error(dev_priv, engine->mask, 0, + "stuck wait on %s", engine->name); + ENGINE_WRITE(engine, RING_CTL, tmp); + return ENGINE_WAIT_KICK; + } + + return ENGINE_DEAD; +} + +static void hangcheck_load_sample(struct intel_engine_cs *engine, + struct hangcheck *hc) +{ + hc->acthd = intel_engine_get_active_head(engine); + hc->ring = ENGINE_READ(engine, RING_START); + hc->head = ENGINE_READ(engine, RING_HEAD); +} + +static void hangcheck_store_sample(struct intel_engine_cs *engine, + const struct hangcheck *hc) +{ + engine->hangcheck.acthd = hc->acthd; + engine->hangcheck.last_ring = hc->ring; + engine->hangcheck.last_head = hc->head; +} + +static enum intel_engine_hangcheck_action +hangcheck_get_action(struct intel_engine_cs *engine, + const struct hangcheck *hc) +{ + if (intel_engine_is_idle(engine)) + return ENGINE_IDLE; + + if (engine->hangcheck.last_ring != hc->ring) + return ENGINE_ACTIVE_SEQNO; + + if (engine->hangcheck.last_head != hc->head) + return ENGINE_ACTIVE_SEQNO; + + return engine_stuck(engine, hc->acthd); +} + +static void hangcheck_accumulate_sample(struct intel_engine_cs *engine, + struct hangcheck *hc) +{ + unsigned long timeout = I915_ENGINE_DEAD_TIMEOUT; + + hc->action = hangcheck_get_action(engine, hc); + + /* We always increment the progress + * if the engine is busy and still processing + * the same request, so that no single request + * can run indefinitely (such as a chain of + * batches). The only time we do not increment + * the hangcheck score on this ring, if this + * engine is in a legitimate wait for another + * engine. In that case the waiting engine is a + * victim and we want to be sure we catch the + * right culprit. Then every time we do kick + * the ring, make it as a progress as the seqno + * advancement might ensure and if not, it + * will catch the hanging engine. 
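The escalation that the switch below implements, condensed (a restatement of the code, not new behaviour):

	ENGINE_IDLE, ENGINE_ACTIVE_SEQNO,
	ENGINE_WAIT, ENGINE_WAIT_KICK	-> treated as progress, action_timestamp
					   is reset to the current jiffies
	ENGINE_ACTIVE_HEAD,
	ENGINE_ACTIVE_SUBUNITS		-> no seqno progress but the engine is
					   still executing; allow the longer
					   I915_SEQNO_DEAD_TIMEOUT before stalling
	ENGINE_DEAD			-> stalled once I915_ENGINE_DEAD_TIMEOUT
					   has elapsed since action_timestamp

Independently of the stall decision, the engine is marked wedged after I915_ENGINE_WEDGED_TIMEOUT without progress, which the caller escalates to i915_gem_set_wedged().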
+ */ + + switch (hc->action) { + case ENGINE_IDLE: + case ENGINE_ACTIVE_SEQNO: + /* Clear head and subunit states on seqno movement */ + hc->acthd = 0; + + memset(&engine->hangcheck.instdone, 0, + sizeof(engine->hangcheck.instdone)); + + /* Intentional fall through */ + case ENGINE_WAIT_KICK: + case ENGINE_WAIT: + engine->hangcheck.action_timestamp = jiffies; + break; + + case ENGINE_ACTIVE_HEAD: + case ENGINE_ACTIVE_SUBUNITS: + /* + * Seqno stuck with still active engine gets leeway, + * in hopes that it is just a long shader. + */ + timeout = I915_SEQNO_DEAD_TIMEOUT; + break; + + case ENGINE_DEAD: + break; + + default: + MISSING_CASE(hc->action); + } + + hc->stalled = time_after(jiffies, + engine->hangcheck.action_timestamp + timeout); + hc->wedged = time_after(jiffies, + engine->hangcheck.action_timestamp + + I915_ENGINE_WEDGED_TIMEOUT); +} + +static void hangcheck_declare_hang(struct drm_i915_private *i915, + intel_engine_mask_t hung, + intel_engine_mask_t stuck) +{ + struct intel_engine_cs *engine; + intel_engine_mask_t tmp; + char msg[80]; + int len; + + /* If some rings hung but others were still busy, only + * blame the hanging rings in the synopsis. + */ + if (stuck != hung) + hung &= ~stuck; + len = scnprintf(msg, sizeof(msg), + "%s on ", stuck == hung ? "no progress" : "hang"); + for_each_engine_masked(engine, i915, hung, tmp) + len += scnprintf(msg + len, sizeof(msg) - len, + "%s, ", engine->name); + msg[len-2] = '\0'; + + return i915_handle_error(i915, hung, I915_ERROR_CAPTURE, "%s", msg); +} + +/* + * This is called when the chip hasn't reported back with completed + * batchbuffers in a long time. We keep track per ring seqno progress and + * if there are no progress, hangcheck score for that ring is increased. + * Further, acthd is inspected to see if the ring is stuck. On stuck case + * we kick the ring. If we see no progress on three subsequent calls + * we assume chip is wedged and try to fix it by resetting the chip. + */ +static void i915_hangcheck_elapsed(struct work_struct *work) +{ + struct drm_i915_private *dev_priv = + container_of(work, typeof(*dev_priv), + gpu_error.hangcheck_work.work); + intel_engine_mask_t hung = 0, stuck = 0, wedged = 0; + struct intel_engine_cs *engine; + enum intel_engine_id id; + intel_wakeref_t wakeref; + + if (!i915_modparams.enable_hangcheck) + return; + + if (!READ_ONCE(dev_priv->gt.awake)) + return; + + if (i915_terminally_wedged(dev_priv)) + return; + + wakeref = intel_runtime_pm_get_if_in_use(&dev_priv->runtime_pm); + if (!wakeref) + return; + + /* As enabling the GPU requires fairly extensive mmio access, + * periodically arm the mmio checker to see if we are triggering + * any invalid access. 
+ */ + intel_uncore_arm_unclaimed_mmio_detection(&dev_priv->uncore); + + for_each_engine(engine, dev_priv, id) { + struct hangcheck hc; + + intel_engine_signal_breadcrumbs(engine); + + hangcheck_load_sample(engine, &hc); + hangcheck_accumulate_sample(engine, &hc); + hangcheck_store_sample(engine, &hc); + + if (hc.stalled) { + hung |= engine->mask; + if (hc.action != ENGINE_DEAD) + stuck |= engine->mask; + } + + if (hc.wedged) + wedged |= engine->mask; + } + + if (GEM_SHOW_DEBUG() && (hung | stuck)) { + struct drm_printer p = drm_debug_printer("hangcheck"); + + for_each_engine(engine, dev_priv, id) { + if (intel_engine_is_idle(engine)) + continue; + + intel_engine_dump(engine, &p, "%s\n", engine->name); + } + } + + if (wedged) { + dev_err(dev_priv->drm.dev, + "GPU recovery timed out," + " cancelling all in-flight rendering.\n"); + GEM_TRACE_DUMP(); + i915_gem_set_wedged(dev_priv); + } + + if (hung) + hangcheck_declare_hang(dev_priv, hung, stuck); + + intel_runtime_pm_put(&dev_priv->runtime_pm, wakeref); + + /* Reset timer in case GPU hangs without another request being added */ + i915_queue_hangcheck(dev_priv); +} + +void intel_engine_init_hangcheck(struct intel_engine_cs *engine) +{ + memset(&engine->hangcheck, 0, sizeof(engine->hangcheck)); + engine->hangcheck.action_timestamp = jiffies; +} + +void intel_hangcheck_init(struct drm_i915_private *i915) +{ + INIT_DELAYED_WORK(&i915->gpu_error.hangcheck_work, + i915_hangcheck_elapsed); +} + +#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) +#include "selftest_hangcheck.c" +#endif diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c new file mode 100644 index 000000000000..b42b5f158295 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c @@ -0,0 +1,3591 @@ +/* + * Copyright © 2014 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * Authors: + * Ben Widawsky <ben@bwidawsk.net> + * Michel Thierry <michel.thierry@intel.com> + * Thomas Daniel <thomas.daniel@intel.com> + * Oscar Mateo <oscar.mateo@intel.com> + * + */ + +/** + * DOC: Logical Rings, Logical Ring Contexts and Execlists + * + * Motivation: + * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts". + * These expanded contexts enable a number of new abilities, especially + * "Execlists" (also implemented in this file). 
+ * + * One of the main differences with the legacy HW contexts is that logical + * ring contexts incorporate many more things to the context's state, like + * PDPs or ringbuffer control registers: + * + * The reason why PDPs are included in the context is straightforward: as + * PPGTTs (per-process GTTs) are actually per-context, having the PDPs + * contained there mean you don't need to do a ppgtt->switch_mm yourself, + * instead, the GPU will do it for you on the context switch. + * + * But, what about the ringbuffer control registers (head, tail, etc..)? + * shouldn't we just need a set of those per engine command streamer? This is + * where the name "Logical Rings" starts to make sense: by virtualizing the + * rings, the engine cs shifts to a new "ring buffer" with every context + * switch. When you want to submit a workload to the GPU you: A) choose your + * context, B) find its appropriate virtualized ring, C) write commands to it + * and then, finally, D) tell the GPU to switch to that context. + * + * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch + * to a contexts is via a context execution list, ergo "Execlists". + * + * LRC implementation: + * Regarding the creation of contexts, we have: + * + * - One global default context. + * - One local default context for each opened fd. + * - One local extra context for each context create ioctl call. + * + * Now that ringbuffers belong per-context (and not per-engine, like before) + * and that contexts are uniquely tied to a given engine (and not reusable, + * like before) we need: + * + * - One ringbuffer per-engine inside each context. + * - One backing object per-engine inside each context. + * + * The global default context starts its life with these new objects fully + * allocated and populated. The local default context for each opened fd is + * more complex, because we don't know at creation time which engine is going + * to use them. To handle this, we have implemented a deferred creation of LR + * contexts: + * + * The local context starts its life as a hollow or blank holder, that only + * gets populated for a given engine once we receive an execbuffer. If later + * on we receive another execbuffer ioctl for the same context but a different + * engine, we allocate/populate a new ringbuffer and context backing object and + * so on. + * + * Finally, regarding local contexts created using the ioctl call: as they are + * only allowed with the render ring, we can allocate & populate them right + * away (no need to defer anything, at least for now). + * + * Execlists implementation: + * Execlists are the new method by which, on gen8+ hardware, workloads are + * submitted for execution (as opposed to the legacy, ringbuffer-based, method). + * This method works as follows: + * + * When a request is committed, its commands (the BB start and any leading or + * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer + * for the appropriate context. The tail pointer in the hardware context is not + * updated at this time, but instead, kept by the driver in the ringbuffer + * structure. A structure representing this request is added to a request queue + * for the appropriate engine: this structure contains a copy of the context's + * tail after the request was written to the ring buffer and a pointer to the + * context itself. + * + * If the engine's request queue was empty before the request was added, the + * queue is processed immediately. 
Otherwise the queue will be processed during + * a context switch interrupt. In any case, elements on the queue will get sent + * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a + * globally unique 20-bits submission ID. + * + * When execution of a request completes, the GPU updates the context status + * buffer with a context complete event and generates a context switch interrupt. + * During the interrupt handling, the driver examines the events in the buffer: + * for each context complete event, if the announced ID matches that on the head + * of the request queue, then that request is retired and removed from the queue. + * + * After processing, if any requests were retired and the queue is not empty + * then a new execution list can be submitted. The two requests at the front of + * the queue are next to be submitted but since a context may not occur twice in + * an execution list, if subsequent requests have the same ID as the first then + * the two requests must be combined. This is done simply by discarding requests + * at the head of the queue until either only one requests is left (in which case + * we use a NULL second context) or the first two requests have unique IDs. + * + * By always executing the first two requests in the queue the driver ensures + * that the GPU is kept as busy as possible. In the case where a single context + * completes but a second context is still executing, the request for this second + * context will be at the head of the queue when we remove the first one. This + * request will then be resubmitted along with a new request for a different context, + * which will cause the hardware to continue executing the second request and queue + * the new request (the GPU detects the condition of a context getting preempted + * with the same context and optimizes the context switch flow by not doing + * preemption, but just sampling the new tail pointer). + * + */ +#include <linux/interrupt.h> + +#include "gem/i915_gem_context.h" + +#include "i915_drv.h" +#include "i915_gem_render_state.h" +#include "i915_vgpu.h" +#include "intel_engine_pm.h" +#include "intel_lrc_reg.h" +#include "intel_mocs.h" +#include "intel_reset.h" +#include "intel_workarounds.h" + +#define RING_EXECLIST_QFULL (1 << 0x2) +#define RING_EXECLIST1_VALID (1 << 0x3) +#define RING_EXECLIST0_VALID (1 << 0x4) +#define RING_EXECLIST_ACTIVE_STATUS (3 << 0xE) +#define RING_EXECLIST1_ACTIVE (1 << 0x11) +#define RING_EXECLIST0_ACTIVE (1 << 0x12) + +#define GEN8_CTX_STATUS_IDLE_ACTIVE (1 << 0) +#define GEN8_CTX_STATUS_PREEMPTED (1 << 1) +#define GEN8_CTX_STATUS_ELEMENT_SWITCH (1 << 2) +#define GEN8_CTX_STATUS_ACTIVE_IDLE (1 << 3) +#define GEN8_CTX_STATUS_COMPLETE (1 << 4) +#define GEN8_CTX_STATUS_LITE_RESTORE (1 << 15) + +#define GEN8_CTX_STATUS_COMPLETED_MASK \ + (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED) + +/* Typical size of the average request (2 pipecontrols and a MI_BB) */ +#define EXECLISTS_REQUEST_SIZE 64 /* bytes */ +#define WA_TAIL_DWORDS 2 +#define WA_TAIL_BYTES (sizeof(u32) * WA_TAIL_DWORDS) + +struct virtual_engine { + struct intel_engine_cs base; + struct intel_context context; + + /* + * We allow only a single request through the virtual engine at a time + * (each request in the timeline waits for the completion fence of + * the previous before being submitted). 
By restricting ourselves to + * only submitting a single request, each request is placed on to a + * physical to maximise load spreading (by virtue of the late greedy + * scheduling -- each real engine takes the next available request + * upon idling). + */ + struct i915_request *request; + + /* + * We keep a rbtree of available virtual engines inside each physical + * engine, sorted by priority. Here we preallocate the nodes we need + * for the virtual engine, indexed by physical_engine->id. + */ + struct ve_node { + struct rb_node rb; + int prio; + } nodes[I915_NUM_ENGINES]; + + /* + * Keep track of bonded pairs -- restrictions upon on our selection + * of physical engines any particular request may be submitted to. + * If we receive a submit-fence from a master engine, we will only + * use one of sibling_mask physical engines. + */ + struct ve_bond { + const struct intel_engine_cs *master; + intel_engine_mask_t sibling_mask; + } *bonds; + unsigned int num_bonds; + + /* And finally, which physical engines this virtual engine maps onto. */ + unsigned int num_siblings; + struct intel_engine_cs *siblings[0]; +}; + +static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine) +{ + GEM_BUG_ON(!intel_engine_is_virtual(engine)); + return container_of(engine, struct virtual_engine, base); +} + +static int execlists_context_deferred_alloc(struct intel_context *ce, + struct intel_engine_cs *engine); +static void execlists_init_reg_state(u32 *reg_state, + struct intel_context *ce, + struct intel_engine_cs *engine, + struct intel_ring *ring); + +static inline struct i915_priolist *to_priolist(struct rb_node *rb) +{ + return rb_entry(rb, struct i915_priolist, node); +} + +static inline int rq_prio(const struct i915_request *rq) +{ + return rq->sched.attr.priority; +} + +static int effective_prio(const struct i915_request *rq) +{ + int prio = rq_prio(rq); + + /* + * On unwinding the active request, we give it a priority bump + * if it has completed waiting on any semaphore. If we know that + * the request has already started, we can prevent an unwanted + * preempt-to-idle cycle by taking that into account now. + */ + if (__i915_request_has_started(rq)) + prio |= I915_PRIORITY_NOSEMAPHORE; + + /* Restrict mere WAIT boosts from triggering preemption */ + return prio | __NO_PREEMPTION; +} + +static int queue_prio(const struct intel_engine_execlists *execlists) +{ + struct i915_priolist *p; + struct rb_node *rb; + + rb = rb_first_cached(&execlists->queue); + if (!rb) + return INT_MIN; + + /* + * As the priolist[] are inverted, with the highest priority in [0], + * we have to flip the index value to become priority. + */ + p = to_priolist(rb); + return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used); +} + +static inline bool need_preempt(const struct intel_engine_cs *engine, + const struct i915_request *rq, + struct rb_node *rb) +{ + int last_prio; + + if (!engine->preempt_context) + return false; + + if (i915_request_completed(rq)) + return false; + + /* + * Check if the current priority hint merits a preemption attempt. + * + * We record the highest value priority we saw during rescheduling + * prior to this dequeue, therefore we know that if it is strictly + * less than the current tail of ESLP[0], we do not need to force + * a preempt-to-idle cycle. + * + * However, the priority hint is a mere hint that we may need to + * preempt. If that hint is stale or we may be trying to preempt + * ourselves, ignore the request. 
+ */ + last_prio = effective_prio(rq); + if (!i915_scheduler_need_preempt(engine->execlists.queue_priority_hint, + last_prio)) + return false; + + /* + * Check against the first request in ELSP[1], it will, thanks to the + * power of PI, be the highest priority of that context. + */ + if (!list_is_last(&rq->sched.link, &engine->active.requests) && + rq_prio(list_next_entry(rq, sched.link)) > last_prio) + return true; + + if (rb) { + struct virtual_engine *ve = + rb_entry(rb, typeof(*ve), nodes[engine->id].rb); + bool preempt = false; + + if (engine == ve->siblings[0]) { /* only preempt one sibling */ + struct i915_request *next; + + rcu_read_lock(); + next = READ_ONCE(ve->request); + if (next) + preempt = rq_prio(next) > last_prio; + rcu_read_unlock(); + } + + if (preempt) + return preempt; + } + + /* + * If the inflight context did not trigger the preemption, then maybe + * it was the set of queued requests? Pick the highest priority in + * the queue (the first active priolist) and see if it deserves to be + * running instead of ELSP[0]. + * + * The highest priority request in the queue can not be either + * ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same + * context, it's priority would not exceed ELSP[0] aka last_prio. + */ + return queue_prio(&engine->execlists) > last_prio; +} + +__maybe_unused static inline bool +assert_priority_queue(const struct i915_request *prev, + const struct i915_request *next) +{ + const struct intel_engine_execlists *execlists = + &prev->engine->execlists; + + /* + * Without preemption, the prev may refer to the still active element + * which we refuse to let go. + * + * Even with preemption, there are times when we think it is better not + * to preempt and leave an ostensibly lower priority request in flight. + */ + if (port_request(execlists->port) == prev) + return true; + + return rq_prio(prev) >= rq_prio(next); +} + +/* + * The context descriptor encodes various attributes of a context, + * including its GTT address and some flags. Because it's fairly + * expensive to calculate, we'll just do it once and cache the result, + * which remains valid until the context is unpinned. + * + * This is what a descriptor looks like, from LSB to MSB:: + * + * bits 0-11: flags, GEN8_CTX_* (cached in ctx->desc_template) + * bits 12-31: LRCA, GTT address of (the HWSP of) this context + * bits 32-52: ctx ID, a globally unique tag (highest bit used by GuC) + * bits 53-54: mbz, reserved for use by hardware + * bits 55-63: group ID, currently unused and set to 0 + * + * Starting from Gen11, the upper dword of the descriptor has a new format: + * + * bits 32-36: reserved + * bits 37-47: SW context ID + * bits 48:53: engine instance + * bit 54: mbz, reserved for use by hardware + * bits 55-60: SW counter + * bits 61-63: engine class + * + * engine info, SW context ID and SW counter need to form a unique number + * (Context ID) per lrc. + */ +static u64 +lrc_descriptor(struct intel_context *ce, struct intel_engine_cs *engine) +{ + struct i915_gem_context *ctx = ce->gem_context; + u64 desc; + + BUILD_BUG_ON(MAX_CONTEXT_HW_ID > (BIT(GEN8_CTX_ID_WIDTH))); + BUILD_BUG_ON(GEN11_MAX_CONTEXT_HW_ID > (BIT(GEN11_SW_CTX_ID_WIDTH))); + + desc = ctx->desc_template; /* bits 0-11 */ + GEM_BUG_ON(desc & GENMASK_ULL(63, 12)); + + desc |= i915_ggtt_offset(ce->state) + LRC_HEADER_PAGES * PAGE_SIZE; + /* bits 12-31 */ + GEM_BUG_ON(desc & GENMASK_ULL(63, 32)); + + /* + * The following 32bits are copied into the OA reports (dword 2). 
+ * Consider updating oa_get_render_ctx_id in i915_perf.c when changing + * anything below. + */ + if (INTEL_GEN(engine->i915) >= 11) { + GEM_BUG_ON(ctx->hw_id >= BIT(GEN11_SW_CTX_ID_WIDTH)); + desc |= (u64)ctx->hw_id << GEN11_SW_CTX_ID_SHIFT; + /* bits 37-47 */ + + desc |= (u64)engine->instance << GEN11_ENGINE_INSTANCE_SHIFT; + /* bits 48-53 */ + + /* TODO: decide what to do with SW counter (bits 55-60) */ + + desc |= (u64)engine->class << GEN11_ENGINE_CLASS_SHIFT; + /* bits 61-63 */ + } else { + GEM_BUG_ON(ctx->hw_id >= BIT(GEN8_CTX_ID_WIDTH)); + desc |= (u64)ctx->hw_id << GEN8_CTX_ID_SHIFT; /* bits 32-52 */ + } + + return desc; +} + +static void unwind_wa_tail(struct i915_request *rq) +{ + rq->tail = intel_ring_wrap(rq->ring, rq->wa_tail - WA_TAIL_BYTES); + assert_ring_tail_valid(rq->ring, rq->tail); +} + +static struct i915_request * +__unwind_incomplete_requests(struct intel_engine_cs *engine) +{ + struct i915_request *rq, *rn, *active = NULL; + struct list_head *uninitialized_var(pl); + int prio = I915_PRIORITY_INVALID; + + lockdep_assert_held(&engine->active.lock); + + list_for_each_entry_safe_reverse(rq, rn, + &engine->active.requests, + sched.link) { + struct intel_engine_cs *owner; + + if (i915_request_completed(rq)) + break; + + __i915_request_unsubmit(rq); + unwind_wa_tail(rq); + + GEM_BUG_ON(rq->hw_context->inflight); + + /* + * Push the request back into the queue for later resubmission. + * If this request is not native to this physical engine (i.e. + * it came from a virtual source), push it back onto the virtual + * engine so that it can be moved across onto another physical + * engine as load dictates. + */ + owner = rq->hw_context->engine; + if (likely(owner == engine)) { + GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID); + if (rq_prio(rq) != prio) { + prio = rq_prio(rq); + pl = i915_sched_lookup_priolist(engine, prio); + } + GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root)); + + list_move(&rq->sched.link, pl); + active = rq; + } else { + rq->engine = owner; + owner->submit_request(rq); + active = NULL; + } + } + + return active; +} + +struct i915_request * +execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists) +{ + struct intel_engine_cs *engine = + container_of(execlists, typeof(*engine), execlists); + + return __unwind_incomplete_requests(engine); +} + +static inline void +execlists_context_status_change(struct i915_request *rq, unsigned long status) +{ + /* + * Only used when GVT-g is enabled now. When GVT-g is disabled, + * The compiler should eliminate this function as dead-code. 
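For reference, a hedged sketch of how a listener such as GVT-g could attach to this chain; the callback and registration site are hypothetical, only the context_status_notifier field and the action/data arguments come from the call above.

	static int status_notifier_cb(struct notifier_block *nb,
				      unsigned long action, void *data)
	{
		/*
		 * 'action' is the status passed above (INTEL_CONTEXT_SCHEDULE_IN,
		 * _OUT or _PREEMPTED) and 'data' is the struct i915_request.
		 */
		return NOTIFY_OK;
	}

	static struct notifier_block status_nb = {
		.notifier_call = status_notifier_cb,
	};

	static void hook_engine(struct intel_engine_cs *engine)
	{
		atomic_notifier_chain_register(&engine->context_status_notifier,
					       &status_nb);
	}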
+ */ + if (!IS_ENABLED(CONFIG_DRM_I915_GVT)) + return; + + atomic_notifier_call_chain(&rq->engine->context_status_notifier, + status, rq); +} + +inline void +execlists_user_begin(struct intel_engine_execlists *execlists, + const struct execlist_port *port) +{ + execlists_set_active_once(execlists, EXECLISTS_ACTIVE_USER); +} + +inline void +execlists_user_end(struct intel_engine_execlists *execlists) +{ + execlists_clear_active(execlists, EXECLISTS_ACTIVE_USER); +} + +static inline void +execlists_context_schedule_in(struct i915_request *rq) +{ + GEM_BUG_ON(rq->hw_context->inflight); + + execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN); + intel_engine_context_in(rq->engine); + rq->hw_context->inflight = rq->engine; +} + +static void kick_siblings(struct i915_request *rq) +{ + struct virtual_engine *ve = to_virtual_engine(rq->hw_context->engine); + struct i915_request *next = READ_ONCE(ve->request); + + if (next && next->execution_mask & ~rq->execution_mask) + tasklet_schedule(&ve->base.execlists.tasklet); +} + +static inline void +execlists_context_schedule_out(struct i915_request *rq, unsigned long status) +{ + rq->hw_context->inflight = NULL; + intel_engine_context_out(rq->engine); + execlists_context_status_change(rq, status); + trace_i915_request_out(rq); + + /* + * If this is part of a virtual engine, its next request may have + * been blocked waiting for access to the active context. We have + * to kick all the siblings again in case we need to switch (e.g. + * the next request is not runnable on this engine). Hopefully, + * we will already have submitted the next request before the + * tasklet runs and do not need to rebuild each virtual tree + * and kick everyone again. + */ + if (rq->engine != rq->hw_context->engine) + kick_siblings(rq); +} + +static u64 execlists_update_context(struct i915_request *rq) +{ + struct intel_context *ce = rq->hw_context; + + ce->lrc_reg_state[CTX_RING_TAIL + 1] = + intel_ring_set_tail(rq->ring, rq->tail); + + /* + * Make sure the context image is complete before we submit it to HW. + * + * Ostensibly, writes (including the WCB) should be flushed prior to + * an uncached write such as our mmio register access, the empirical + * evidence (esp. on Braswell) suggests that the WC write into memory + * may not be visible to the HW prior to the completion of the UC + * register write and that we may begin execution from the context + * before its image is complete leading to invalid PD chasing. + * + * Furthermore, Braswell, at least, wants a full mb to be sure that + * the writes are coherent in memory (visible to the GPU) prior to + * execution, and not just visible to other CPUs (as is the result of + * wmb). 
+ */ + mb(); + return ce->lrc_desc; +} + +static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port) +{ + if (execlists->ctrl_reg) { + writel(lower_32_bits(desc), execlists->submit_reg + port * 2); + writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1); + } else { + writel(upper_32_bits(desc), execlists->submit_reg); + writel(lower_32_bits(desc), execlists->submit_reg); + } +} + +static void execlists_submit_ports(struct intel_engine_cs *engine) +{ + struct intel_engine_execlists *execlists = &engine->execlists; + struct execlist_port *port = execlists->port; + unsigned int n; + + /* + * We can skip acquiring intel_runtime_pm_get() here as it was taken + * on our behalf by the request (see i915_gem_mark_busy()) and it will + * not be relinquished until the device is idle (see + * i915_gem_idle_work_handler()). As a precaution, we make sure + * that all ELSP are drained i.e. we have processed the CSB, + * before allowing ourselves to idle and calling intel_runtime_pm_put(). + */ + GEM_BUG_ON(!intel_wakeref_active(&engine->wakeref)); + + /* + * ELSQ note: the submit queue is not cleared after being submitted + * to the HW so we need to make sure we always clean it up. This is + * currently ensured by the fact that we always write the same number + * of elsq entries, keep this in mind before changing the loop below. + */ + for (n = execlists_num_ports(execlists); n--; ) { + struct i915_request *rq; + unsigned int count; + u64 desc; + + rq = port_unpack(&port[n], &count); + if (rq) { + GEM_BUG_ON(count > !n); + if (!count++) + execlists_context_schedule_in(rq); + port_set(&port[n], port_pack(rq, count)); + desc = execlists_update_context(rq); + GEM_DEBUG_EXEC(port[n].context_id = upper_32_bits(desc)); + + GEM_TRACE("%s in[%d]: ctx=%d.%d, fence %llx:%lld (current %d), prio=%d\n", + engine->name, n, + port[n].context_id, count, + rq->fence.context, rq->fence.seqno, + hwsp_seqno(rq), + rq_prio(rq)); + } else { + GEM_BUG_ON(!n); + desc = 0; + } + + write_desc(execlists, desc, n); + } + + /* we need to manually load the submit queue */ + if (execlists->ctrl_reg) + writel(EL_CTRL_LOAD, execlists->ctrl_reg); + + execlists_clear_active(execlists, EXECLISTS_ACTIVE_HWACK); +} + +static bool ctx_single_port_submission(const struct intel_context *ce) +{ + return (IS_ENABLED(CONFIG_DRM_I915_GVT) && + i915_gem_context_force_single_submission(ce->gem_context)); +} + +static bool can_merge_ctx(const struct intel_context *prev, + const struct intel_context *next) +{ + if (prev != next) + return false; + + if (ctx_single_port_submission(prev)) + return false; + + return true; +} + +static bool can_merge_rq(const struct i915_request *prev, + const struct i915_request *next) +{ + GEM_BUG_ON(!assert_priority_queue(prev, next)); + + if (!can_merge_ctx(prev->hw_context, next->hw_context)) + return false; + + return true; +} + +static void port_assign(struct execlist_port *port, struct i915_request *rq) +{ + GEM_BUG_ON(rq == port_request(port)); + + if (port_isset(port)) + i915_request_put(port_request(port)); + + port_set(port, port_pack(i915_request_get(rq), port_count(port))); +} + +static void inject_preempt_context(struct intel_engine_cs *engine) +{ + struct intel_engine_execlists *execlists = &engine->execlists; + struct intel_context *ce = engine->preempt_context; + unsigned int n; + + GEM_BUG_ON(execlists->preempt_complete_status != + upper_32_bits(ce->lrc_desc)); + + /* + * Switch to our empty preempt context so + * the state of the GPU is known (idle). 
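How the injection plays out end to end, condensed from the code in this file (a restatement, not new behaviour):

	1. The loop below loads a null (0) descriptor into every port except
	   port 0, which gets the preempt context's descriptor.
	2. The hardware preempts whatever was running, executes the empty
	   context and signals a COMPLETE CSB event whose context-ID dword
	   equals execlists->preempt_complete_status.
	3. process_csb() recognises that event and calls
	   complete_preempt_context(), which cancels the port requests and
	   unwinds the incomplete requests back onto the priority queue for
	   the next dequeue.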
+ */ + GEM_TRACE("%s\n", engine->name); + for (n = execlists_num_ports(execlists); --n; ) + write_desc(execlists, 0, n); + + write_desc(execlists, ce->lrc_desc, n); + + /* we need to manually load the submit queue */ + if (execlists->ctrl_reg) + writel(EL_CTRL_LOAD, execlists->ctrl_reg); + + execlists_clear_active(execlists, EXECLISTS_ACTIVE_HWACK); + execlists_set_active(execlists, EXECLISTS_ACTIVE_PREEMPT); + + (void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++); +} + +static void complete_preempt_context(struct intel_engine_execlists *execlists) +{ + GEM_BUG_ON(!execlists_is_active(execlists, EXECLISTS_ACTIVE_PREEMPT)); + + if (inject_preempt_hang(execlists)) + return; + + execlists_cancel_port_requests(execlists); + __unwind_incomplete_requests(container_of(execlists, + struct intel_engine_cs, + execlists)); +} + +static void virtual_update_register_offsets(u32 *regs, + struct intel_engine_cs *engine) +{ + u32 base = engine->mmio_base; + + /* Must match execlists_init_reg_state()! */ + + regs[CTX_CONTEXT_CONTROL] = + i915_mmio_reg_offset(RING_CONTEXT_CONTROL(base)); + regs[CTX_RING_HEAD] = i915_mmio_reg_offset(RING_HEAD(base)); + regs[CTX_RING_TAIL] = i915_mmio_reg_offset(RING_TAIL(base)); + regs[CTX_RING_BUFFER_START] = i915_mmio_reg_offset(RING_START(base)); + regs[CTX_RING_BUFFER_CONTROL] = i915_mmio_reg_offset(RING_CTL(base)); + + regs[CTX_BB_HEAD_U] = i915_mmio_reg_offset(RING_BBADDR_UDW(base)); + regs[CTX_BB_HEAD_L] = i915_mmio_reg_offset(RING_BBADDR(base)); + regs[CTX_BB_STATE] = i915_mmio_reg_offset(RING_BBSTATE(base)); + regs[CTX_SECOND_BB_HEAD_U] = + i915_mmio_reg_offset(RING_SBBADDR_UDW(base)); + regs[CTX_SECOND_BB_HEAD_L] = i915_mmio_reg_offset(RING_SBBADDR(base)); + regs[CTX_SECOND_BB_STATE] = i915_mmio_reg_offset(RING_SBBSTATE(base)); + + regs[CTX_CTX_TIMESTAMP] = + i915_mmio_reg_offset(RING_CTX_TIMESTAMP(base)); + regs[CTX_PDP3_UDW] = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, 3)); + regs[CTX_PDP3_LDW] = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, 3)); + regs[CTX_PDP2_UDW] = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, 2)); + regs[CTX_PDP2_LDW] = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, 2)); + regs[CTX_PDP1_UDW] = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, 1)); + regs[CTX_PDP1_LDW] = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, 1)); + regs[CTX_PDP0_UDW] = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, 0)); + regs[CTX_PDP0_LDW] = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, 0)); + + if (engine->class == RENDER_CLASS) { + regs[CTX_RCS_INDIRECT_CTX] = + i915_mmio_reg_offset(RING_INDIRECT_CTX(base)); + regs[CTX_RCS_INDIRECT_CTX_OFFSET] = + i915_mmio_reg_offset(RING_INDIRECT_CTX_OFFSET(base)); + regs[CTX_BB_PER_CTX_PTR] = + i915_mmio_reg_offset(RING_BB_PER_CTX_PTR(base)); + + regs[CTX_R_PWR_CLK_STATE] = + i915_mmio_reg_offset(GEN8_R_PWR_CLK_STATE); + } +} + +static bool virtual_matches(const struct virtual_engine *ve, + const struct i915_request *rq, + const struct intel_engine_cs *engine) +{ + const struct intel_engine_cs *inflight; + + if (!(rq->execution_mask & engine->mask)) /* We peeked too soon! */ + return false; + + /* + * We track when the HW has completed saving the context image + * (i.e. when we have seen the final CS event switching out of + * the context) and must not overwrite the context image before + * then. This restricts us to only using the active engine + * while the previous virtualized request is inflight (so + * we reuse the register offsets). This is a very small + * hystersis on the greedy seelction algorithm. 
+ */ + inflight = READ_ONCE(ve->context.inflight); + if (inflight && inflight != engine) + return false; + + return true; +} + +static void virtual_xfer_breadcrumbs(struct virtual_engine *ve, + struct intel_engine_cs *engine) +{ + struct intel_engine_cs *old = ve->siblings[0]; + + /* All unattached (rq->engine == old) must already be completed */ + + spin_lock(&old->breadcrumbs.irq_lock); + if (!list_empty(&ve->context.signal_link)) { + list_move_tail(&ve->context.signal_link, + &engine->breadcrumbs.signalers); + intel_engine_queue_breadcrumbs(engine); + } + spin_unlock(&old->breadcrumbs.irq_lock); +} + +static void execlists_dequeue(struct intel_engine_cs *engine) +{ + struct intel_engine_execlists * const execlists = &engine->execlists; + struct execlist_port *port = execlists->port; + const struct execlist_port * const last_port = + &execlists->port[execlists->port_mask]; + struct i915_request *last = port_request(port); + struct rb_node *rb; + bool submit = false; + + /* + * Hardware submission is through 2 ports. Conceptually each port + * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is + * static for a context, and unique to each, so we only execute + * requests belonging to a single context from each ring. RING_HEAD + * is maintained by the CS in the context image, it marks the place + * where it got up to last time, and through RING_TAIL we tell the CS + * where we want to execute up to this time. + * + * In this list the requests are in order of execution. Consecutive + * requests from the same context are adjacent in the ringbuffer. We + * can combine these requests into a single RING_TAIL update: + * + * RING_HEAD...req1...req2 + * ^- RING_TAIL + * since to execute req2 the CS must first execute req1. + * + * Our goal then is to point each port to the end of a consecutive + * sequence of requests as being the most optimal (fewest wake ups + * and context switches) submission. + */ + + for (rb = rb_first_cached(&execlists->virtual); rb; ) { + struct virtual_engine *ve = + rb_entry(rb, typeof(*ve), nodes[engine->id].rb); + struct i915_request *rq = READ_ONCE(ve->request); + + if (!rq) { /* lazily cleanup after another engine handled rq */ + rb_erase_cached(rb, &execlists->virtual); + RB_CLEAR_NODE(rb); + rb = rb_first_cached(&execlists->virtual); + continue; + } + + if (!virtual_matches(ve, rq, engine)) { + rb = rb_next(rb); + continue; + } + + break; + } + + if (last) { + /* + * Don't resubmit or switch until all outstanding + * preemptions (lite-restore) are seen. Then we + * know the next preemption status we see corresponds + * to this ELSP update. + */ + GEM_BUG_ON(!execlists_is_active(execlists, + EXECLISTS_ACTIVE_USER)); + GEM_BUG_ON(!port_count(&port[0])); + + /* + * If we write to ELSP a second time before the HW has had + * a chance to respond to the previous write, we can confuse + * the HW and hit "undefined behaviour". After writing to ELSP, + * we must then wait until we see a context-switch event from + * the HW to indicate that it has had a chance to respond. + */ + if (!execlists_is_active(execlists, EXECLISTS_ACTIVE_HWACK)) + return; + + if (need_preempt(engine, last, rb)) { + inject_preempt_context(engine); + return; + } + + /* + * In theory, we could coalesce more requests onto + * the second port (the first port is active, with + * no preemptions pending). 
However, that means we + * then have to deal with the possible lite-restore + * of the second port (as we submit the ELSP, there + * may be a context-switch) but also we may complete + * the resubmission before the context-switch. Ergo, + * coalescing onto the second port will cause a + * preemption event, but we cannot predict whether + * that will affect port[0] or port[1]. + * + * If the second port is already active, we can wait + * until the next context-switch before contemplating + * new requests. The GPU will be busy and we should be + * able to resubmit the new ELSP before it idles, + * avoiding pipeline bubbles (momentary pauses where + * the driver is unable to keep up the supply of new + * work). However, we have to double check that the + * priorities of the ports haven't been switch. + */ + if (port_count(&port[1])) + return; + + /* + * WaIdleLiteRestore:bdw,skl + * Apply the wa NOOPs to prevent + * ring:HEAD == rq:TAIL as we resubmit the + * request. See gen8_emit_fini_breadcrumb() for + * where we prepare the padding after the + * end of the request. + */ + last->tail = last->wa_tail; + } + + while (rb) { /* XXX virtual is always taking precedence */ + struct virtual_engine *ve = + rb_entry(rb, typeof(*ve), nodes[engine->id].rb); + struct i915_request *rq; + + spin_lock(&ve->base.active.lock); + + rq = ve->request; + if (unlikely(!rq)) { /* lost the race to a sibling */ + spin_unlock(&ve->base.active.lock); + rb_erase_cached(rb, &execlists->virtual); + RB_CLEAR_NODE(rb); + rb = rb_first_cached(&execlists->virtual); + continue; + } + + GEM_BUG_ON(rq != ve->request); + GEM_BUG_ON(rq->engine != &ve->base); + GEM_BUG_ON(rq->hw_context != &ve->context); + + if (rq_prio(rq) >= queue_prio(execlists)) { + if (!virtual_matches(ve, rq, engine)) { + spin_unlock(&ve->base.active.lock); + rb = rb_next(rb); + continue; + } + + if (last && !can_merge_rq(last, rq)) { + spin_unlock(&ve->base.active.lock); + return; /* leave this rq for another engine */ + } + + GEM_TRACE("%s: virtual rq=%llx:%lld%s, new engine? %s\n", + engine->name, + rq->fence.context, + rq->fence.seqno, + i915_request_completed(rq) ? "!" : + i915_request_started(rq) ? "*" : + "", + yesno(engine != ve->siblings[0])); + + ve->request = NULL; + ve->base.execlists.queue_priority_hint = INT_MIN; + rb_erase_cached(rb, &execlists->virtual); + RB_CLEAR_NODE(rb); + + GEM_BUG_ON(!(rq->execution_mask & engine->mask)); + rq->engine = engine; + + if (engine != ve->siblings[0]) { + u32 *regs = ve->context.lrc_reg_state; + unsigned int n; + + GEM_BUG_ON(READ_ONCE(ve->context.inflight)); + virtual_update_register_offsets(regs, engine); + + if (!list_empty(&ve->context.signals)) + virtual_xfer_breadcrumbs(ve, engine); + + /* + * Move the bound engine to the top of the list + * for future execution. We then kick this + * tasklet first before checking others, so that + * we preferentially reuse this set of bound + * registers. + */ + for (n = 1; n < ve->num_siblings; n++) { + if (ve->siblings[n] == engine) { + swap(ve->siblings[n], + ve->siblings[0]); + break; + } + } + + GEM_BUG_ON(ve->siblings[0] != engine); + } + + __i915_request_submit(rq); + trace_i915_request_in(rq, port_index(port, execlists)); + submit = true; + last = rq; + } + + spin_unlock(&ve->base.active.lock); + break; + } + + while ((rb = rb_first_cached(&execlists->queue))) { + struct i915_priolist *p = to_priolist(rb); + struct i915_request *rq, *rn; + int i; + + priolist_for_each_request_consume(rq, rn, p, i) { + /* + * Can we combine this request with the current port? 
+ * It has to be the same context/ringbuffer and not + * have any exceptions (e.g. GVT saying never to + * combine contexts). + * + * If we can combine the requests, we can execute both + * by updating the RING_TAIL to point to the end of the + * second request, and so we never need to tell the + * hardware about the first. + */ + if (last && !can_merge_rq(last, rq)) { + /* + * If we are on the second port and cannot + * combine this request with the last, then we + * are done. + */ + if (port == last_port) + goto done; + + /* + * We must not populate both ELSP[] with the + * same LRCA, i.e. we must submit 2 different + * contexts if we submit 2 ELSP. + */ + if (last->hw_context == rq->hw_context) + goto done; + + /* + * If GVT overrides us we only ever submit + * port[0], leaving port[1] empty. Note that we + * also have to be careful that we don't queue + * the same context (even though a different + * request) to the second port. + */ + if (ctx_single_port_submission(last->hw_context) || + ctx_single_port_submission(rq->hw_context)) + goto done; + + + if (submit) + port_assign(port, last); + port++; + + GEM_BUG_ON(port_isset(port)); + } + + __i915_request_submit(rq); + trace_i915_request_in(rq, port_index(port, execlists)); + + last = rq; + submit = true; + } + + rb_erase_cached(&p->node, &execlists->queue); + i915_priolist_free(p); + } + +done: + /* + * Here be a bit of magic! Or sleight-of-hand, whichever you prefer. + * + * We choose the priority hint such that if we add a request of greater + * priority than this, we kick the submission tasklet to decide on + * the right order of submitting the requests to hardware. We must + * also be prepared to reorder requests as they are in-flight on the + * HW. We derive the priority hint then as the first "hole" in + * the HW submission ports and if there are no available slots, + * the priority of the lowest executing request, i.e. last. + * + * When we do receive a higher priority request ready to run from the + * user, see queue_request(), the priority hint is bumped to that + * request triggering preemption on the next dequeue (or subsequent + * interrupt for secondary ports). + */ + execlists->queue_priority_hint = queue_prio(execlists); + + if (submit) { + port_assign(port, last); + execlists_submit_ports(engine); + } + + /* We must always keep the beast fed if we have work piled up */ + GEM_BUG_ON(rb_first_cached(&execlists->queue) && + !port_isset(execlists->port)); + + /* Re-evaluate the executing context setup after each preemptive kick */ + if (last) + execlists_user_begin(execlists, execlists->port); + + /* If the engine is now idle, so should be the flag; and vice versa. */ + GEM_BUG_ON(execlists_is_active(&engine->execlists, + EXECLISTS_ACTIVE_USER) == + !port_isset(engine->execlists.port)); +} + +void +execlists_cancel_port_requests(struct intel_engine_execlists * const execlists) +{ + struct execlist_port *port = execlists->port; + unsigned int num_ports = execlists_num_ports(execlists); + + while (num_ports-- && port_isset(port)) { + struct i915_request *rq = port_request(port); + + GEM_TRACE("%s:port%u fence %llx:%lld, (current %d)\n", + rq->engine->name, + (unsigned int)(port - execlists->port), + rq->fence.context, rq->fence.seqno, + hwsp_seqno(rq)); + + GEM_BUG_ON(!execlists->active); + execlists_context_schedule_out(rq, + i915_request_completed(rq) ? 
+ INTEL_CONTEXT_SCHEDULE_OUT : + INTEL_CONTEXT_SCHEDULE_PREEMPTED); + + i915_request_put(rq); + + memset(port, 0, sizeof(*port)); + port++; + } + + execlists_clear_all_active(execlists); +} + +static inline void +invalidate_csb_entries(const u32 *first, const u32 *last) +{ + clflush((void *)first); + clflush((void *)last); +} + +static inline bool +reset_in_progress(const struct intel_engine_execlists *execlists) +{ + return unlikely(!__tasklet_is_enabled(&execlists->tasklet)); +} + +static void process_csb(struct intel_engine_cs *engine) +{ + struct intel_engine_execlists * const execlists = &engine->execlists; + struct execlist_port *port = execlists->port; + const u32 * const buf = execlists->csb_status; + const u8 num_entries = execlists->csb_size; + u8 head, tail; + + lockdep_assert_held(&engine->active.lock); + GEM_BUG_ON(USES_GUC_SUBMISSION(engine->i915)); + + /* + * Note that csb_write, csb_status may be either in HWSP or mmio. + * When reading from the csb_write mmio register, we have to be + * careful to only use the GEN8_CSB_WRITE_PTR portion, which is + * the low 4bits. As it happens we know the next 4bits are always + * zero and so we can simply masked off the low u8 of the register + * and treat it identically to reading from the HWSP (without having + * to use explicit shifting and masking, and probably bifurcating + * the code to handle the legacy mmio read). + */ + head = execlists->csb_head; + tail = READ_ONCE(*execlists->csb_write); + GEM_TRACE("%s cs-irq head=%d, tail=%d\n", engine->name, head, tail); + if (unlikely(head == tail)) + return; + + /* + * Hopefully paired with a wmb() in HW! + * + * We must complete the read of the write pointer before any reads + * from the CSB, so that we do not see stale values. Without an rmb + * (lfence) the HW may speculatively perform the CSB[] reads *before* + * we perform the READ_ONCE(*csb_write). + */ + rmb(); + + do { + struct i915_request *rq; + unsigned int status; + unsigned int count; + + if (++head == num_entries) + head = 0; + + /* + * We are flying near dragons again. + * + * We hold a reference to the request in execlist_port[] + * but no more than that. We are operating in softirq + * context and so cannot hold any mutex or sleep. That + * prevents us stopping the requests we are processing + * in port[] from being retired simultaneously (the + * breadcrumb will be complete before we see the + * context-switch). As we only hold the reference to the + * request, any pointer chasing underneath the request + * is subject to a potential use-after-free. Thus we + * store all of the bookkeeping within port[] as + * required, and avoid using unguarded pointers beneath + * request itself. The same applies to the atomic + * status notifier. + */ + + GEM_TRACE("%s csb[%d]: status=0x%08x:0x%08x, active=0x%x\n", + engine->name, head, + buf[2 * head + 0], buf[2 * head + 1], + execlists->active); + + status = buf[2 * head]; + if (status & (GEN8_CTX_STATUS_IDLE_ACTIVE | + GEN8_CTX_STATUS_PREEMPTED)) + execlists_set_active(execlists, + EXECLISTS_ACTIVE_HWACK); + if (status & GEN8_CTX_STATUS_ACTIVE_IDLE) + execlists_clear_active(execlists, + EXECLISTS_ACTIVE_HWACK); + + if (!(status & GEN8_CTX_STATUS_COMPLETED_MASK)) + continue; + + /* We should never get a COMPLETED | IDLE_ACTIVE! 
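+ * (a single event should never report both a completion and an
+ * idle-to-active transition for the same context)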
*/ + GEM_BUG_ON(status & GEN8_CTX_STATUS_IDLE_ACTIVE); + + if (status & GEN8_CTX_STATUS_COMPLETE && + buf[2*head + 1] == execlists->preempt_complete_status) { + GEM_TRACE("%s preempt-idle\n", engine->name); + complete_preempt_context(execlists); + continue; + } + + if (status & GEN8_CTX_STATUS_PREEMPTED && + execlists_is_active(execlists, + EXECLISTS_ACTIVE_PREEMPT)) + continue; + + GEM_BUG_ON(!execlists_is_active(execlists, + EXECLISTS_ACTIVE_USER)); + + rq = port_unpack(port, &count); + GEM_TRACE("%s out[0]: ctx=%d.%d, fence %llx:%lld (current %d), prio=%d\n", + engine->name, + port->context_id, count, + rq ? rq->fence.context : 0, + rq ? rq->fence.seqno : 0, + rq ? hwsp_seqno(rq) : 0, + rq ? rq_prio(rq) : 0); + + /* Check the context/desc id for this event matches */ + GEM_DEBUG_BUG_ON(buf[2 * head + 1] != port->context_id); + + GEM_BUG_ON(count == 0); + if (--count == 0) { + /* + * On the final event corresponding to the + * submission of this context, we expect either + * an element-switch event or a completion + * event (and on completion, the active-idle + * marker). No more preemptions, lite-restore + * or otherwise. + */ + GEM_BUG_ON(status & GEN8_CTX_STATUS_PREEMPTED); + GEM_BUG_ON(port_isset(&port[1]) && + !(status & GEN8_CTX_STATUS_ELEMENT_SWITCH)); + GEM_BUG_ON(!port_isset(&port[1]) && + !(status & GEN8_CTX_STATUS_ACTIVE_IDLE)); + + /* + * We rely on the hardware being strongly + * ordered, that the breadcrumb write is + * coherent (visible from the CPU) before the + * user interrupt and CSB is processed. + */ + GEM_BUG_ON(!i915_request_completed(rq)); + + execlists_context_schedule_out(rq, + INTEL_CONTEXT_SCHEDULE_OUT); + i915_request_put(rq); + + GEM_TRACE("%s completed ctx=%d\n", + engine->name, port->context_id); + + port = execlists_port_complete(execlists, port); + if (port_isset(port)) + execlists_user_begin(execlists, port); + else + execlists_user_end(execlists); + } else { + port_set(port, port_pack(rq, count)); + } + } while (head != tail); + + execlists->csb_head = head; + + /* + * Gen11 has proven to fail wrt global observation point between + * entry and tail update, failing on the ordering and thus + * we see an old entry in the context status buffer. + * + * Forcibly evict out entries for the next gpu csb update, + * to increase the odds that we get a fresh entries with non + * working hardware. The cost for doing so comes out mostly with + * the wash as hardware, working or not, will need to do the + * invalidation before. + */ + invalidate_csb_entries(&buf[0], &buf[num_entries - 1]); +} + +static void __execlists_submission_tasklet(struct intel_engine_cs *const engine) +{ + lockdep_assert_held(&engine->active.lock); + + process_csb(engine); + if (!execlists_is_active(&engine->execlists, EXECLISTS_ACTIVE_PREEMPT)) + execlists_dequeue(engine); +} + +/* + * Check the unread Context Status Buffers and manage the submission of new + * contexts to the ELSP accordingly. 
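+ * This runs as a tasklet, i.e. in softirq context: under the engine's
+ * active.lock we drain the pending CSB events via process_csb() and,
+ * unless a preemption is still outstanding, refill the ELSP from the
+ * priority queue via execlists_dequeue().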
+ */ +static void execlists_submission_tasklet(unsigned long data) +{ + struct intel_engine_cs * const engine = (struct intel_engine_cs *)data; + unsigned long flags; + + GEM_TRACE("%s awake?=%d, active=%x\n", + engine->name, + !!intel_wakeref_active(&engine->wakeref), + engine->execlists.active); + + spin_lock_irqsave(&engine->active.lock, flags); + __execlists_submission_tasklet(engine); + spin_unlock_irqrestore(&engine->active.lock, flags); +} + +static void queue_request(struct intel_engine_cs *engine, + struct i915_sched_node *node, + int prio) +{ + GEM_BUG_ON(!list_empty(&node->link)); + list_add_tail(&node->link, i915_sched_lookup_priolist(engine, prio)); +} + +static void __submit_queue_imm(struct intel_engine_cs *engine) +{ + struct intel_engine_execlists * const execlists = &engine->execlists; + + if (reset_in_progress(execlists)) + return; /* defer until we restart the engine following reset */ + + if (execlists->tasklet.func == execlists_submission_tasklet) + __execlists_submission_tasklet(engine); + else + tasklet_hi_schedule(&execlists->tasklet); +} + +static void submit_queue(struct intel_engine_cs *engine, int prio) +{ + if (prio > engine->execlists.queue_priority_hint) { + engine->execlists.queue_priority_hint = prio; + __submit_queue_imm(engine); + } +} + +static void execlists_submit_request(struct i915_request *request) +{ + struct intel_engine_cs *engine = request->engine; + unsigned long flags; + + /* Will be called from irq-context when using foreign fences. */ + spin_lock_irqsave(&engine->active.lock, flags); + + queue_request(engine, &request->sched, rq_prio(request)); + + GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root)); + GEM_BUG_ON(list_empty(&request->sched.link)); + + submit_queue(engine, rq_prio(request)); + + spin_unlock_irqrestore(&engine->active.lock, flags); +} + +static void __execlists_context_fini(struct intel_context *ce) +{ + intel_ring_put(ce->ring); + + GEM_BUG_ON(i915_gem_object_is_active(ce->state->obj)); + i915_gem_object_put(ce->state->obj); +} + +static void execlists_context_destroy(struct kref *kref) +{ + struct intel_context *ce = container_of(kref, typeof(*ce), ref); + + GEM_BUG_ON(intel_context_is_pinned(ce)); + + if (ce->state) + __execlists_context_fini(ce); + + intel_context_free(ce); +} + +static void execlists_context_unpin(struct intel_context *ce) +{ + i915_gem_context_unpin_hw_id(ce->gem_context); + i915_gem_object_unpin_map(ce->state->obj); + intel_ring_unpin(ce->ring); +} + +static void +__execlists_update_reg_state(struct intel_context *ce, + struct intel_engine_cs *engine) +{ + struct intel_ring *ring = ce->ring; + u32 *regs = ce->lrc_reg_state; + + GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->head)); + GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail)); + + regs[CTX_RING_BUFFER_START + 1] = i915_ggtt_offset(ring->vma); + regs[CTX_RING_HEAD + 1] = ring->head; + regs[CTX_RING_TAIL + 1] = ring->tail; + + /* RPCS */ + if (engine->class == RENDER_CLASS) + regs[CTX_R_PWR_CLK_STATE + 1] = + intel_sseu_make_rpcs(engine->i915, &ce->sseu); +} + +static int +__execlists_context_pin(struct intel_context *ce, + struct intel_engine_cs *engine) +{ + void *vaddr; + int ret; + + GEM_BUG_ON(!ce->gem_context->vm); + + ret = execlists_context_deferred_alloc(ce, engine); + if (ret) + goto err; + GEM_BUG_ON(!ce->state); + + ret = intel_context_active_acquire(ce, + engine->i915->ggtt.pin_bias | + PIN_OFFSET_BIAS | + PIN_HIGH); + if (ret) + goto err; + + vaddr = i915_gem_object_pin_map(ce->state->obj, + 
i915_coherent_map_type(engine->i915) | + I915_MAP_OVERRIDE); + if (IS_ERR(vaddr)) { + ret = PTR_ERR(vaddr); + goto unpin_active; + } + + ret = intel_ring_pin(ce->ring); + if (ret) + goto unpin_map; + + ret = i915_gem_context_pin_hw_id(ce->gem_context); + if (ret) + goto unpin_ring; + + ce->lrc_desc = lrc_descriptor(ce, engine); + ce->lrc_reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE; + __execlists_update_reg_state(ce, engine); + + return 0; + +unpin_ring: + intel_ring_unpin(ce->ring); +unpin_map: + i915_gem_object_unpin_map(ce->state->obj); +unpin_active: + intel_context_active_release(ce); +err: + return ret; +} + +static int execlists_context_pin(struct intel_context *ce) +{ + return __execlists_context_pin(ce, ce->engine); +} + +static void execlists_context_reset(struct intel_context *ce) +{ + /* + * Because we emit WA_TAIL_DWORDS there may be a disparity + * between our bookkeeping in ce->ring->head and ce->ring->tail and + * that stored in context. As we only write new commands from + * ce->ring->tail onwards, everything before that is junk. If the GPU + * starts reading from its RING_HEAD from the context, it may try to + * execute that junk and die. + * + * The contexts that are stilled pinned on resume belong to the + * kernel, and are local to each engine. All other contexts will + * have their head/tail sanitized upon pinning before use, so they + * will never see garbage, + * + * So to avoid that we reset the context images upon resume. For + * simplicity, we just zero everything out. + */ + intel_ring_reset(ce->ring, 0); + __execlists_update_reg_state(ce, ce->engine); +} + +static const struct intel_context_ops execlists_context_ops = { + .pin = execlists_context_pin, + .unpin = execlists_context_unpin, + + .enter = intel_context_enter_engine, + .exit = intel_context_exit_engine, + + .reset = execlists_context_reset, + .destroy = execlists_context_destroy, +}; + +static int gen8_emit_init_breadcrumb(struct i915_request *rq) +{ + u32 *cs; + + GEM_BUG_ON(!rq->timeline->has_initial_breadcrumb); + + cs = intel_ring_begin(rq, 6); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + /* + * Check if we have been preempted before we even get started. + * + * After this point i915_request_started() reports true, even if + * we get preempted and so are no longer running. + */ + *cs++ = MI_ARB_CHECK; + *cs++ = MI_NOOP; + + *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; + *cs++ = rq->timeline->hwsp_offset; + *cs++ = 0; + *cs++ = rq->fence.seqno - 1; + + intel_ring_advance(rq, cs); + + /* Record the updated position of the request's payload */ + rq->infix = intel_ring_offset(rq, cs); + + return 0; +} + +static int emit_pdps(struct i915_request *rq) +{ + const struct intel_engine_cs * const engine = rq->engine; + struct i915_ppgtt * const ppgtt = + i915_vm_to_ppgtt(rq->gem_context->vm); + int err, i; + u32 *cs; + + GEM_BUG_ON(intel_vgpu_active(rq->i915)); + + /* + * Beware ye of the dragons, this sequence is magic! + * + * Small changes to this sequence can cause anything from + * GPU hangs to forcewake errors and machine lockups! + */ + + /* Flush any residual operations from the context load */ + err = engine->emit_flush(rq, EMIT_FLUSH); + if (err) + return err; + + /* Magic required to prevent forcewake errors! 
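+ * (empirically, an extra invalidate is needed here between the
+ * post-context-load flush above and the PDP updates that follow)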
*/ + err = engine->emit_flush(rq, EMIT_INVALIDATE); + if (err) + return err; + + cs = intel_ring_begin(rq, 4 * GEN8_3LVL_PDPES + 2); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + /* Ensure the LRI have landed before we invalidate & continue */ + *cs++ = MI_LOAD_REGISTER_IMM(2 * GEN8_3LVL_PDPES) | MI_LRI_FORCE_POSTED; + for (i = GEN8_3LVL_PDPES; i--; ) { + const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i); + u32 base = engine->mmio_base; + + *cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, i)); + *cs++ = upper_32_bits(pd_daddr); + *cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, i)); + *cs++ = lower_32_bits(pd_daddr); + } + *cs++ = MI_NOOP; + + intel_ring_advance(rq, cs); + + /* Be doubly sure the LRI have landed before proceeding */ + err = engine->emit_flush(rq, EMIT_FLUSH); + if (err) + return err; + + /* Re-invalidate the TLB for luck */ + return engine->emit_flush(rq, EMIT_INVALIDATE); +} + +static int execlists_request_alloc(struct i915_request *request) +{ + int ret; + + GEM_BUG_ON(!intel_context_is_pinned(request->hw_context)); + + /* + * Flush enough space to reduce the likelihood of waiting after + * we start building the request - in which case we will just + * have to repeat work. + */ + request->reserved_space += EXECLISTS_REQUEST_SIZE; + + /* + * Note that after this point, we have committed to using + * this request as it is being used to both track the + * state of engine initialisation and liveness of the + * golden renderstate above. Think twice before you try + * to cancel/unwind this request now. + */ + + /* Unconditionally invalidate GPU caches and TLBs. */ + if (i915_vm_is_4lvl(request->gem_context->vm)) + ret = request->engine->emit_flush(request, EMIT_INVALIDATE); + else + ret = emit_pdps(request); + if (ret) + return ret; + + request->reserved_space -= EXECLISTS_REQUEST_SIZE; + return 0; +} + +/* + * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after + * PIPE_CONTROL instruction. This is required for the flush to happen correctly + * but there is a slight complication as this is applied in WA batch where the + * values are only initialized once so we cannot take register value at the + * beginning and reuse it further; hence we save its value to memory, upload a + * constant value with bit21 set and then we restore it back with the saved value. + * To simplify the WA, a constant value is formed by using the default value + * of this register. This shouldn't be a problem because we are only modifying + * it for a short period and this batch in non-premptible. We can ofcourse + * use additional instructions that read the actual value of the register + * at that time and set our bit of interest but it makes the WA complicated. + * + * This WA is also required for Gen9 so extracting as a function avoids + * code duplication. + */ +static u32 * +gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch) +{ + /* NB no one else is allowed to scribble over scratch + 256! 
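+ * This slot is used to park the current GEN8_L3SQCREG4 value (via SRM)
+ * while the WA forces coherent-line flushing, and to restore it (via
+ * LRM) once the pipe control below has completed.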
*/ + *batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT; + *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4); + *batch++ = i915_scratch_offset(engine->i915) + 256; + *batch++ = 0; + + *batch++ = MI_LOAD_REGISTER_IMM(1); + *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4); + *batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES; + + batch = gen8_emit_pipe_control(batch, + PIPE_CONTROL_CS_STALL | + PIPE_CONTROL_DC_FLUSH_ENABLE, + 0); + + *batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT; + *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4); + *batch++ = i915_scratch_offset(engine->i915) + 256; + *batch++ = 0; + + return batch; +} + +/* + * Typically we only have one indirect_ctx and per_ctx batch buffer which are + * initialized at the beginning and shared across all contexts but this field + * helps us to have multiple batches at different offsets and select them based + * on a criteria. At the moment this batch always start at the beginning of the page + * and at this point we don't have multiple wa_ctx batch buffers. + * + * The number of WA applied are not known at the beginning; we use this field + * to return the no of DWORDS written. + * + * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END + * so it adds NOOPs as padding to make it cacheline aligned. + * MI_BATCH_BUFFER_END will be added to perctx batch and both of them together + * makes a complete batch buffer. + */ +static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) +{ + /* WaDisableCtxRestoreArbitration:bdw,chv */ + *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; + + /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */ + if (IS_BROADWELL(engine->i915)) + batch = gen8_emit_flush_coherentl3_wa(engine, batch); + + /* WaClearSlmSpaceAtContextSwitch:bdw,chv */ + /* Actual scratch location is at 128 bytes offset */ + batch = gen8_emit_pipe_control(batch, + PIPE_CONTROL_FLUSH_L3 | + PIPE_CONTROL_GLOBAL_GTT_IVB | + PIPE_CONTROL_CS_STALL | + PIPE_CONTROL_QW_WRITE, + i915_scratch_offset(engine->i915) + + 2 * CACHELINE_BYTES); + + *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; + + /* Pad to end of cacheline */ + while ((unsigned long)batch % CACHELINE_BYTES) + *batch++ = MI_NOOP; + + /* + * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because + * execution depends on the length specified in terms of cache lines + * in the register CTX_RCS_INDIRECT_CTX + */ + + return batch; +} + +struct lri { + i915_reg_t reg; + u32 value; +}; + +static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count) +{ + GEM_BUG_ON(!count || count > 63); + + *batch++ = MI_LOAD_REGISTER_IMM(count); + do { + *batch++ = i915_mmio_reg_offset(lri->reg); + *batch++ = lri->value; + } while (lri++, --count); + *batch++ = MI_NOOP; + + return batch; +} + +static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) +{ + static const struct lri lri[] = { + /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */ + { + COMMON_SLICE_CHICKEN2, + __MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE, + 0), + }, + + /* BSpec: 11391 */ + { + FF_SLICE_CHICKEN, + __MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX, + FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX), + }, + + /* BSpec: 11299 */ + { + _3D_CHICKEN3, + __MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX, + _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX), + } + }; + + *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; + + /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */ + batch = gen8_emit_flush_coherentl3_wa(engine, 
batch); + + batch = emit_lri(batch, lri, ARRAY_SIZE(lri)); + + /* WaMediaPoolStateCmdInWABB:bxt,glk */ + if (HAS_POOLED_EU(engine->i915)) { + /* + * EU pool configuration is setup along with golden context + * during context initialization. This value depends on + * device type (2x6 or 3x6) and needs to be updated based + * on which subslice is disabled especially for 2x6 + * devices, however it is safe to load default + * configuration of 3x6 device instead of masking off + * corresponding bits because HW ignores bits of a disabled + * subslice and drops down to appropriate config. Please + * see render_state_setup() in i915_gem_render_state.c for + * possible configurations, to avoid duplication they are + * not shown here again. + */ + *batch++ = GEN9_MEDIA_POOL_STATE; + *batch++ = GEN9_MEDIA_POOL_ENABLE; + *batch++ = 0x00777000; + *batch++ = 0; + *batch++ = 0; + *batch++ = 0; + } + + *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; + + /* Pad to end of cacheline */ + while ((unsigned long)batch % CACHELINE_BYTES) + *batch++ = MI_NOOP; + + return batch; +} + +static u32 * +gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) +{ + int i; + + /* + * WaPipeControlBefore3DStateSamplePattern: cnl + * + * Ensure the engine is idle prior to programming a + * 3DSTATE_SAMPLE_PATTERN during a context restore. + */ + batch = gen8_emit_pipe_control(batch, + PIPE_CONTROL_CS_STALL, + 0); + /* + * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for + * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in + * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is + * confusing. Since gen8_emit_pipe_control() already advances the + * batch by 6 dwords, we advance the other 10 here, completing a + * cacheline. It's not clear if the workaround requires this padding + * before other commands, or if it's just the regular padding we would + * already have for the workaround bb, so leave it here for now. 
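+ * (6 dwords of PIPE_CONTROL + 10 dwords of MI_NOOP = 16 dwords, i.e.
+ * 64 bytes, exactly one cacheline)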
+ */ + for (i = 0; i < 10; i++) + *batch++ = MI_NOOP; + + /* Pad to end of cacheline */ + while ((unsigned long)batch % CACHELINE_BYTES) + *batch++ = MI_NOOP; + + return batch; +} + +#define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE) + +static int lrc_setup_wa_ctx(struct intel_engine_cs *engine) +{ + struct drm_i915_gem_object *obj; + struct i915_vma *vma; + int err; + + obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE); + if (IS_ERR(obj)) + return PTR_ERR(obj); + + vma = i915_vma_instance(obj, &engine->i915->ggtt.vm, NULL); + if (IS_ERR(vma)) { + err = PTR_ERR(vma); + goto err; + } + + err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL | PIN_HIGH); + if (err) + goto err; + + engine->wa_ctx.vma = vma; + return 0; + +err: + i915_gem_object_put(obj); + return err; +} + +static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine) +{ + i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0); +} + +typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch); + +static int intel_init_workaround_bb(struct intel_engine_cs *engine) +{ + struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx; + struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx, + &wa_ctx->per_ctx }; + wa_bb_func_t wa_bb_fn[2]; + struct page *page; + void *batch, *batch_ptr; + unsigned int i; + int ret; + + if (engine->class != RENDER_CLASS) + return 0; + + switch (INTEL_GEN(engine->i915)) { + case 11: + return 0; + case 10: + wa_bb_fn[0] = gen10_init_indirectctx_bb; + wa_bb_fn[1] = NULL; + break; + case 9: + wa_bb_fn[0] = gen9_init_indirectctx_bb; + wa_bb_fn[1] = NULL; + break; + case 8: + wa_bb_fn[0] = gen8_init_indirectctx_bb; + wa_bb_fn[1] = NULL; + break; + default: + MISSING_CASE(INTEL_GEN(engine->i915)); + return 0; + } + + ret = lrc_setup_wa_ctx(engine); + if (ret) { + DRM_DEBUG_DRIVER("Failed to setup context WA page: %d\n", ret); + return ret; + } + + page = i915_gem_object_get_dirty_page(wa_ctx->vma->obj, 0); + batch = batch_ptr = kmap_atomic(page); + + /* + * Emit the two workaround batch buffers, recording the offset from the + * start of the workaround batch buffer object for each and their + * respective sizes. 
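+ * Both batches share the same single-page object (CTX_WA_BB_OBJ_SIZE)
+ * and each must start on a cacheline boundary, which the alignment
+ * check below enforces.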
+ */ + for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) { + wa_bb[i]->offset = batch_ptr - batch; + if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset, + CACHELINE_BYTES))) { + ret = -EINVAL; + break; + } + if (wa_bb_fn[i]) + batch_ptr = wa_bb_fn[i](engine, batch_ptr); + wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset); + } + + BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE); + + kunmap_atomic(batch); + if (ret) + lrc_destroy_wa_ctx(engine); + + return ret; +} + +static void enable_execlists(struct intel_engine_cs *engine) +{ + intel_engine_set_hwsp_writemask(engine, ~0u); /* HWSTAM */ + + if (INTEL_GEN(engine->i915) >= 11) + ENGINE_WRITE(engine, + RING_MODE_GEN7, + _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE)); + else + ENGINE_WRITE(engine, + RING_MODE_GEN7, + _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE)); + + ENGINE_WRITE(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING)); + + ENGINE_WRITE(engine, + RING_HWS_PGA, + i915_ggtt_offset(engine->status_page.vma)); + ENGINE_POSTING_READ(engine, RING_HWS_PGA); +} + +static bool unexpected_starting_state(struct intel_engine_cs *engine) +{ + bool unexpected = false; + + if (ENGINE_READ(engine, RING_MI_MODE) & STOP_RING) { + DRM_DEBUG_DRIVER("STOP_RING still set in RING_MI_MODE\n"); + unexpected = true; + } + + return unexpected; +} + +static int execlists_resume(struct intel_engine_cs *engine) +{ + intel_engine_apply_workarounds(engine); + intel_engine_apply_whitelist(engine); + + intel_mocs_init_engine(engine); + + intel_engine_reset_breadcrumbs(engine); + + if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) { + struct drm_printer p = drm_debug_printer(__func__); + + intel_engine_dump(engine, &p, NULL); + } + + enable_execlists(engine); + + return 0; +} + +static void execlists_reset_prepare(struct intel_engine_cs *engine) +{ + struct intel_engine_execlists * const execlists = &engine->execlists; + unsigned long flags; + + GEM_TRACE("%s: depth<-%d\n", engine->name, + atomic_read(&execlists->tasklet.count)); + + /* + * Prevent request submission to the hardware until we have + * completed the reset in i915_gem_reset_finish(). If a request + * is completed by one engine, it may then queue a request + * to a second via its execlists->tasklet *just* as we are + * calling engine->resume() and also writing the ELSP. + * Turning off the execlists->tasklet until the reset is over + * prevents the race. + */ + __tasklet_disable_sync_once(&execlists->tasklet); + GEM_BUG_ON(!reset_in_progress(execlists)); + + intel_engine_stop_cs(engine); + + /* And flush any current direct submission. */ + spin_lock_irqsave(&engine->active.lock, flags); + spin_unlock_irqrestore(&engine->active.lock, flags); +} + +static bool lrc_regs_ok(const struct i915_request *rq) +{ + const struct intel_ring *ring = rq->ring; + const u32 *regs = rq->hw_context->lrc_reg_state; + + /* Quick spot check for the common signs of context corruption */ + + if (regs[CTX_RING_BUFFER_CONTROL + 1] != + (RING_CTL_SIZE(ring->size) | RING_VALID)) + return false; + + if (regs[CTX_RING_BUFFER_START + 1] != i915_ggtt_offset(ring->vma)) + return false; + + return true; +} + +static void reset_csb_pointers(struct intel_engine_execlists *execlists) +{ + const unsigned int reset_value = execlists->csb_size - 1; + + /* + * After a reset, the HW starts writing into CSB entry [0]. We + * therefore have to set our HEAD pointer back one entry so that + * the *first* entry we check is entry 0. 
To complicate this further, + * as we don't wait for the first interrupt after reset, we have to + * fake the HW write to point back to the last entry so that our + * inline comparison of our cached head position against the last HW + * write works even before the first interrupt. + */ + execlists->csb_head = reset_value; + WRITE_ONCE(*execlists->csb_write, reset_value); + wmb(); /* Make sure this is visible to HW (paranoia?) */ + + invalidate_csb_entries(&execlists->csb_status[0], + &execlists->csb_status[reset_value]); +} + +static struct i915_request *active_request(struct i915_request *rq) +{ + const struct list_head * const list = &rq->engine->active.requests; + const struct intel_context * const context = rq->hw_context; + struct i915_request *active = NULL; + + list_for_each_entry_from_reverse(rq, list, sched.link) { + if (i915_request_completed(rq)) + break; + + if (rq->hw_context != context) + break; + + active = rq; + } + + return active; +} + +static void __execlists_reset(struct intel_engine_cs *engine, bool stalled) +{ + struct intel_engine_execlists * const execlists = &engine->execlists; + struct intel_context *ce; + struct i915_request *rq; + u32 *regs; + + process_csb(engine); /* drain preemption events */ + + /* Following the reset, we need to reload the CSB read/write pointers */ + reset_csb_pointers(&engine->execlists); + + /* + * Save the currently executing context, even if we completed + * its request, it was still running at the time of the + * reset and will have been clobbered. + */ + if (!port_isset(execlists->port)) + goto out_clear; + + rq = port_request(execlists->port); + ce = rq->hw_context; + + /* + * Catch up with any missed context-switch interrupts. + * + * Ideally we would just read the remaining CSB entries now that we + * know the gpu is idle. However, the CSB registers are sometimes^W + * often trashed across a GPU reset! Instead we have to rely on + * guessing the missed context-switch events by looking at what + * requests were completed. + */ + execlists_cancel_port_requests(execlists); + + rq = active_request(rq); + if (!rq) + goto out_replay; + + /* + * If this request hasn't started yet, e.g. it is waiting on a + * semaphore, we need to avoid skipping the request or else we + * break the signaling chain. However, if the context is corrupt + * the request will not restart and we will be stuck with a wedged + * device. It is quite often the case that if we issue a reset + * while the GPU is loading the context image, that the context + * image becomes corrupt. + * + * Otherwise, if we have not started yet, the request should replay + * perfectly and we do not need to flag the result as being erroneous. + */ + if (!i915_request_started(rq) && lrc_regs_ok(rq)) + goto out_replay; + + /* + * If the request was innocent, we leave the request in the ELSP + * and will try to replay it on restarting. The context image may + * have been corrupted by the reset, in which case we may have + * to service a new GPU hang, but more likely we can continue on + * without impact. + * + * If the request was guilty, we presume the context is corrupt + * and have to at least restore the RING register in the context + * image back to the expected values to skip over the guilty request. + */ + i915_reset_request(rq, stalled); + if (!stalled && lrc_regs_ok(rq)) + goto out_replay; + + /* + * We want a simple context + ring to execute the breadcrumb update. 
+ * We cannot rely on the context being intact across the GPU hang, + * so clear it and rebuild just what we need for the breadcrumb. + * All pending requests for this context will be zapped, and any + * future request will be after userspace has had the opportunity + * to recreate its own state. + */ + regs = ce->lrc_reg_state; + if (engine->pinned_default_state) { + memcpy(regs, /* skip restoring the vanilla PPHWSP */ + engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE, + engine->context_size - PAGE_SIZE); + } + execlists_init_reg_state(regs, ce, engine, ce->ring); + +out_replay: + /* Rerun the request; its payload has been neutered (if guilty). */ + ce->ring->head = + rq ? intel_ring_wrap(ce->ring, rq->head) : ce->ring->tail; + intel_ring_update_space(ce->ring); + __execlists_update_reg_state(ce, engine); + + /* Push back any incomplete requests for replay after the reset. */ + __unwind_incomplete_requests(engine); + +out_clear: + execlists_clear_all_active(execlists); +} + +static void execlists_reset(struct intel_engine_cs *engine, bool stalled) +{ + unsigned long flags; + + GEM_TRACE("%s\n", engine->name); + + spin_lock_irqsave(&engine->active.lock, flags); + + __execlists_reset(engine, stalled); + + spin_unlock_irqrestore(&engine->active.lock, flags); +} + +static void nop_submission_tasklet(unsigned long data) +{ + /* The driver is wedged; don't process any more events. */ +} + +static void execlists_cancel_requests(struct intel_engine_cs *engine) +{ + struct intel_engine_execlists * const execlists = &engine->execlists; + struct i915_request *rq, *rn; + struct rb_node *rb; + unsigned long flags; + + GEM_TRACE("%s\n", engine->name); + + /* + * Before we call engine->cancel_requests(), we should have exclusive + * access to the submission state. This is arranged for us by the + * caller disabling the interrupt generation, the tasklet and other + * threads that may then access the same state, giving us a free hand + * to reset state. However, we still need to let lockdep be aware that + * we know this state may be accessed in hardirq context, so we + * disable the irq around this manipulation and we want to keep + * the spinlock focused on its duties and not accidentally conflate + * coverage to the submission's irq state. (Similarly, although we + * shouldn't need to disable irq around the manipulation of the + * submission's irq state, we also wish to remind ourselves that + * it is irq state.) + */ + spin_lock_irqsave(&engine->active.lock, flags); + + __execlists_reset(engine, true); + + /* Mark all executing requests as skipped. */ + list_for_each_entry(rq, &engine->active.requests, sched.link) { + if (!i915_request_signaled(rq)) + dma_fence_set_error(&rq->fence, -EIO); + + i915_request_mark_complete(rq); + } + + /* Flush the queued requests to the timeline list (for retiring). 
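+ * Each queued request is submitted, flagged with -EIO and marked
+ * complete so that it retires without ever reaching the hardware.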
*/ + while ((rb = rb_first_cached(&execlists->queue))) { + struct i915_priolist *p = to_priolist(rb); + int i; + + priolist_for_each_request_consume(rq, rn, p, i) { + list_del_init(&rq->sched.link); + __i915_request_submit(rq); + dma_fence_set_error(&rq->fence, -EIO); + i915_request_mark_complete(rq); + } + + rb_erase_cached(&p->node, &execlists->queue); + i915_priolist_free(p); + } + + /* Cancel all attached virtual engines */ + while ((rb = rb_first_cached(&execlists->virtual))) { + struct virtual_engine *ve = + rb_entry(rb, typeof(*ve), nodes[engine->id].rb); + + rb_erase_cached(rb, &execlists->virtual); + RB_CLEAR_NODE(rb); + + spin_lock(&ve->base.active.lock); + if (ve->request) { + ve->request->engine = engine; + __i915_request_submit(ve->request); + dma_fence_set_error(&ve->request->fence, -EIO); + i915_request_mark_complete(ve->request); + ve->base.execlists.queue_priority_hint = INT_MIN; + ve->request = NULL; + } + spin_unlock(&ve->base.active.lock); + } + + /* Remaining _unready_ requests will be nop'ed when submitted */ + + execlists->queue_priority_hint = INT_MIN; + execlists->queue = RB_ROOT_CACHED; + GEM_BUG_ON(port_isset(execlists->port)); + + GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet)); + execlists->tasklet.func = nop_submission_tasklet; + + spin_unlock_irqrestore(&engine->active.lock, flags); +} + +static void execlists_reset_finish(struct intel_engine_cs *engine) +{ + struct intel_engine_execlists * const execlists = &engine->execlists; + + /* + * After a GPU reset, we may have requests to replay. Do so now while + * we still have the forcewake to be sure that the GPU is not allowed + * to sleep before we restart and reload a context. + */ + GEM_BUG_ON(!reset_in_progress(execlists)); + if (!RB_EMPTY_ROOT(&execlists->queue.rb_root)) + execlists->tasklet.func(execlists->tasklet.data); + + if (__tasklet_enable(&execlists->tasklet)) + /* And kick in case we missed a new request submission. */ + tasklet_hi_schedule(&execlists->tasklet); + GEM_TRACE("%s: depth->%d\n", engine->name, + atomic_read(&execlists->tasklet.count)); +} + +static int gen8_emit_bb_start(struct i915_request *rq, + u64 offset, u32 len, + const unsigned int flags) +{ + u32 *cs; + + cs = intel_ring_begin(rq, 4); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + /* + * WaDisableCtxRestoreArbitration:bdw,chv + * + * We don't need to perform MI_ARB_ENABLE as often as we do (in + * particular all the gen that do not need the w/a at all!), if we + * took care to make sure that on every switch into this context + * (both ordinary and for preemption) that arbitrartion was enabled + * we would be fine. However, for gen8 there is another w/a that + * requires us to not preempt inside GPGPU execution, so we keep + * arbitration disabled for gen8 batches. Arbitration will be + * re-enabled before we close the request + * (engine->emit_fini_breadcrumb). + */ + *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; + + /* FIXME(BDW+): Address space and security selectors. */ + *cs++ = MI_BATCH_BUFFER_START_GEN8 | + (flags & I915_DISPATCH_SECURE ? 0 : BIT(8)); + *cs++ = lower_32_bits(offset); + *cs++ = upper_32_bits(offset); + + intel_ring_advance(rq, cs); + + return 0; +} + +static int gen9_emit_bb_start(struct i915_request *rq, + u64 offset, u32 len, + const unsigned int flags) +{ + u32 *cs; + + cs = intel_ring_begin(rq, 6); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; + + *cs++ = MI_BATCH_BUFFER_START_GEN8 | + (flags & I915_DISPATCH_SECURE ? 
0 : BIT(8)); + *cs++ = lower_32_bits(offset); + *cs++ = upper_32_bits(offset); + + *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; + *cs++ = MI_NOOP; + + intel_ring_advance(rq, cs); + + return 0; +} + +static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine) +{ + ENGINE_WRITE(engine, RING_IMR, + ~(engine->irq_enable_mask | engine->irq_keep_mask)); + ENGINE_POSTING_READ(engine, RING_IMR); +} + +static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine) +{ + ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask); +} + +static int gen8_emit_flush(struct i915_request *request, u32 mode) +{ + u32 cmd, *cs; + + cs = intel_ring_begin(request, 4); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + cmd = MI_FLUSH_DW + 1; + + /* We always require a command barrier so that subsequent + * commands, such as breadcrumb interrupts, are strictly ordered + * wrt the contents of the write cache being flushed to memory + * (and thus being coherent from the CPU). + */ + cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW; + + if (mode & EMIT_INVALIDATE) { + cmd |= MI_INVALIDATE_TLB; + if (request->engine->class == VIDEO_DECODE_CLASS) + cmd |= MI_INVALIDATE_BSD; + } + + *cs++ = cmd; + *cs++ = I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT; + *cs++ = 0; /* upper addr */ + *cs++ = 0; /* value */ + intel_ring_advance(request, cs); + + return 0; +} + +static int gen8_emit_flush_render(struct i915_request *request, + u32 mode) +{ + struct intel_engine_cs *engine = request->engine; + u32 scratch_addr = + i915_scratch_offset(engine->i915) + 2 * CACHELINE_BYTES; + bool vf_flush_wa = false, dc_flush_wa = false; + u32 *cs, flags = 0; + int len; + + flags |= PIPE_CONTROL_CS_STALL; + + if (mode & EMIT_FLUSH) { + flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; + flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; + flags |= PIPE_CONTROL_DC_FLUSH_ENABLE; + flags |= PIPE_CONTROL_FLUSH_ENABLE; + } + + if (mode & EMIT_INVALIDATE) { + flags |= PIPE_CONTROL_TLB_INVALIDATE; + flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; + flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; + flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; + flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; + flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; + flags |= PIPE_CONTROL_QW_WRITE; + flags |= PIPE_CONTROL_GLOBAL_GTT_IVB; + + /* + * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL + * pipe control. + */ + if (IS_GEN(request->i915, 9)) + vf_flush_wa = true; + + /* WaForGAMHang:kbl */ + if (IS_KBL_REVID(request->i915, 0, KBL_REVID_B0)) + dc_flush_wa = true; + } + + len = 6; + + if (vf_flush_wa) + len += 6; + + if (dc_flush_wa) + len += 12; + + cs = intel_ring_begin(request, len); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + if (vf_flush_wa) + cs = gen8_emit_pipe_control(cs, 0, 0); + + if (dc_flush_wa) + cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE, + 0); + + cs = gen8_emit_pipe_control(cs, flags, scratch_addr); + + if (dc_flush_wa) + cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0); + + intel_ring_advance(request, cs); + + return 0; +} + +/* + * Reserve space for 2 NOOPs at the end of each request to be + * used as a workaround for not being allowed to do lite + * restore with HEAD==TAIL (WaIdleLiteRestore). + */ +static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs) +{ + /* Ensure there's always at least one preemption point per-request. 
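+ * The MI_ARB_CHECK emitted below provides an arbitration point in the
+ * request tail even when the batch itself ran with arbitration
+ * disabled (see gen8_emit_bb_start()).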
*/ + *cs++ = MI_ARB_CHECK; + *cs++ = MI_NOOP; + request->wa_tail = intel_ring_offset(request, cs); + + return cs; +} + +static u32 *gen8_emit_fini_breadcrumb(struct i915_request *request, u32 *cs) +{ + cs = gen8_emit_ggtt_write(cs, + request->fence.seqno, + request->timeline->hwsp_offset, + 0); + + *cs++ = MI_USER_INTERRUPT; + *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; + + request->tail = intel_ring_offset(request, cs); + assert_ring_tail_valid(request->ring, request->tail); + + return gen8_emit_wa_tail(request, cs); +} + +static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) +{ + /* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */ + cs = gen8_emit_ggtt_write_rcs(cs, + request->fence.seqno, + request->timeline->hwsp_offset, + PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | + PIPE_CONTROL_DEPTH_CACHE_FLUSH | + PIPE_CONTROL_DC_FLUSH_ENABLE); + cs = gen8_emit_pipe_control(cs, + PIPE_CONTROL_FLUSH_ENABLE | + PIPE_CONTROL_CS_STALL, + 0); + + *cs++ = MI_USER_INTERRUPT; + *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; + + request->tail = intel_ring_offset(request, cs); + assert_ring_tail_valid(request->ring, request->tail); + + return gen8_emit_wa_tail(request, cs); +} + +static int gen8_init_rcs_context(struct i915_request *rq) +{ + int ret; + + ret = intel_engine_emit_ctx_wa(rq); + if (ret) + return ret; + + ret = intel_rcs_context_init_mocs(rq); + /* + * Failing to program the MOCS is non-fatal.The system will not + * run at peak performance. So generate an error and carry on. + */ + if (ret) + DRM_ERROR("MOCS failed to program: expect performance issues.\n"); + + return i915_gem_render_state_emit(rq); +} + +static void execlists_park(struct intel_engine_cs *engine) +{ + intel_engine_park(engine); +} + +void intel_execlists_set_default_submission(struct intel_engine_cs *engine) +{ + engine->submit_request = execlists_submit_request; + engine->cancel_requests = execlists_cancel_requests; + engine->schedule = i915_schedule; + engine->execlists.tasklet.func = execlists_submission_tasklet; + + engine->reset.prepare = execlists_reset_prepare; + engine->reset.reset = execlists_reset; + engine->reset.finish = execlists_reset_finish; + + engine->park = execlists_park; + engine->unpark = NULL; + + engine->flags |= I915_ENGINE_SUPPORTS_STATS; + if (!intel_vgpu_active(engine->i915)) + engine->flags |= I915_ENGINE_HAS_SEMAPHORES; + if (engine->preempt_context && + HAS_LOGICAL_RING_PREEMPTION(engine->i915)) + engine->flags |= I915_ENGINE_HAS_PREEMPTION; +} + +static void execlists_destroy(struct intel_engine_cs *engine) +{ + intel_engine_cleanup_common(engine); + lrc_destroy_wa_ctx(engine); + kfree(engine); +} + +static void +logical_ring_default_vfuncs(struct intel_engine_cs *engine) +{ + /* Default vfuncs which can be overriden by each engine. 
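+ * (the render class, for example, overrides emit_flush and
+ * emit_fini_breadcrumb again in intel_execlists_submission_setup())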
*/ + + engine->destroy = execlists_destroy; + engine->resume = execlists_resume; + + engine->reset.prepare = execlists_reset_prepare; + engine->reset.reset = execlists_reset; + engine->reset.finish = execlists_reset_finish; + + engine->cops = &execlists_context_ops; + engine->request_alloc = execlists_request_alloc; + + engine->emit_flush = gen8_emit_flush; + engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb; + engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb; + + engine->set_default_submission = intel_execlists_set_default_submission; + + if (INTEL_GEN(engine->i915) < 11) { + engine->irq_enable = gen8_logical_ring_enable_irq; + engine->irq_disable = gen8_logical_ring_disable_irq; + } else { + /* + * TODO: On Gen11 interrupt masks need to be clear + * to allow C6 entry. Keep interrupts enabled at + * and take the hit of generating extra interrupts + * until a more refined solution exists. + */ + } + if (IS_GEN(engine->i915, 8)) + engine->emit_bb_start = gen8_emit_bb_start; + else + engine->emit_bb_start = gen9_emit_bb_start; +} + +static inline void +logical_ring_default_irqs(struct intel_engine_cs *engine) +{ + unsigned int shift = 0; + + if (INTEL_GEN(engine->i915) < 11) { + const u8 irq_shifts[] = { + [RCS0] = GEN8_RCS_IRQ_SHIFT, + [BCS0] = GEN8_BCS_IRQ_SHIFT, + [VCS0] = GEN8_VCS0_IRQ_SHIFT, + [VCS1] = GEN8_VCS1_IRQ_SHIFT, + [VECS0] = GEN8_VECS_IRQ_SHIFT, + }; + + shift = irq_shifts[engine->id]; + } + + engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift; + engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift; +} + +int intel_execlists_submission_setup(struct intel_engine_cs *engine) +{ + /* Intentionally left blank. */ + engine->buffer = NULL; + + tasklet_init(&engine->execlists.tasklet, + execlists_submission_tasklet, (unsigned long)engine); + + logical_ring_default_vfuncs(engine); + logical_ring_default_irqs(engine); + + if (engine->class == RENDER_CLASS) { + engine->init_context = gen8_init_rcs_context; + engine->emit_flush = gen8_emit_flush_render; + engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs; + } + + return 0; +} + +int intel_execlists_submission_init(struct intel_engine_cs *engine) +{ + struct intel_engine_execlists * const execlists = &engine->execlists; + struct drm_i915_private *i915 = engine->i915; + struct intel_uncore *uncore = engine->uncore; + u32 base = engine->mmio_base; + int ret; + + ret = intel_engine_init_common(engine); + if (ret) + return ret; + + intel_engine_init_workarounds(engine); + intel_engine_init_whitelist(engine); + + if (intel_init_workaround_bb(engine)) + /* + * We continue even if we fail to initialize WA batch + * because we only expect rare glitches but nothing + * critical to prevent us from using GPU + */ + DRM_ERROR("WA batch buffer initialization failed\n"); + + if (HAS_LOGICAL_RING_ELSQ(i915)) { + execlists->submit_reg = uncore->regs + + i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base)); + execlists->ctrl_reg = uncore->regs + + i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base)); + } else { + execlists->submit_reg = uncore->regs + + i915_mmio_reg_offset(RING_ELSP(base)); + } + + execlists->preempt_complete_status = ~0u; + if (engine->preempt_context) + execlists->preempt_complete_status = + upper_32_bits(engine->preempt_context->lrc_desc); + + execlists->csb_status = + &engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX]; + + execlists->csb_write = + &engine->status_page.addr[intel_hws_csb_write_index(i915)]; + + if (INTEL_GEN(i915) < 11) + execlists->csb_size = GEN8_CSB_ENTRIES; + else + 
execlists->csb_size = GEN11_CSB_ENTRIES; + + reset_csb_pointers(execlists); + + return 0; +} + +static u32 intel_lr_indirect_ctx_offset(struct intel_engine_cs *engine) +{ + u32 indirect_ctx_offset; + + switch (INTEL_GEN(engine->i915)) { + default: + MISSING_CASE(INTEL_GEN(engine->i915)); + /* fall through */ + case 11: + indirect_ctx_offset = + GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; + break; + case 10: + indirect_ctx_offset = + GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; + break; + case 9: + indirect_ctx_offset = + GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; + break; + case 8: + indirect_ctx_offset = + GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; + break; + } + + return indirect_ctx_offset; +} + +static void execlists_init_reg_state(u32 *regs, + struct intel_context *ce, + struct intel_engine_cs *engine, + struct intel_ring *ring) +{ + struct i915_ppgtt *ppgtt = i915_vm_to_ppgtt(ce->gem_context->vm); + bool rcs = engine->class == RENDER_CLASS; + u32 base = engine->mmio_base; + + /* + * A context is actually a big batch buffer with several + * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The + * values we are setting here are only for the first context restore: + * on a subsequent save, the GPU will recreate this batchbuffer with new + * values (including all the missing MI_LOAD_REGISTER_IMM commands that + * we are not initializing here). + * + * Must keep consistent with virtual_update_register_offsets(). + */ + regs[CTX_LRI_HEADER_0] = MI_LOAD_REGISTER_IMM(rcs ? 14 : 11) | + MI_LRI_FORCE_POSTED; + + CTX_REG(regs, CTX_CONTEXT_CONTROL, RING_CONTEXT_CONTROL(base), + _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT) | + _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH)); + if (INTEL_GEN(engine->i915) < 11) { + regs[CTX_CONTEXT_CONTROL + 1] |= + _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT | + CTX_CTRL_RS_CTX_ENABLE); + } + CTX_REG(regs, CTX_RING_HEAD, RING_HEAD(base), 0); + CTX_REG(regs, CTX_RING_TAIL, RING_TAIL(base), 0); + CTX_REG(regs, CTX_RING_BUFFER_START, RING_START(base), 0); + CTX_REG(regs, CTX_RING_BUFFER_CONTROL, RING_CTL(base), + RING_CTL_SIZE(ring->size) | RING_VALID); + CTX_REG(regs, CTX_BB_HEAD_U, RING_BBADDR_UDW(base), 0); + CTX_REG(regs, CTX_BB_HEAD_L, RING_BBADDR(base), 0); + CTX_REG(regs, CTX_BB_STATE, RING_BBSTATE(base), RING_BB_PPGTT); + CTX_REG(regs, CTX_SECOND_BB_HEAD_U, RING_SBBADDR_UDW(base), 0); + CTX_REG(regs, CTX_SECOND_BB_HEAD_L, RING_SBBADDR(base), 0); + CTX_REG(regs, CTX_SECOND_BB_STATE, RING_SBBSTATE(base), 0); + if (rcs) { + struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx; + + CTX_REG(regs, CTX_RCS_INDIRECT_CTX, RING_INDIRECT_CTX(base), 0); + CTX_REG(regs, CTX_RCS_INDIRECT_CTX_OFFSET, + RING_INDIRECT_CTX_OFFSET(base), 0); + if (wa_ctx->indirect_ctx.size) { + u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma); + + regs[CTX_RCS_INDIRECT_CTX + 1] = + (ggtt_offset + wa_ctx->indirect_ctx.offset) | + (wa_ctx->indirect_ctx.size / CACHELINE_BYTES); + + regs[CTX_RCS_INDIRECT_CTX_OFFSET + 1] = + intel_lr_indirect_ctx_offset(engine) << 6; + } + + CTX_REG(regs, CTX_BB_PER_CTX_PTR, RING_BB_PER_CTX_PTR(base), 0); + if (wa_ctx->per_ctx.size) { + u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma); + + regs[CTX_BB_PER_CTX_PTR + 1] = + (ggtt_offset + wa_ctx->per_ctx.offset) | 0x01; + } + } + + regs[CTX_LRI_HEADER_1] = MI_LOAD_REGISTER_IMM(9) | MI_LRI_FORCE_POSTED; + + CTX_REG(regs, CTX_CTX_TIMESTAMP, RING_CTX_TIMESTAMP(base), 0); + /* PDP values well be assigned later if needed */ + CTX_REG(regs, CTX_PDP3_UDW, GEN8_RING_PDP_UDW(base, 3), 0); + 
CTX_REG(regs, CTX_PDP3_LDW, GEN8_RING_PDP_LDW(base, 3), 0); + CTX_REG(regs, CTX_PDP2_UDW, GEN8_RING_PDP_UDW(base, 2), 0); + CTX_REG(regs, CTX_PDP2_LDW, GEN8_RING_PDP_LDW(base, 2), 0); + CTX_REG(regs, CTX_PDP1_UDW, GEN8_RING_PDP_UDW(base, 1), 0); + CTX_REG(regs, CTX_PDP1_LDW, GEN8_RING_PDP_LDW(base, 1), 0); + CTX_REG(regs, CTX_PDP0_UDW, GEN8_RING_PDP_UDW(base, 0), 0); + CTX_REG(regs, CTX_PDP0_LDW, GEN8_RING_PDP_LDW(base, 0), 0); + + if (i915_vm_is_4lvl(&ppgtt->vm)) { + /* 64b PPGTT (48bit canonical) + * PDP0_DESCRIPTOR contains the base address to PML4 and + * other PDP Descriptors are ignored. + */ + ASSIGN_CTX_PML4(ppgtt, regs); + } else { + ASSIGN_CTX_PDP(ppgtt, regs, 3); + ASSIGN_CTX_PDP(ppgtt, regs, 2); + ASSIGN_CTX_PDP(ppgtt, regs, 1); + ASSIGN_CTX_PDP(ppgtt, regs, 0); + } + + if (rcs) { + regs[CTX_LRI_HEADER_2] = MI_LOAD_REGISTER_IMM(1); + CTX_REG(regs, CTX_R_PWR_CLK_STATE, GEN8_R_PWR_CLK_STATE, 0); + + i915_oa_init_reg_state(engine, ce, regs); + } + + regs[CTX_END] = MI_BATCH_BUFFER_END; + if (INTEL_GEN(engine->i915) >= 10) + regs[CTX_END] |= BIT(0); +} + +static int +populate_lr_context(struct intel_context *ce, + struct drm_i915_gem_object *ctx_obj, + struct intel_engine_cs *engine, + struct intel_ring *ring) +{ + void *vaddr; + u32 *regs; + int ret; + + vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB); + if (IS_ERR(vaddr)) { + ret = PTR_ERR(vaddr); + DRM_DEBUG_DRIVER("Could not map object pages! (%d)\n", ret); + return ret; + } + + if (engine->default_state) { + /* + * We only want to copy over the template context state; + * skipping over the headers reserved for GuC communication, + * leaving those as zero. + */ + const unsigned long start = LRC_HEADER_PAGES * PAGE_SIZE; + void *defaults; + + defaults = i915_gem_object_pin_map(engine->default_state, + I915_MAP_WB); + if (IS_ERR(defaults)) { + ret = PTR_ERR(defaults); + goto err_unpin_ctx; + } + + memcpy(vaddr + start, defaults + start, engine->context_size); + i915_gem_object_unpin_map(engine->default_state); + } + + /* The second page of the context object contains some fields which must + * be set up prior to the first execution. */ + regs = vaddr + LRC_STATE_PN * PAGE_SIZE; + execlists_init_reg_state(regs, ce, engine, ring); + if (!engine->default_state) + regs[CTX_CONTEXT_CONTROL + 1] |= + _MASKED_BIT_ENABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT); + if (ce->gem_context == engine->i915->preempt_context && + INTEL_GEN(engine->i915) < 11) + regs[CTX_CONTEXT_CONTROL + 1] |= + _MASKED_BIT_ENABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT | + CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT); + + ret = 0; +err_unpin_ctx: + __i915_gem_object_flush_map(ctx_obj, + LRC_HEADER_PAGES * PAGE_SIZE, + engine->context_size); + i915_gem_object_unpin_map(ctx_obj); + return ret; +} + +static struct i915_timeline *get_timeline(struct i915_gem_context *ctx) +{ + if (ctx->timeline) + return i915_timeline_get(ctx->timeline); + else + return i915_timeline_create(ctx->i915, NULL); +} + +static int execlists_context_deferred_alloc(struct intel_context *ce, + struct intel_engine_cs *engine) +{ + struct drm_i915_gem_object *ctx_obj; + struct i915_vma *vma; + u32 context_size; + struct intel_ring *ring; + struct i915_timeline *timeline; + int ret; + + if (ce->state) + return 0; + + context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE); + + /* + * Before the actual start of the context image, we insert a few pages + * for our own use and for sharing with the GuC. 
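+ * These LRC_HEADER_PAGES are skipped again in populate_lr_context()
+ * when copying the default context image, so they remain zeroed.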
+ */ + context_size += LRC_HEADER_PAGES * PAGE_SIZE; + + ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size); + if (IS_ERR(ctx_obj)) + return PTR_ERR(ctx_obj); + + vma = i915_vma_instance(ctx_obj, &engine->i915->ggtt.vm, NULL); + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); + goto error_deref_obj; + } + + timeline = get_timeline(ce->gem_context); + if (IS_ERR(timeline)) { + ret = PTR_ERR(timeline); + goto error_deref_obj; + } + + ring = intel_engine_create_ring(engine, + timeline, + ce->gem_context->ring_size); + i915_timeline_put(timeline); + if (IS_ERR(ring)) { + ret = PTR_ERR(ring); + goto error_deref_obj; + } + + ret = populate_lr_context(ce, ctx_obj, engine, ring); + if (ret) { + DRM_DEBUG_DRIVER("Failed to populate LRC: %d\n", ret); + goto error_ring_free; + } + + ce->ring = ring; + ce->state = vma; + + return 0; + +error_ring_free: + intel_ring_put(ring); +error_deref_obj: + i915_gem_object_put(ctx_obj); + return ret; +} + +static struct list_head *virtual_queue(struct virtual_engine *ve) +{ + return &ve->base.execlists.default_priolist.requests[0]; +} + +static void virtual_context_destroy(struct kref *kref) +{ + struct virtual_engine *ve = + container_of(kref, typeof(*ve), context.ref); + unsigned int n; + + GEM_BUG_ON(!list_empty(virtual_queue(ve))); + GEM_BUG_ON(ve->request); + GEM_BUG_ON(ve->context.inflight); + + for (n = 0; n < ve->num_siblings; n++) { + struct intel_engine_cs *sibling = ve->siblings[n]; + struct rb_node *node = &ve->nodes[sibling->id].rb; + + if (RB_EMPTY_NODE(node)) + continue; + + spin_lock_irq(&sibling->active.lock); + + /* Detachment is lazily performed in the execlists tasklet */ + if (!RB_EMPTY_NODE(node)) + rb_erase_cached(node, &sibling->execlists.virtual); + + spin_unlock_irq(&sibling->active.lock); + } + GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet)); + + if (ve->context.state) + __execlists_context_fini(&ve->context); + + kfree(ve->bonds); + kfree(ve); +} + +static void virtual_engine_initial_hint(struct virtual_engine *ve) +{ + int swp; + + /* + * Pick a random sibling on starting to help spread the load around. + * + * New contexts are typically created with exactly the same order + * of siblings, and often started in batches. Due to the way we iterate + * the array of sibling when submitting requests, sibling[0] is + * prioritised for dequeuing. If we make sure that sibling[0] is fairly + * randomised across the system, we also help spread the load by the + * first engine we inspect being different each time. + * + * NB This does not force us to execute on this engine, it will just + * typically be the first we inspect for submission. 
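+ * Below we pick a random sibling, swap it into slot 0 and re-point the
+ * context's register state at that engine's mmio offsets.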
+ */ + swp = prandom_u32_max(ve->num_siblings); + if (!swp) + return; + + swap(ve->siblings[swp], ve->siblings[0]); + virtual_update_register_offsets(ve->context.lrc_reg_state, + ve->siblings[0]); +} + +static int virtual_context_pin(struct intel_context *ce) +{ + struct virtual_engine *ve = container_of(ce, typeof(*ve), context); + int err; + + /* Note: we must use a real engine class for setting up reg state */ + err = __execlists_context_pin(ce, ve->siblings[0]); + if (err) + return err; + + virtual_engine_initial_hint(ve); + return 0; +} + +static void virtual_context_enter(struct intel_context *ce) +{ + struct virtual_engine *ve = container_of(ce, typeof(*ve), context); + unsigned int n; + + for (n = 0; n < ve->num_siblings; n++) + intel_engine_pm_get(ve->siblings[n]); +} + +static void virtual_context_exit(struct intel_context *ce) +{ + struct virtual_engine *ve = container_of(ce, typeof(*ve), context); + unsigned int n; + + for (n = 0; n < ve->num_siblings; n++) + intel_engine_pm_put(ve->siblings[n]); +} + +static const struct intel_context_ops virtual_context_ops = { + .pin = virtual_context_pin, + .unpin = execlists_context_unpin, + + .enter = virtual_context_enter, + .exit = virtual_context_exit, + + .destroy = virtual_context_destroy, +}; + +static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve) +{ + struct i915_request *rq; + intel_engine_mask_t mask; + + rq = READ_ONCE(ve->request); + if (!rq) + return 0; + + /* The rq is ready for submission; rq->execution_mask is now stable. */ + mask = rq->execution_mask; + if (unlikely(!mask)) { + /* Invalid selection, submit to a random engine in error */ + i915_request_skip(rq, -ENODEV); + mask = ve->siblings[0]->mask; + } + + GEM_TRACE("%s: rq=%llx:%lld, mask=%x, prio=%d\n", + ve->base.name, + rq->fence.context, rq->fence.seqno, + mask, ve->base.execlists.queue_priority_hint); + + return mask; +} + +static void virtual_submission_tasklet(unsigned long data) +{ + struct virtual_engine * const ve = (struct virtual_engine *)data; + const int prio = ve->base.execlists.queue_priority_hint; + intel_engine_mask_t mask; + unsigned int n; + + rcu_read_lock(); + mask = virtual_submission_mask(ve); + rcu_read_unlock(); + if (unlikely(!mask)) + return; + + local_irq_disable(); + for (n = 0; READ_ONCE(ve->request) && n < ve->num_siblings; n++) { + struct intel_engine_cs *sibling = ve->siblings[n]; + struct ve_node * const node = &ve->nodes[sibling->id]; + struct rb_node **parent, *rb; + bool first; + + if (unlikely(!(mask & sibling->mask))) { + if (!RB_EMPTY_NODE(&node->rb)) { + spin_lock(&sibling->active.lock); + rb_erase_cached(&node->rb, + &sibling->execlists.virtual); + RB_CLEAR_NODE(&node->rb); + spin_unlock(&sibling->active.lock); + } + continue; + } + + spin_lock(&sibling->active.lock); + + if (!RB_EMPTY_NODE(&node->rb)) { + /* + * Cheat and avoid rebalancing the tree if we can + * reuse this node in situ. 
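+ * The node can stay where it is if the priority is unchanged, or if
+ * it is being raised while it is already the leftmost entry for this
+ * sibling.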
+ */ + first = rb_first_cached(&sibling->execlists.virtual) == + &node->rb; + if (prio == node->prio || (prio > node->prio && first)) + goto submit_engine; + + rb_erase_cached(&node->rb, &sibling->execlists.virtual); + } + + rb = NULL; + first = true; + parent = &sibling->execlists.virtual.rb_root.rb_node; + while (*parent) { + struct ve_node *other; + + rb = *parent; + other = rb_entry(rb, typeof(*other), rb); + if (prio > other->prio) { + parent = &rb->rb_left; + } else { + parent = &rb->rb_right; + first = false; + } + } + + rb_link_node(&node->rb, rb, parent); + rb_insert_color_cached(&node->rb, + &sibling->execlists.virtual, + first); + +submit_engine: + GEM_BUG_ON(RB_EMPTY_NODE(&node->rb)); + node->prio = prio; + if (first && prio > sibling->execlists.queue_priority_hint) { + sibling->execlists.queue_priority_hint = prio; + tasklet_hi_schedule(&sibling->execlists.tasklet); + } + + spin_unlock(&sibling->active.lock); + } + local_irq_enable(); +} + +static void virtual_submit_request(struct i915_request *rq) +{ + struct virtual_engine *ve = to_virtual_engine(rq->engine); + + GEM_TRACE("%s: rq=%llx:%lld\n", + ve->base.name, + rq->fence.context, + rq->fence.seqno); + + GEM_BUG_ON(ve->base.submit_request != virtual_submit_request); + + GEM_BUG_ON(ve->request); + GEM_BUG_ON(!list_empty(virtual_queue(ve))); + + ve->base.execlists.queue_priority_hint = rq_prio(rq); + WRITE_ONCE(ve->request, rq); + + list_move_tail(&rq->sched.link, virtual_queue(ve)); + + tasklet_schedule(&ve->base.execlists.tasklet); +} + +static struct ve_bond * +virtual_find_bond(struct virtual_engine *ve, + const struct intel_engine_cs *master) +{ + int i; + + for (i = 0; i < ve->num_bonds; i++) { + if (ve->bonds[i].master == master) + return &ve->bonds[i]; + } + + return NULL; +} + +static void +virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal) +{ + struct virtual_engine *ve = to_virtual_engine(rq->engine); + struct ve_bond *bond; + + bond = virtual_find_bond(ve, to_request(signal)->engine); + if (bond) { + intel_engine_mask_t old, new, cmp; + + cmp = READ_ONCE(rq->execution_mask); + do { + old = cmp; + new = cmp & bond->sibling_mask; + } while ((cmp = cmpxchg(&rq->execution_mask, old, new)) != old); + } +} + +struct intel_context * +intel_execlists_create_virtual(struct i915_gem_context *ctx, + struct intel_engine_cs **siblings, + unsigned int count) +{ + struct virtual_engine *ve; + unsigned int n; + int err; + + if (count == 0) + return ERR_PTR(-EINVAL); + + if (count == 1) + return intel_context_create(ctx, siblings[0]); + + ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL); + if (!ve) + return ERR_PTR(-ENOMEM); + + ve->base.i915 = ctx->i915; + ve->base.id = -1; + ve->base.class = OTHER_CLASS; + ve->base.uabi_class = I915_ENGINE_CLASS_INVALID; + ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL; + ve->base.flags = I915_ENGINE_IS_VIRTUAL; + + /* + * The decision on whether to submit a request using semaphores + * depends on the saturated state of the engine. We only compute + * this during HW submission of the request, and we need for this + * state to be globally applied to all requests being submitted + * to this engine. 
Virtual engines encompass more than one physical + * engine and so we cannot accurately tell in advance if one of those + * engines is already saturated and so cannot afford to use a semaphore + * and be pessimized in priority for doing so -- if we are the only + * context using semaphores after all other clients have stopped, we + * will be starved on the saturated system. Such a global switch for + * semaphores is less than ideal, but alas is the current compromise. + */ + ve->base.saturated = ALL_ENGINES; + + snprintf(ve->base.name, sizeof(ve->base.name), "virtual"); + + intel_engine_init_active(&ve->base, ENGINE_VIRTUAL); + + intel_engine_init_execlists(&ve->base); + + ve->base.cops = &virtual_context_ops; + ve->base.request_alloc = execlists_request_alloc; + + ve->base.schedule = i915_schedule; + ve->base.submit_request = virtual_submit_request; + ve->base.bond_execute = virtual_bond_execute; + + INIT_LIST_HEAD(virtual_queue(ve)); + ve->base.execlists.queue_priority_hint = INT_MIN; + tasklet_init(&ve->base.execlists.tasklet, + virtual_submission_tasklet, + (unsigned long)ve); + + intel_context_init(&ve->context, ctx, &ve->base); + + for (n = 0; n < count; n++) { + struct intel_engine_cs *sibling = siblings[n]; + + GEM_BUG_ON(!is_power_of_2(sibling->mask)); + if (sibling->mask & ve->base.mask) { + DRM_DEBUG("duplicate %s entry in load balancer\n", + sibling->name); + err = -EINVAL; + goto err_put; + } + + /* + * The virtual engine implementation is tightly coupled to + * the execlists backend -- we push out request directly + * into a tree inside each physical engine. We could support + * layering if we handle cloning of the requests and + * submitting a copy into each backend. + */ + if (sibling->execlists.tasklet.func != + execlists_submission_tasklet) { + err = -ENODEV; + goto err_put; + } + + GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb)); + RB_CLEAR_NODE(&ve->nodes[sibling->id].rb); + + ve->siblings[ve->num_siblings++] = sibling; + ve->base.mask |= sibling->mask; + + /* + * All physical engines must be compatible for their emission + * functions (as we build the instructions during request + * construction and do not alter them before submission + * on the physical engine). We use the engine class as a guide + * here, although that could be refined. 
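+ * In practice the class check below enforces this: all siblings must share one engine class (e.g. all video decode engines), so the emit_* vfuncs copied from the first sibling are interchangeable across the set.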
+ */ + if (ve->base.class != OTHER_CLASS) { + if (ve->base.class != sibling->class) { + DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n", + sibling->class, ve->base.class); + err = -EINVAL; + goto err_put; + } + continue; + } + + ve->base.class = sibling->class; + ve->base.uabi_class = sibling->uabi_class; + snprintf(ve->base.name, sizeof(ve->base.name), + "v%dx%d", ve->base.class, count); + ve->base.context_size = sibling->context_size; + + ve->base.emit_bb_start = sibling->emit_bb_start; + ve->base.emit_flush = sibling->emit_flush; + ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb; + ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb; + ve->base.emit_fini_breadcrumb_dw = + sibling->emit_fini_breadcrumb_dw; + } + + return &ve->context; + +err_put: + intel_context_put(&ve->context); + return ERR_PTR(err); +} + +struct intel_context * +intel_execlists_clone_virtual(struct i915_gem_context *ctx, + struct intel_engine_cs *src) +{ + struct virtual_engine *se = to_virtual_engine(src); + struct intel_context *dst; + + dst = intel_execlists_create_virtual(ctx, + se->siblings, + se->num_siblings); + if (IS_ERR(dst)) + return dst; + + if (se->num_bonds) { + struct virtual_engine *de = to_virtual_engine(dst->engine); + + de->bonds = kmemdup(se->bonds, + sizeof(*se->bonds) * se->num_bonds, + GFP_KERNEL); + if (!de->bonds) { + intel_context_put(dst); + return ERR_PTR(-ENOMEM); + } + + de->num_bonds = se->num_bonds; + } + + return dst; +} + +int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine, + const struct intel_engine_cs *master, + const struct intel_engine_cs *sibling) +{ + struct virtual_engine *ve = to_virtual_engine(engine); + struct ve_bond *bond; + int n; + + /* Sanity check the sibling is part of the virtual engine */ + for (n = 0; n < ve->num_siblings; n++) + if (sibling == ve->siblings[n]) + break; + if (n == ve->num_siblings) + return -EINVAL; + + bond = virtual_find_bond(ve, master); + if (bond) { + bond->sibling_mask |= sibling->mask; + return 0; + } + + bond = krealloc(ve->bonds, + sizeof(*bond) * (ve->num_bonds + 1), + GFP_KERNEL); + if (!bond) + return -ENOMEM; + + bond[ve->num_bonds].master = master; + bond[ve->num_bonds].sibling_mask = sibling->mask; + + ve->bonds = bond; + ve->num_bonds++; + + return 0; +} + +void intel_execlists_show_requests(struct intel_engine_cs *engine, + struct drm_printer *m, + void (*show_request)(struct drm_printer *m, + struct i915_request *rq, + const char *prefix), + unsigned int max) +{ + const struct intel_engine_execlists *execlists = &engine->execlists; + struct i915_request *rq, *last; + unsigned long flags; + unsigned int count; + struct rb_node *rb; + + spin_lock_irqsave(&engine->active.lock, flags); + + last = NULL; + count = 0; + list_for_each_entry(rq, &engine->active.requests, sched.link) { + if (count++ < max - 1) + show_request(m, rq, "\t\tE "); + else + last = rq; + } + if (last) { + if (count > max) { + drm_printf(m, + "\t\t...skipping %d executing requests...\n", + count - max); + } + show_request(m, last, "\t\tE "); + } + + last = NULL; + count = 0; + if (execlists->queue_priority_hint != INT_MIN) + drm_printf(m, "\t\tQueue priority hint: %d\n", + execlists->queue_priority_hint); + for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) { + struct i915_priolist *p = rb_entry(rb, typeof(*p), node); + int i; + + priolist_for_each_request(rq, p, i) { + if (count++ < max - 1) + show_request(m, rq, "\t\tQ "); + else + last = rq; + } + } + if (last) { + if (count > max) 
{ + drm_printf(m, + "\t\t...skipping %d queued requests...\n", + count - max); + } + show_request(m, last, "\t\tQ "); + } + + last = NULL; + count = 0; + for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) { + struct virtual_engine *ve = + rb_entry(rb, typeof(*ve), nodes[engine->id].rb); + struct i915_request *rq = READ_ONCE(ve->request); + + if (rq) { + if (count++ < max - 1) + show_request(m, rq, "\t\tV "); + else + last = rq; + } + } + if (last) { + if (count > max) { + drm_printf(m, + "\t\t...skipping %d virtual requests...\n", + count - max); + } + show_request(m, last, "\t\tV "); + } + + spin_unlock_irqrestore(&engine->active.lock, flags); +} + +void intel_lr_context_reset(struct intel_engine_cs *engine, + struct intel_context *ce, + u32 head, + bool scrub) +{ + /* + * We want a simple context + ring to execute the breadcrumb update. + * We cannot rely on the context being intact across the GPU hang, + * so clear it and rebuild just what we need for the breadcrumb. + * All pending requests for this context will be zapped, and any + * future request will be after userspace has had the opportunity + * to recreate its own state. + */ + if (scrub) { + u32 *regs = ce->lrc_reg_state; + + if (engine->pinned_default_state) { + memcpy(regs, /* skip restoring the vanilla PPHWSP */ + engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE, + engine->context_size - PAGE_SIZE); + } + execlists_init_reg_state(regs, ce, engine, ce->ring); + } + + /* Rerun the request; its payload has been neutered (if guilty). */ + ce->ring->head = head; + intel_ring_update_space(ce->ring); + + __execlists_update_reg_state(ce, engine); +} + +#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) +#include "selftest_lrc.c" +#endif diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.h b/drivers/gpu/drm/i915/gt/intel_lrc.h new file mode 100644 index 000000000000..c2bba82bcc16 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_lrc.h @@ -0,0 +1,134 @@ +/* + * Copyright © 2014 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ */ + +#ifndef _INTEL_LRC_H_ +#define _INTEL_LRC_H_ + +#include <linux/types.h> + +struct drm_printer; + +struct drm_i915_private; +struct i915_gem_context; +struct i915_request; +struct intel_context; +struct intel_engine_cs; + +/* Execlists regs */ +#define RING_ELSP(base) _MMIO((base) + 0x230) +#define RING_EXECLIST_STATUS_LO(base) _MMIO((base) + 0x234) +#define RING_EXECLIST_STATUS_HI(base) _MMIO((base) + 0x234 + 4) +#define RING_CONTEXT_CONTROL(base) _MMIO((base) + 0x244) +#define CTX_CTRL_INHIBIT_SYN_CTX_SWITCH (1 << 3) +#define CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT (1 << 0) +#define CTX_CTRL_RS_CTX_ENABLE (1 << 1) +#define CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT (1 << 2) +#define RING_CONTEXT_STATUS_PTR(base) _MMIO((base) + 0x3a0) +#define RING_EXECLIST_SQ_CONTENTS(base) _MMIO((base) + 0x510) +#define RING_EXECLIST_CONTROL(base) _MMIO((base) + 0x550) + +#define EL_CTRL_LOAD (1 << 0) + +/* The docs specify that the write pointer wraps around after 5h, "After status + * is written out to the last available status QW at offset 5h, this pointer + * wraps to 0." + * + * Therefore, one must infer than even though there are 3 bits available, 6 and + * 7 appear to be * reserved. + */ +#define GEN8_CSB_ENTRIES 6 +#define GEN8_CSB_PTR_MASK 0x7 +#define GEN8_CSB_READ_PTR_MASK (GEN8_CSB_PTR_MASK << 8) +#define GEN8_CSB_WRITE_PTR_MASK (GEN8_CSB_PTR_MASK << 0) + +#define GEN11_CSB_ENTRIES 12 +#define GEN11_CSB_PTR_MASK 0xf +#define GEN11_CSB_READ_PTR_MASK (GEN11_CSB_PTR_MASK << 8) +#define GEN11_CSB_WRITE_PTR_MASK (GEN11_CSB_PTR_MASK << 0) + +enum { + INTEL_CONTEXT_SCHEDULE_IN = 0, + INTEL_CONTEXT_SCHEDULE_OUT, + INTEL_CONTEXT_SCHEDULE_PREEMPTED, +}; + +/* Logical Rings */ +void intel_logical_ring_cleanup(struct intel_engine_cs *engine); + +int intel_execlists_submission_setup(struct intel_engine_cs *engine); +int intel_execlists_submission_init(struct intel_engine_cs *engine); + +/* Logical Ring Contexts */ + +/* + * We allocate a header at the start of the context image for our own + * use, therefore the actual location of the logical state is offset + * from the start of the VMA. The layout is + * + * | [guc] | [hwsp] [logical state] | + * |<- our header ->|<- context image ->| + * + */ +/* The first page is used for sharing data with the GuC */ +#define LRC_GUCSHR_PN (0) +#define LRC_GUCSHR_SZ (1) +/* At the start of the context image is its per-process HWS page */ +#define LRC_PPHWSP_PN (LRC_GUCSHR_PN + LRC_GUCSHR_SZ) +#define LRC_PPHWSP_SZ (1) +/* Finally we have the logical state for the context */ +#define LRC_STATE_PN (LRC_PPHWSP_PN + LRC_PPHWSP_SZ) + +/* + * Currently we include the PPHWSP in __intel_engine_context_size() so + * the size of the header is synonymous with the start of the PPHWSP. 
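+ * With the page numbers above, LRC_HEADER_PAGES below therefore evaluates to 1 (just the GuC shared page): the PPHWSP sits at page 1 and the logical register state begins at page LRC_STATE_PN == 2 of the context object.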
+ */ +#define LRC_HEADER_PAGES LRC_PPHWSP_PN + +void intel_execlists_set_default_submission(struct intel_engine_cs *engine); + +void intel_lr_context_reset(struct intel_engine_cs *engine, + struct intel_context *ce, + u32 head, + bool scrub); + +void intel_execlists_show_requests(struct intel_engine_cs *engine, + struct drm_printer *m, + void (*show_request)(struct drm_printer *m, + struct i915_request *rq, + const char *prefix), + unsigned int max); + +struct intel_context * +intel_execlists_create_virtual(struct i915_gem_context *ctx, + struct intel_engine_cs **siblings, + unsigned int count); + +struct intel_context * +intel_execlists_clone_virtual(struct i915_gem_context *ctx, + struct intel_engine_cs *src); + +int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine, + const struct intel_engine_cs *master, + const struct intel_engine_cs *sibling); + +#endif /* _INTEL_LRC_H_ */ diff --git a/drivers/gpu/drm/i915/gt/intel_lrc_reg.h b/drivers/gpu/drm/i915/gt/intel_lrc_reg.h new file mode 100644 index 000000000000..6bf34738b4e5 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_lrc_reg.h @@ -0,0 +1,68 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2014-2018 Intel Corporation + */ + +#ifndef _INTEL_LRC_REG_H_ +#define _INTEL_LRC_REG_H_ + +#include <linux/types.h> + +/* GEN8+ Reg State Context */ +#define CTX_LRI_HEADER_0 0x01 +#define CTX_CONTEXT_CONTROL 0x02 +#define CTX_RING_HEAD 0x04 +#define CTX_RING_TAIL 0x06 +#define CTX_RING_BUFFER_START 0x08 +#define CTX_RING_BUFFER_CONTROL 0x0a +#define CTX_BB_HEAD_U 0x0c +#define CTX_BB_HEAD_L 0x0e +#define CTX_BB_STATE 0x10 +#define CTX_SECOND_BB_HEAD_U 0x12 +#define CTX_SECOND_BB_HEAD_L 0x14 +#define CTX_SECOND_BB_STATE 0x16 +#define CTX_BB_PER_CTX_PTR 0x18 +#define CTX_RCS_INDIRECT_CTX 0x1a +#define CTX_RCS_INDIRECT_CTX_OFFSET 0x1c +#define CTX_LRI_HEADER_1 0x21 +#define CTX_CTX_TIMESTAMP 0x22 +#define CTX_PDP3_UDW 0x24 +#define CTX_PDP3_LDW 0x26 +#define CTX_PDP2_UDW 0x28 +#define CTX_PDP2_LDW 0x2a +#define CTX_PDP1_UDW 0x2c +#define CTX_PDP1_LDW 0x2e +#define CTX_PDP0_UDW 0x30 +#define CTX_PDP0_LDW 0x32 +#define CTX_LRI_HEADER_2 0x41 +#define CTX_R_PWR_CLK_STATE 0x42 +#define CTX_END 0x44 + +#define CTX_REG(reg_state, pos, reg, val) do { \ + u32 *reg_state__ = (reg_state); \ + const u32 pos__ = (pos); \ + (reg_state__)[(pos__) + 0] = i915_mmio_reg_offset(reg); \ + (reg_state__)[(pos__) + 1] = (val); \ +} while (0) + +#define ASSIGN_CTX_PDP(ppgtt, reg_state, n) do { \ + u32 *reg_state__ = (reg_state); \ + const u64 addr__ = i915_page_dir_dma_addr((ppgtt), (n)); \ + (reg_state__)[CTX_PDP ## n ## _UDW + 1] = upper_32_bits(addr__); \ + (reg_state__)[CTX_PDP ## n ## _LDW + 1] = lower_32_bits(addr__); \ +} while (0) + +#define ASSIGN_CTX_PML4(ppgtt, reg_state) do { \ + u32 *reg_state__ = (reg_state); \ + const u64 addr__ = px_dma(ppgtt->pd); \ + (reg_state__)[CTX_PDP0_UDW + 1] = upper_32_bits(addr__); \ + (reg_state__)[CTX_PDP0_LDW + 1] = lower_32_bits(addr__); \ +} while (0) + +#define GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT 0x17 +#define GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT 0x26 +#define GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT 0x19 +#define GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT 0x1A + +#endif /* _INTEL_LRC_REG_H_ */ diff --git a/drivers/gpu/drm/i915/gt/intel_mocs.c b/drivers/gpu/drm/i915/gt/intel_mocs.c new file mode 100644 index 000000000000..1f9db50b1869 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_mocs.c @@ -0,0 +1,574 @@ +/* + * Copyright (c) 2015 Intel Corporation + * + * Permission is hereby granted, 
free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "i915_drv.h" + +#include "intel_engine.h" +#include "intel_mocs.h" +#include "intel_lrc.h" + +/* structures required */ +struct drm_i915_mocs_entry { + u32 control_value; + u16 l3cc_value; + u16 used; +}; + +struct drm_i915_mocs_table { + unsigned int size; + unsigned int n_entries; + const struct drm_i915_mocs_entry *table; +}; + +/* Defines for the tables (XXX_MOCS_0 - XXX_MOCS_63) */ +#define _LE_CACHEABILITY(value) ((value) << 0) +#define _LE_TGT_CACHE(value) ((value) << 2) +#define LE_LRUM(value) ((value) << 4) +#define LE_AOM(value) ((value) << 6) +#define LE_RSC(value) ((value) << 7) +#define LE_SCC(value) ((value) << 8) +#define LE_PFM(value) ((value) << 11) +#define LE_SCF(value) ((value) << 14) +#define LE_COS(value) ((value) << 15) +#define LE_SSE(value) ((value) << 17) + +/* Defines for the tables (LNCFMOCS0 - LNCFMOCS31) - two entries per word */ +#define L3_ESC(value) ((value) << 0) +#define L3_SCC(value) ((value) << 1) +#define _L3_CACHEABILITY(value) ((value) << 4) + +/* Helper defines */ +#define GEN9_NUM_MOCS_ENTRIES 62 /* 62 out of 64 - 63 & 64 are reserved. */ +#define GEN11_NUM_MOCS_ENTRIES 64 /* 63-64 are reserved, but configured. */ + +/* (e)LLC caching options */ +#define LE_0_PAGETABLE _LE_CACHEABILITY(0) +#define LE_1_UC _LE_CACHEABILITY(1) +#define LE_2_WT _LE_CACHEABILITY(2) +#define LE_3_WB _LE_CACHEABILITY(3) + +/* Target cache */ +#define LE_TC_0_PAGETABLE _LE_TGT_CACHE(0) +#define LE_TC_1_LLC _LE_TGT_CACHE(1) +#define LE_TC_2_LLC_ELLC _LE_TGT_CACHE(2) +#define LE_TC_3_LLC_ELLC_ALT _LE_TGT_CACHE(3) + +/* L3 caching options */ +#define L3_0_DIRECT _L3_CACHEABILITY(0) +#define L3_1_UC _L3_CACHEABILITY(1) +#define L3_2_RESERVED _L3_CACHEABILITY(2) +#define L3_3_WB _L3_CACHEABILITY(3) + +#define MOCS_ENTRY(__idx, __control_value, __l3cc_value) \ + [__idx] = { \ + .control_value = __control_value, \ + .l3cc_value = __l3cc_value, \ + .used = 1, \ + } + +/* + * MOCS tables + * + * These are the MOCS tables that are programmed across all the rings. + * The control value is programmed to all the rings that support the + * MOCS registers. While the l3cc_values are only programmed to the + * LNCFCMOCS0 - LNCFCMOCS32 registers. + * + * These tables are intended to be kept reasonably consistent across + * HW platforms, and for ICL+, be identical across OSes. To achieve + * that, for Icelake and above, list of entries is published as part + * of bspec. 
+ * + * Entries not part of the following tables are undefined as far as + * userspace is concerned and shouldn't be relied upon. For the time + * being they will be initialized to PTE. + * + * The last two entries are reserved by the hardware. For ICL+ they + * should be initialized according to bspec and never used, for older + * platforms they should never be written to. + * + * NOTE: These tables are part of bspec and defined as part of hardware + * interface for ICL+. For older platforms, they are part of kernel + * ABI. It is expected that, for specific hardware platform, existing + * entries will remain constant and the table will only be updated by + * adding new entries, filling unused positions. + */ +#define GEN9_MOCS_ENTRIES \ + MOCS_ENTRY(I915_MOCS_UNCACHED, \ + LE_1_UC | LE_TC_2_LLC_ELLC, \ + L3_1_UC), \ + MOCS_ENTRY(I915_MOCS_PTE, \ + LE_0_PAGETABLE | LE_TC_2_LLC_ELLC | LE_LRUM(3), \ + L3_3_WB) + +static const struct drm_i915_mocs_entry skylake_mocs_table[] = { + GEN9_MOCS_ENTRIES, + MOCS_ENTRY(I915_MOCS_CACHED, + LE_3_WB | LE_TC_2_LLC_ELLC | LE_LRUM(3), + L3_3_WB) +}; + +/* NOTE: the LE_TGT_CACHE is not used on Broxton */ +static const struct drm_i915_mocs_entry broxton_mocs_table[] = { + GEN9_MOCS_ENTRIES, + MOCS_ENTRY(I915_MOCS_CACHED, + LE_1_UC | LE_TC_2_LLC_ELLC | LE_LRUM(3), + L3_3_WB) +}; + +#define GEN11_MOCS_ENTRIES \ + /* Base - Uncached (Deprecated) */ \ + MOCS_ENTRY(I915_MOCS_UNCACHED, \ + LE_1_UC | LE_TC_1_LLC, \ + L3_1_UC), \ + /* Base - L3 + LeCC:PAT (Deprecated) */ \ + MOCS_ENTRY(I915_MOCS_PTE, \ + LE_0_PAGETABLE | LE_TC_1_LLC, \ + L3_3_WB), \ + /* Base - L3 + LLC */ \ + MOCS_ENTRY(2, \ + LE_3_WB | LE_TC_1_LLC | LE_LRUM(3), \ + L3_3_WB), \ + /* Base - Uncached */ \ + MOCS_ENTRY(3, \ + LE_1_UC | LE_TC_1_LLC, \ + L3_1_UC), \ + /* Base - L3 */ \ + MOCS_ENTRY(4, \ + LE_1_UC | LE_TC_1_LLC, \ + L3_3_WB), \ + /* Base - LLC */ \ + MOCS_ENTRY(5, \ + LE_3_WB | LE_TC_1_LLC | LE_LRUM(3), \ + L3_1_UC), \ + /* Age 0 - LLC */ \ + MOCS_ENTRY(6, \ + LE_3_WB | LE_TC_1_LLC | LE_LRUM(1), \ + L3_1_UC), \ + /* Age 0 - L3 + LLC */ \ + MOCS_ENTRY(7, \ + LE_3_WB | LE_TC_1_LLC | LE_LRUM(1), \ + L3_3_WB), \ + /* Age: Don't Chg. - LLC */ \ + MOCS_ENTRY(8, \ + LE_3_WB | LE_TC_1_LLC | LE_LRUM(2), \ + L3_1_UC), \ + /* Age: Don't Chg. 
- L3 + LLC */ \ + MOCS_ENTRY(9, \ + LE_3_WB | LE_TC_1_LLC | LE_LRUM(2), \ + L3_3_WB), \ + /* No AOM - LLC */ \ + MOCS_ENTRY(10, \ + LE_3_WB | LE_TC_1_LLC | LE_LRUM(3) | LE_AOM(1), \ + L3_1_UC), \ + /* No AOM - L3 + LLC */ \ + MOCS_ENTRY(11, \ + LE_3_WB | LE_TC_1_LLC | LE_LRUM(3) | LE_AOM(1), \ + L3_3_WB), \ + /* No AOM; Age 0 - LLC */ \ + MOCS_ENTRY(12, \ + LE_3_WB | LE_TC_1_LLC | LE_LRUM(1) | LE_AOM(1), \ + L3_1_UC), \ + /* No AOM; Age 0 - L3 + LLC */ \ + MOCS_ENTRY(13, \ + LE_3_WB | LE_TC_1_LLC | LE_LRUM(1) | LE_AOM(1), \ + L3_3_WB), \ + /* No AOM; Age:DC - LLC */ \ + MOCS_ENTRY(14, \ + LE_3_WB | LE_TC_1_LLC | LE_LRUM(2) | LE_AOM(1), \ + L3_1_UC), \ + /* No AOM; Age:DC - L3 + LLC */ \ + MOCS_ENTRY(15, \ + LE_3_WB | LE_TC_1_LLC | LE_LRUM(2) | LE_AOM(1), \ + L3_3_WB), \ + /* Bypass LLC - Uncached (EHL+) */ \ + MOCS_ENTRY(16, \ + LE_1_UC | LE_TC_1_LLC | LE_SCF(1), \ + L3_1_UC), \ + /* Bypass LLC - L3 (Read-Only) (EHL+) */ \ + MOCS_ENTRY(17, \ + LE_1_UC | LE_TC_1_LLC | LE_SCF(1), \ + L3_3_WB), \ + /* Self-Snoop - L3 + LLC */ \ + MOCS_ENTRY(18, \ + LE_3_WB | LE_TC_1_LLC | LE_LRUM(3) | LE_SSE(3), \ + L3_3_WB), \ + /* Skip Caching - L3 + LLC(12.5%) */ \ + MOCS_ENTRY(19, \ + LE_3_WB | LE_TC_1_LLC | LE_LRUM(3) | LE_SCC(7), \ + L3_3_WB), \ + /* Skip Caching - L3 + LLC(25%) */ \ + MOCS_ENTRY(20, \ + LE_3_WB | LE_TC_1_LLC | LE_LRUM(3) | LE_SCC(3), \ + L3_3_WB), \ + /* Skip Caching - L3 + LLC(50%) */ \ + MOCS_ENTRY(21, \ + LE_3_WB | LE_TC_1_LLC | LE_LRUM(3) | LE_SCC(1), \ + L3_3_WB), \ + /* Skip Caching - L3 + LLC(75%) */ \ + MOCS_ENTRY(22, \ + LE_3_WB | LE_TC_1_LLC | LE_LRUM(3) | LE_RSC(1) | LE_SCC(3), \ + L3_3_WB), \ + /* Skip Caching - L3 + LLC(87.5%) */ \ + MOCS_ENTRY(23, \ + LE_3_WB | LE_TC_1_LLC | LE_LRUM(3) | LE_RSC(1) | LE_SCC(7), \ + L3_3_WB), \ + /* HW Reserved - SW program but never use */ \ + MOCS_ENTRY(62, \ + LE_3_WB | LE_TC_1_LLC | LE_LRUM(3), \ + L3_1_UC), \ + /* HW Reserved - SW program but never use */ \ + MOCS_ENTRY(63, \ + LE_3_WB | LE_TC_1_LLC | LE_LRUM(3), \ + L3_1_UC) + +static const struct drm_i915_mocs_entry icelake_mocs_table[] = { + GEN11_MOCS_ENTRIES +}; + +/** + * get_mocs_settings() + * @dev_priv: i915 device. + * @table: Output table that will be made to point at appropriate + * MOCS values for the device. + * + * This function will return the values of the MOCS table that needs to + * be programmed for the platform. It will return the values that need + * to be programmed and if they need to be programmed. + * + * Return: true if there are applicable MOCS settings for the device. 
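+ * On Gen9 the chosen table is additionally sanity-checked against the WaDisableSkipCaching workaround (no L3 ESC/SCC bits may be set) before it is reported as usable.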
+ */ +static bool get_mocs_settings(struct drm_i915_private *dev_priv, + struct drm_i915_mocs_table *table) +{ + bool result = false; + + if (INTEL_GEN(dev_priv) >= 11) { + table->size = ARRAY_SIZE(icelake_mocs_table); + table->table = icelake_mocs_table; + table->n_entries = GEN11_NUM_MOCS_ENTRIES; + result = true; + } else if (IS_GEN9_BC(dev_priv) || IS_CANNONLAKE(dev_priv)) { + table->size = ARRAY_SIZE(skylake_mocs_table); + table->n_entries = GEN9_NUM_MOCS_ENTRIES; + table->table = skylake_mocs_table; + result = true; + } else if (IS_GEN9_LP(dev_priv)) { + table->size = ARRAY_SIZE(broxton_mocs_table); + table->n_entries = GEN9_NUM_MOCS_ENTRIES; + table->table = broxton_mocs_table; + result = true; + } else { + WARN_ONCE(INTEL_GEN(dev_priv) >= 9, + "Platform that should have a MOCS table does not.\n"); + } + + /* WaDisableSkipCaching:skl,bxt,kbl,glk */ + if (IS_GEN(dev_priv, 9)) { + int i; + + for (i = 0; i < table->size; i++) + if (WARN_ON(table->table[i].l3cc_value & + (L3_ESC(1) | L3_SCC(0x7)))) + return false; + } + + return result; +} + +static i915_reg_t mocs_register(enum intel_engine_id engine_id, int index) +{ + switch (engine_id) { + case RCS0: + return GEN9_GFX_MOCS(index); + case VCS0: + return GEN9_MFX0_MOCS(index); + case BCS0: + return GEN9_BLT_MOCS(index); + case VECS0: + return GEN9_VEBOX_MOCS(index); + case VCS1: + return GEN9_MFX1_MOCS(index); + case VCS2: + return GEN11_MFX2_MOCS(index); + default: + MISSING_CASE(engine_id); + return INVALID_MMIO_REG; + } +} + +/* + * Get control_value from MOCS entry taking into account when it's not used: + * I915_MOCS_PTE's value is returned in this case. + */ +static u32 get_entry_control(const struct drm_i915_mocs_table *table, + unsigned int index) +{ + if (table->table[index].used) + return table->table[index].control_value; + + return table->table[I915_MOCS_PTE].control_value; +} + +/** + * intel_mocs_init_engine() - emit the mocs control table + * @engine: The engine for whom to emit the registers. + * + * This function simply emits a MI_LOAD_REGISTER_IMM command for the + * given table starting at the given address. + */ +void intel_mocs_init_engine(struct intel_engine_cs *engine) +{ + struct drm_i915_private *dev_priv = engine->i915; + struct drm_i915_mocs_table table; + unsigned int index; + u32 unused_value; + + if (!get_mocs_settings(dev_priv, &table)) + return; + + /* Set unused values to PTE */ + unused_value = table.table[I915_MOCS_PTE].control_value; + + for (index = 0; index < table.size; index++) { + u32 value = get_entry_control(&table, index); + + I915_WRITE(mocs_register(engine->id, index), value); + } + + /* All remaining entries are also unused */ + for (; index < table.n_entries; index++) + I915_WRITE(mocs_register(engine->id, index), unused_value); +} + +/** + * emit_mocs_control_table() - emit the mocs control table + * @rq: Request to set up the MOCS table for. + * @table: The values to program into the control regs. + * + * This function simply emits a MI_LOAD_REGISTER_IMM command for the + * given table starting at the given address. + * + * Return: 0 on success, otherwise the error status. 
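+ * The emitted sequence is 2 + 2 * n_entries dwords long: one MI_LOAD_REGISTER_IMM header, an (offset, value) pair per entry with unused slots written with the I915_MOCS_PTE control value, and a trailing MI_NOOP padding the count to an even number of dwords.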
+ */ +static int emit_mocs_control_table(struct i915_request *rq, + const struct drm_i915_mocs_table *table) +{ + enum intel_engine_id engine = rq->engine->id; + unsigned int index; + u32 unused_value; + u32 *cs; + + if (GEM_WARN_ON(table->size > table->n_entries)) + return -ENODEV; + + /* Set unused values to PTE */ + unused_value = table->table[I915_MOCS_PTE].control_value; + + cs = intel_ring_begin(rq, 2 + 2 * table->n_entries); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + *cs++ = MI_LOAD_REGISTER_IMM(table->n_entries); + + for (index = 0; index < table->size; index++) { + u32 value = get_entry_control(table, index); + + *cs++ = i915_mmio_reg_offset(mocs_register(engine, index)); + *cs++ = value; + } + + /* All remaining entries are also unused */ + for (; index < table->n_entries; index++) { + *cs++ = i915_mmio_reg_offset(mocs_register(engine, index)); + *cs++ = unused_value; + } + + *cs++ = MI_NOOP; + intel_ring_advance(rq, cs); + + return 0; +} + +/* + * Get l3cc_value from MOCS entry taking into account when it's not used: + * I915_MOCS_PTE's value is returned in this case. + */ +static u16 get_entry_l3cc(const struct drm_i915_mocs_table *table, + unsigned int index) +{ + if (table->table[index].used) + return table->table[index].l3cc_value; + + return table->table[I915_MOCS_PTE].l3cc_value; +} + +static inline u32 l3cc_combine(const struct drm_i915_mocs_table *table, + u16 low, + u16 high) +{ + return low | high << 16; +} + +/** + * emit_mocs_l3cc_table() - emit the mocs control table + * @rq: Request to set up the MOCS table for. + * @table: The values to program into the control regs. + * + * This function simply emits a MI_LOAD_REGISTER_IMM command for the + * given table starting at the given address. This register set is + * programmed in pairs. + * + * Return: 0 on success, otherwise the error status. + */ +static int emit_mocs_l3cc_table(struct i915_request *rq, + const struct drm_i915_mocs_table *table) +{ + u16 unused_value; + unsigned int i; + u32 *cs; + + if (GEM_WARN_ON(table->size > table->n_entries)) + return -ENODEV; + + /* Set unused values to PTE */ + unused_value = table->table[I915_MOCS_PTE].l3cc_value; + + cs = intel_ring_begin(rq, 2 + table->n_entries); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + *cs++ = MI_LOAD_REGISTER_IMM(table->n_entries / 2); + + for (i = 0; i < table->size / 2; i++) { + u16 low = get_entry_l3cc(table, 2 * i); + u16 high = get_entry_l3cc(table, 2 * i + 1); + + *cs++ = i915_mmio_reg_offset(GEN9_LNCFCMOCS(i)); + *cs++ = l3cc_combine(table, low, high); + } + + /* Odd table size - 1 left over */ + if (table->size & 0x01) { + u16 low = get_entry_l3cc(table, 2 * i); + + *cs++ = i915_mmio_reg_offset(GEN9_LNCFCMOCS(i)); + *cs++ = l3cc_combine(table, low, unused_value); + i++; + } + + /* All remaining entries are also unused */ + for (; i < table->n_entries / 2; i++) { + *cs++ = i915_mmio_reg_offset(GEN9_LNCFCMOCS(i)); + *cs++ = l3cc_combine(table, unused_value, unused_value); + } + + *cs++ = MI_NOOP; + intel_ring_advance(rq, cs); + + return 0; +} + +/** + * intel_mocs_init_l3cc_table() - program the mocs control table + * @dev_priv: i915 device private + * + * This function simply programs the mocs registers for the given table + * starting at the given address. This register set is programmed in pairs. + * + * These registers may get programmed more than once, it is simpler to + * re-program 32 registers than maintain the state of when they were programmed. + * We are always reprogramming with the same values and this only on context + * start. 
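+ * Each GEN9_LNCFCMOCS register packs two l3cc entries (the even-indexed entry in the low 16 bits, the odd-indexed one in the high 16 bits), which is why the loops below walk the table in pairs.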
+ * + * Return: Nothing. + */ +void intel_mocs_init_l3cc_table(struct drm_i915_private *dev_priv) +{ + struct drm_i915_mocs_table table; + unsigned int i; + u16 unused_value; + + if (!get_mocs_settings(dev_priv, &table)) + return; + + /* Set unused values to PTE */ + unused_value = table.table[I915_MOCS_PTE].l3cc_value; + + for (i = 0; i < table.size / 2; i++) { + u16 low = get_entry_l3cc(&table, 2 * i); + u16 high = get_entry_l3cc(&table, 2 * i + 1); + + I915_WRITE(GEN9_LNCFCMOCS(i), + l3cc_combine(&table, low, high)); + } + + /* Odd table size - 1 left over */ + if (table.size & 0x01) { + u16 low = get_entry_l3cc(&table, 2 * i); + + I915_WRITE(GEN9_LNCFCMOCS(i), + l3cc_combine(&table, low, unused_value)); + i++; + } + + /* All remaining entries are also unused */ + for (; i < table.n_entries / 2; i++) + I915_WRITE(GEN9_LNCFCMOCS(i), + l3cc_combine(&table, unused_value, unused_value)); +} + +/** + * intel_rcs_context_init_mocs() - program the MOCS register. + * @rq: Request to set up the MOCS tables for. + * + * This function will emit a batch buffer with the values required for + * programming the MOCS register values for all the currently supported + * rings. + * + * These registers are partially stored in the RCS context, so they are + * emitted at the same time so that when a context is created these registers + * are set up. These registers have to be emitted into the start of the + * context as setting the ELSP will re-init some of these registers back + * to the hw values. + * + * Return: 0 on success, otherwise the error status. + */ +int intel_rcs_context_init_mocs(struct i915_request *rq) +{ + struct drm_i915_mocs_table t; + int ret; + + if (get_mocs_settings(rq->i915, &t)) { + /* Program the RCS control registers */ + ret = emit_mocs_control_table(rq, &t); + if (ret) + return ret; + + /* Now program the l3cc registers */ + ret = emit_mocs_l3cc_table(rq, &t); + if (ret) + return ret; + } + + return 0; +} diff --git a/drivers/gpu/drm/i915/gt/intel_mocs.h b/drivers/gpu/drm/i915/gt/intel_mocs.h new file mode 100644 index 000000000000..0913704a1af2 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_mocs.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef INTEL_MOCS_H +#define INTEL_MOCS_H + +/** + * DOC: Memory Objects Control State (MOCS) + * + * Motivation: + * In previous Gens the MOCS settings was a value that was set by user land as + * part of the batch. In Gen9 this has changed to be a single table (per ring) + * that all batches now reference by index instead of programming the MOCS + * directly. + * + * The one wrinkle in this is that only PART of the MOCS tables are included + * in context (The GFX_MOCS_0 - GFX_MOCS_64 and the LNCFCMOCS0 - LNCFCMOCS32 + * registers). The rest are not (the settings for the other rings). + * + * This table needs to be set at system start-up because the way the table + * interacts with the contexts and the GmmLib interface. + * + * + * Implementation: + * + * The tables (one per supported platform) are defined in intel_mocs.c + * and are programmed in the first batch after the context is loaded + * (with the hardware workarounds). This will then let the usual + * context handling keep the MOCS in step. + */ + +struct drm_i915_private; +struct i915_request; +struct intel_engine_cs; + +int intel_rcs_context_init_mocs(struct i915_request *rq); +void intel_mocs_init_l3cc_table(struct drm_i915_private *dev_priv); +void intel_mocs_init_engine(struct intel_engine_cs *engine); + +#endif diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c new file mode 100644 index 000000000000..4c478b38e420 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_reset.c @@ -0,0 +1,1470 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2008-2018 Intel Corporation + */ + +#include <linux/sched/mm.h> +#include <linux/stop_machine.h> + +#include "display/intel_overlay.h" + +#include "gem/i915_gem_context.h" + +#include "i915_drv.h" +#include "i915_gpu_error.h" +#include "i915_irq.h" +#include "intel_engine_pm.h" +#include "intel_gt_pm.h" +#include "intel_reset.h" + +#include "intel_guc.h" + +#define RESET_MAX_RETRIES 3 + +/* XXX How to handle concurrent GGTT updates using tiling registers? 
*/ +#define RESET_UNDER_STOP_MACHINE 0 + +static void rmw_set(struct intel_uncore *uncore, i915_reg_t reg, u32 set) +{ + intel_uncore_rmw(uncore, reg, 0, set); +} + +static void rmw_clear(struct intel_uncore *uncore, i915_reg_t reg, u32 clr) +{ + intel_uncore_rmw(uncore, reg, clr, 0); +} + +static void rmw_set_fw(struct intel_uncore *uncore, i915_reg_t reg, u32 set) +{ + intel_uncore_rmw_fw(uncore, reg, 0, set); +} + +static void rmw_clear_fw(struct intel_uncore *uncore, i915_reg_t reg, u32 clr) +{ + intel_uncore_rmw_fw(uncore, reg, clr, 0); +} + +static void engine_skip_context(struct i915_request *rq) +{ + struct intel_engine_cs *engine = rq->engine; + struct i915_gem_context *hung_ctx = rq->gem_context; + + lockdep_assert_held(&engine->active.lock); + + if (!i915_request_is_active(rq)) + return; + + list_for_each_entry_continue(rq, &engine->active.requests, sched.link) + if (rq->gem_context == hung_ctx) + i915_request_skip(rq, -EIO); +} + +static void client_mark_guilty(struct drm_i915_file_private *file_priv, + const struct i915_gem_context *ctx) +{ + unsigned int score; + unsigned long prev_hang; + + if (i915_gem_context_is_banned(ctx)) + score = I915_CLIENT_SCORE_CONTEXT_BAN; + else + score = 0; + + prev_hang = xchg(&file_priv->hang_timestamp, jiffies); + if (time_before(jiffies, prev_hang + I915_CLIENT_FAST_HANG_JIFFIES)) + score += I915_CLIENT_SCORE_HANG_FAST; + + if (score) { + atomic_add(score, &file_priv->ban_score); + + DRM_DEBUG_DRIVER("client %s: gained %u ban score, now %u\n", + ctx->name, score, + atomic_read(&file_priv->ban_score)); + } +} + +static bool context_mark_guilty(struct i915_gem_context *ctx) +{ + unsigned long prev_hang; + bool banned; + int i; + + atomic_inc(&ctx->guilty_count); + + /* Cool contexts are too cool to be banned! (Used for reset testing.) */ + if (!i915_gem_context_is_bannable(ctx)) + return false; + + /* Record the timestamp for the last N hangs */ + prev_hang = ctx->hang_timestamp[0]; + for (i = 0; i < ARRAY_SIZE(ctx->hang_timestamp) - 1; i++) + ctx->hang_timestamp[i] = ctx->hang_timestamp[i + 1]; + ctx->hang_timestamp[i] = jiffies; + + /* If we have hung N+1 times in rapid succession, we ban the context! */ + banned = !i915_gem_context_is_recoverable(ctx); + if (time_before(jiffies, prev_hang + CONTEXT_FAST_HANG_JIFFIES)) + banned = true; + if (banned) { + DRM_DEBUG_DRIVER("context %s: guilty %d, banned\n", + ctx->name, atomic_read(&ctx->guilty_count)); + i915_gem_context_set_banned(ctx); + } + + if (!IS_ERR_OR_NULL(ctx->file_priv)) + client_mark_guilty(ctx->file_priv, ctx); + + return banned; +} + +static void context_mark_innocent(struct i915_gem_context *ctx) +{ + atomic_inc(&ctx->active_count); +} + +void i915_reset_request(struct i915_request *rq, bool guilty) +{ + GEM_TRACE("%s rq=%llx:%lld, guilty? 
%s\n", + rq->engine->name, + rq->fence.context, + rq->fence.seqno, + yesno(guilty)); + + lockdep_assert_held(&rq->engine->active.lock); + GEM_BUG_ON(i915_request_completed(rq)); + + if (guilty) { + i915_request_skip(rq, -EIO); + if (context_mark_guilty(rq->gem_context)) + engine_skip_context(rq); + } else { + dma_fence_set_error(&rq->fence, -EAGAIN); + context_mark_innocent(rq->gem_context); + } +} + +static void gen3_stop_engine(struct intel_engine_cs *engine) +{ + struct intel_uncore *uncore = engine->uncore; + const u32 base = engine->mmio_base; + + GEM_TRACE("%s\n", engine->name); + + if (intel_engine_stop_cs(engine)) + GEM_TRACE("%s: timed out on STOP_RING\n", engine->name); + + intel_uncore_write_fw(uncore, + RING_HEAD(base), + intel_uncore_read_fw(uncore, RING_TAIL(base))); + intel_uncore_posting_read_fw(uncore, RING_HEAD(base)); /* paranoia */ + + intel_uncore_write_fw(uncore, RING_HEAD(base), 0); + intel_uncore_write_fw(uncore, RING_TAIL(base), 0); + intel_uncore_posting_read_fw(uncore, RING_TAIL(base)); + + /* The ring must be empty before it is disabled */ + intel_uncore_write_fw(uncore, RING_CTL(base), 0); + + /* Check acts as a post */ + if (intel_uncore_read_fw(uncore, RING_HEAD(base))) + GEM_TRACE("%s: ring head [%x] not parked\n", + engine->name, + intel_uncore_read_fw(uncore, RING_HEAD(base))); +} + +static void i915_stop_engines(struct drm_i915_private *i915, + intel_engine_mask_t engine_mask) +{ + struct intel_engine_cs *engine; + intel_engine_mask_t tmp; + + if (INTEL_GEN(i915) < 3) + return; + + for_each_engine_masked(engine, i915, engine_mask, tmp) + gen3_stop_engine(engine); +} + +static bool i915_in_reset(struct pci_dev *pdev) +{ + u8 gdrst; + + pci_read_config_byte(pdev, I915_GDRST, &gdrst); + return gdrst & GRDOM_RESET_STATUS; +} + +static int i915_do_reset(struct drm_i915_private *i915, + intel_engine_mask_t engine_mask, + unsigned int retry) +{ + struct pci_dev *pdev = i915->drm.pdev; + int err; + + /* Assert reset for at least 20 usec, and wait for acknowledgement. */ + pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE); + udelay(50); + err = wait_for_atomic(i915_in_reset(pdev), 50); + + /* Clear the reset request. 
*/ + pci_write_config_byte(pdev, I915_GDRST, 0); + udelay(50); + if (!err) + err = wait_for_atomic(!i915_in_reset(pdev), 50); + + return err; +} + +static bool g4x_reset_complete(struct pci_dev *pdev) +{ + u8 gdrst; + + pci_read_config_byte(pdev, I915_GDRST, &gdrst); + return (gdrst & GRDOM_RESET_ENABLE) == 0; +} + +static int g33_do_reset(struct drm_i915_private *i915, + intel_engine_mask_t engine_mask, + unsigned int retry) +{ + struct pci_dev *pdev = i915->drm.pdev; + + pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE); + return wait_for_atomic(g4x_reset_complete(pdev), 50); +} + +static int g4x_do_reset(struct drm_i915_private *i915, + intel_engine_mask_t engine_mask, + unsigned int retry) +{ + struct pci_dev *pdev = i915->drm.pdev; + struct intel_uncore *uncore = &i915->uncore; + int ret; + + /* WaVcpClkGateDisableForMediaReset:ctg,elk */ + rmw_set_fw(uncore, VDECCLK_GATE_D, VCP_UNIT_CLOCK_GATE_DISABLE); + intel_uncore_posting_read_fw(uncore, VDECCLK_GATE_D); + + pci_write_config_byte(pdev, I915_GDRST, + GRDOM_MEDIA | GRDOM_RESET_ENABLE); + ret = wait_for_atomic(g4x_reset_complete(pdev), 50); + if (ret) { + DRM_DEBUG_DRIVER("Wait for media reset failed\n"); + goto out; + } + + pci_write_config_byte(pdev, I915_GDRST, + GRDOM_RENDER | GRDOM_RESET_ENABLE); + ret = wait_for_atomic(g4x_reset_complete(pdev), 50); + if (ret) { + DRM_DEBUG_DRIVER("Wait for render reset failed\n"); + goto out; + } + +out: + pci_write_config_byte(pdev, I915_GDRST, 0); + + rmw_clear_fw(uncore, VDECCLK_GATE_D, VCP_UNIT_CLOCK_GATE_DISABLE); + intel_uncore_posting_read_fw(uncore, VDECCLK_GATE_D); + + return ret; +} + +static int ironlake_do_reset(struct drm_i915_private *i915, + intel_engine_mask_t engine_mask, + unsigned int retry) +{ + struct intel_uncore *uncore = &i915->uncore; + int ret; + + intel_uncore_write_fw(uncore, ILK_GDSR, + ILK_GRDOM_RENDER | ILK_GRDOM_RESET_ENABLE); + ret = __intel_wait_for_register_fw(uncore, ILK_GDSR, + ILK_GRDOM_RESET_ENABLE, 0, + 5000, 0, + NULL); + if (ret) { + DRM_DEBUG_DRIVER("Wait for render reset failed\n"); + goto out; + } + + intel_uncore_write_fw(uncore, ILK_GDSR, + ILK_GRDOM_MEDIA | ILK_GRDOM_RESET_ENABLE); + ret = __intel_wait_for_register_fw(uncore, ILK_GDSR, + ILK_GRDOM_RESET_ENABLE, 0, + 5000, 0, + NULL); + if (ret) { + DRM_DEBUG_DRIVER("Wait for media reset failed\n"); + goto out; + } + +out: + intel_uncore_write_fw(uncore, ILK_GDSR, 0); + intel_uncore_posting_read_fw(uncore, ILK_GDSR); + return ret; +} + +/* Reset the hardware domains (GENX_GRDOM_*) specified by mask */ +static int gen6_hw_domain_reset(struct drm_i915_private *i915, + u32 hw_domain_mask) +{ + struct intel_uncore *uncore = &i915->uncore; + int err; + + /* + * GEN6_GDRST is not in the gt power well, no need to check + * for fifo space for the write or forcewake the chip for + * the read + */ + intel_uncore_write_fw(uncore, GEN6_GDRST, hw_domain_mask); + + /* Wait for the device to ack the reset requests */ + err = __intel_wait_for_register_fw(uncore, + GEN6_GDRST, hw_domain_mask, 0, + 500, 0, + NULL); + if (err) + DRM_DEBUG_DRIVER("Wait for 0x%08x engines reset failed\n", + hw_domain_mask); + + return err; +} + +static int gen6_reset_engines(struct drm_i915_private *i915, + intel_engine_mask_t engine_mask, + unsigned int retry) +{ + struct intel_engine_cs *engine; + const u32 hw_engine_mask[] = { + [RCS0] = GEN6_GRDOM_RENDER, + [BCS0] = GEN6_GRDOM_BLT, + [VCS0] = GEN6_GRDOM_MEDIA, + [VCS1] = GEN8_GRDOM_MEDIA2, + [VECS0] = GEN6_GRDOM_VECS, + }; + u32 hw_mask; + + if (engine_mask == 
ALL_ENGINES) { + hw_mask = GEN6_GRDOM_FULL; + } else { + intel_engine_mask_t tmp; + + hw_mask = 0; + for_each_engine_masked(engine, i915, engine_mask, tmp) { + GEM_BUG_ON(engine->id >= ARRAY_SIZE(hw_engine_mask)); + hw_mask |= hw_engine_mask[engine->id]; + } + } + + return gen6_hw_domain_reset(i915, hw_mask); +} + +static u32 gen11_lock_sfc(struct intel_engine_cs *engine) +{ + struct intel_uncore *uncore = engine->uncore; + u8 vdbox_sfc_access = RUNTIME_INFO(engine->i915)->vdbox_sfc_access; + i915_reg_t sfc_forced_lock, sfc_forced_lock_ack; + u32 sfc_forced_lock_bit, sfc_forced_lock_ack_bit; + i915_reg_t sfc_usage; + u32 sfc_usage_bit; + u32 sfc_reset_bit; + + switch (engine->class) { + case VIDEO_DECODE_CLASS: + if ((BIT(engine->instance) & vdbox_sfc_access) == 0) + return 0; + + sfc_forced_lock = GEN11_VCS_SFC_FORCED_LOCK(engine); + sfc_forced_lock_bit = GEN11_VCS_SFC_FORCED_LOCK_BIT; + + sfc_forced_lock_ack = GEN11_VCS_SFC_LOCK_STATUS(engine); + sfc_forced_lock_ack_bit = GEN11_VCS_SFC_LOCK_ACK_BIT; + + sfc_usage = GEN11_VCS_SFC_LOCK_STATUS(engine); + sfc_usage_bit = GEN11_VCS_SFC_USAGE_BIT; + sfc_reset_bit = GEN11_VCS_SFC_RESET_BIT(engine->instance); + break; + + case VIDEO_ENHANCEMENT_CLASS: + sfc_forced_lock = GEN11_VECS_SFC_FORCED_LOCK(engine); + sfc_forced_lock_bit = GEN11_VECS_SFC_FORCED_LOCK_BIT; + + sfc_forced_lock_ack = GEN11_VECS_SFC_LOCK_ACK(engine); + sfc_forced_lock_ack_bit = GEN11_VECS_SFC_LOCK_ACK_BIT; + + sfc_usage = GEN11_VECS_SFC_USAGE(engine); + sfc_usage_bit = GEN11_VECS_SFC_USAGE_BIT; + sfc_reset_bit = GEN11_VECS_SFC_RESET_BIT(engine->instance); + break; + + default: + return 0; + } + + /* + * Tell the engine that a software reset is going to happen. The engine + * will then try to force lock the SFC (if currently locked, it will + * remain so until we tell the engine it is safe to unlock; if currently + * unlocked, it will ignore this and all new lock requests). If SFC + * ends up being locked to the engine we want to reset, we have to reset + * it as well (we will unlock it once the reset sequence is completed). 
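+ * The value returned is the GDRST reset bit to OR into the engine reset mask when the SFC turns out to be in use by this engine, or 0 when no SFC reset is required.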
+ */ + rmw_set_fw(uncore, sfc_forced_lock, sfc_forced_lock_bit); + + if (__intel_wait_for_register_fw(uncore, + sfc_forced_lock_ack, + sfc_forced_lock_ack_bit, + sfc_forced_lock_ack_bit, + 1000, 0, NULL)) { + DRM_DEBUG_DRIVER("Wait for SFC forced lock ack failed\n"); + return 0; + } + + if (intel_uncore_read_fw(uncore, sfc_usage) & sfc_usage_bit) + return sfc_reset_bit; + + return 0; +} + +static void gen11_unlock_sfc(struct intel_engine_cs *engine) +{ + struct intel_uncore *uncore = engine->uncore; + u8 vdbox_sfc_access = RUNTIME_INFO(engine->i915)->vdbox_sfc_access; + i915_reg_t sfc_forced_lock; + u32 sfc_forced_lock_bit; + + switch (engine->class) { + case VIDEO_DECODE_CLASS: + if ((BIT(engine->instance) & vdbox_sfc_access) == 0) + return; + + sfc_forced_lock = GEN11_VCS_SFC_FORCED_LOCK(engine); + sfc_forced_lock_bit = GEN11_VCS_SFC_FORCED_LOCK_BIT; + break; + + case VIDEO_ENHANCEMENT_CLASS: + sfc_forced_lock = GEN11_VECS_SFC_FORCED_LOCK(engine); + sfc_forced_lock_bit = GEN11_VECS_SFC_FORCED_LOCK_BIT; + break; + + default: + return; + } + + rmw_clear_fw(uncore, sfc_forced_lock, sfc_forced_lock_bit); +} + +static int gen11_reset_engines(struct drm_i915_private *i915, + intel_engine_mask_t engine_mask, + unsigned int retry) +{ + const u32 hw_engine_mask[] = { + [RCS0] = GEN11_GRDOM_RENDER, + [BCS0] = GEN11_GRDOM_BLT, + [VCS0] = GEN11_GRDOM_MEDIA, + [VCS1] = GEN11_GRDOM_MEDIA2, + [VCS2] = GEN11_GRDOM_MEDIA3, + [VCS3] = GEN11_GRDOM_MEDIA4, + [VECS0] = GEN11_GRDOM_VECS, + [VECS1] = GEN11_GRDOM_VECS2, + }; + struct intel_engine_cs *engine; + intel_engine_mask_t tmp; + u32 hw_mask; + int ret; + + if (engine_mask == ALL_ENGINES) { + hw_mask = GEN11_GRDOM_FULL; + } else { + hw_mask = 0; + for_each_engine_masked(engine, i915, engine_mask, tmp) { + GEM_BUG_ON(engine->id >= ARRAY_SIZE(hw_engine_mask)); + hw_mask |= hw_engine_mask[engine->id]; + hw_mask |= gen11_lock_sfc(engine); + } + } + + ret = gen6_hw_domain_reset(i915, hw_mask); + + if (engine_mask != ALL_ENGINES) + for_each_engine_masked(engine, i915, engine_mask, tmp) + gen11_unlock_sfc(engine); + + return ret; +} + +static int gen8_engine_reset_prepare(struct intel_engine_cs *engine) +{ + struct intel_uncore *uncore = engine->uncore; + const i915_reg_t reg = RING_RESET_CTL(engine->mmio_base); + u32 request, mask, ack; + int ret; + + ack = intel_uncore_read_fw(uncore, reg); + if (ack & RESET_CTL_CAT_ERROR) { + /* + * For catastrophic errors, ready-for-reset sequence + * needs to be bypassed: HAS#396813 + */ + request = RESET_CTL_CAT_ERROR; + mask = RESET_CTL_CAT_ERROR; + + /* Catastrophic errors need to be cleared by HW */ + ack = 0; + } else if (!(ack & RESET_CTL_READY_TO_RESET)) { + request = RESET_CTL_REQUEST_RESET; + mask = RESET_CTL_READY_TO_RESET; + ack = RESET_CTL_READY_TO_RESET; + } else { + return 0; + } + + intel_uncore_write_fw(uncore, reg, _MASKED_BIT_ENABLE(request)); + ret = __intel_wait_for_register_fw(uncore, reg, mask, ack, + 700, 0, NULL); + if (ret) + DRM_ERROR("%s reset request timed out: {request: %08x, RESET_CTL: %08x}\n", + engine->name, request, + intel_uncore_read_fw(uncore, reg)); + + return ret; +} + +static void gen8_engine_reset_cancel(struct intel_engine_cs *engine) +{ + intel_uncore_write_fw(engine->uncore, + RING_RESET_CTL(engine->mmio_base), + _MASKED_BIT_DISABLE(RESET_CTL_REQUEST_RESET)); +} + +static int gen8_reset_engines(struct drm_i915_private *i915, + intel_engine_mask_t engine_mask, + unsigned int retry) +{ + struct intel_engine_cs *engine; + const bool reset_non_ready = retry >= 1; + 
intel_engine_mask_t tmp; + int ret; + + for_each_engine_masked(engine, i915, engine_mask, tmp) { + ret = gen8_engine_reset_prepare(engine); + if (ret && !reset_non_ready) + goto skip_reset; + + /* + * If this is not the first failed attempt to prepare, + * we decide to proceed anyway. + * + * By doing so we risk context corruption and with + * some gens (kbl), possible system hang if reset + * happens during active bb execution. + * + * We rather take context corruption instead of + * failed reset with a wedged driver/gpu. And + * active bb execution case should be covered by + * i915_stop_engines we have before the reset. + */ + } + + if (INTEL_GEN(i915) >= 11) + ret = gen11_reset_engines(i915, engine_mask, retry); + else + ret = gen6_reset_engines(i915, engine_mask, retry); + +skip_reset: + for_each_engine_masked(engine, i915, engine_mask, tmp) + gen8_engine_reset_cancel(engine); + + return ret; +} + +typedef int (*reset_func)(struct drm_i915_private *, + intel_engine_mask_t engine_mask, + unsigned int retry); + +static reset_func intel_get_gpu_reset(struct drm_i915_private *i915) +{ + if (INTEL_GEN(i915) >= 8) + return gen8_reset_engines; + else if (INTEL_GEN(i915) >= 6) + return gen6_reset_engines; + else if (INTEL_GEN(i915) >= 5) + return ironlake_do_reset; + else if (IS_G4X(i915)) + return g4x_do_reset; + else if (IS_G33(i915) || IS_PINEVIEW(i915)) + return g33_do_reset; + else if (INTEL_GEN(i915) >= 3) + return i915_do_reset; + else + return NULL; +} + +int intel_gpu_reset(struct drm_i915_private *i915, + intel_engine_mask_t engine_mask) +{ + const int retries = engine_mask == ALL_ENGINES ? RESET_MAX_RETRIES : 1; + reset_func reset; + int ret = -ETIMEDOUT; + int retry; + + reset = intel_get_gpu_reset(i915); + if (!reset) + return -ENODEV; + + /* + * If the power well sleeps during the reset, the reset + * request may be dropped and never completes (causing -EIO). + */ + intel_uncore_forcewake_get(&i915->uncore, FORCEWAKE_ALL); + for (retry = 0; ret == -ETIMEDOUT && retry < retries; retry++) { + /* + * We stop engines, otherwise we might get failed reset and a + * dead gpu (on elk). Also as modern gpu as kbl can suffer + * from system hang if batchbuffer is progressing when + * the reset is issued, regardless of READY_TO_RESET ack. + * Thus assume it is best to stop engines on all gens + * where we have a gpu reset. + * + * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES) + * + * WaMediaResetMainRingCleanup:ctg,elk (presumably) + * + * FIXME: Wa for more modern gens needs to be validated + */ + if (retry) + i915_stop_engines(i915, engine_mask); + + GEM_TRACE("engine_mask=%x\n", engine_mask); + preempt_disable(); + ret = reset(i915, engine_mask, retry); + preempt_enable(); + } + intel_uncore_forcewake_put(&i915->uncore, FORCEWAKE_ALL); + + return ret; +} + +bool intel_has_gpu_reset(struct drm_i915_private *i915) +{ + if (!i915_modparams.reset) + return NULL; + + return intel_get_gpu_reset(i915); +} + +bool intel_has_reset_engine(struct drm_i915_private *i915) +{ + return INTEL_INFO(i915)->has_reset_engine && i915_modparams.reset >= 2; +} + +int intel_reset_guc(struct drm_i915_private *i915) +{ + u32 guc_domain = + INTEL_GEN(i915) >= 11 ? GEN11_GRDOM_GUC : GEN9_GRDOM_GUC; + int ret; + + GEM_BUG_ON(!HAS_GUC(i915)); + + intel_uncore_forcewake_get(&i915->uncore, FORCEWAKE_ALL); + ret = gen6_hw_domain_reset(i915, guc_domain); + intel_uncore_forcewake_put(&i915->uncore, FORCEWAKE_ALL); + + return ret; +} + +/* + * Ensure irq handler finishes, and not run again. 
+ * Also return the active request so that we only search for it once. + */ +static void reset_prepare_engine(struct intel_engine_cs *engine) +{ + /* + * During the reset sequence, we must prevent the engine from + * entering RC6. As the context state is undefined until we restart + * the engine, if it does enter RC6 during the reset, the state + * written to the powercontext is undefined and so we may lose + * GPU state upon resume, i.e. fail to restart after a reset. + */ + intel_engine_pm_get(engine); + intel_uncore_forcewake_get(engine->uncore, FORCEWAKE_ALL); + engine->reset.prepare(engine); +} + +static void revoke_mmaps(struct drm_i915_private *i915) +{ + int i; + + for (i = 0; i < i915->ggtt.num_fences; i++) { + struct drm_vma_offset_node *node; + struct i915_vma *vma; + u64 vma_offset; + + vma = READ_ONCE(i915->ggtt.fence_regs[i].vma); + if (!vma) + continue; + + if (!i915_vma_has_userfault(vma)) + continue; + + GEM_BUG_ON(vma->fence != &i915->ggtt.fence_regs[i]); + node = &vma->obj->base.vma_node; + vma_offset = vma->ggtt_view.partial.offset << PAGE_SHIFT; + unmap_mapping_range(i915->drm.anon_inode->i_mapping, + drm_vma_node_offset_addr(node) + vma_offset, + vma->size, + 1); + } +} + +static void reset_prepare(struct drm_i915_private *i915) +{ + struct intel_engine_cs *engine; + enum intel_engine_id id; + + intel_gt_pm_get(i915); + for_each_engine(engine, i915, id) + reset_prepare_engine(engine); + + intel_uc_reset_prepare(i915); +} + +static void gt_revoke(struct drm_i915_private *i915) +{ + revoke_mmaps(i915); +} + +static int gt_reset(struct drm_i915_private *i915, + intel_engine_mask_t stalled_mask) +{ + struct intel_engine_cs *engine; + enum intel_engine_id id; + int err; + + /* + * Everything depends on having the GTT running, so we need to start + * there. 
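+ * (Hence the GGTT is re-enabled first, before the per-engine reset handlers run and the fence registers are restored below.)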
+ */ + err = i915_ggtt_enable_hw(i915); + if (err) + return err; + + for_each_engine(engine, i915, id) + intel_engine_reset(engine, stalled_mask & engine->mask); + + i915_gem_restore_fences(i915); + + return err; +} + +static void reset_finish_engine(struct intel_engine_cs *engine) +{ + engine->reset.finish(engine); + intel_engine_pm_put(engine); + intel_uncore_forcewake_put(engine->uncore, FORCEWAKE_ALL); +} + +static void reset_finish(struct drm_i915_private *i915) +{ + struct intel_engine_cs *engine; + enum intel_engine_id id; + + for_each_engine(engine, i915, id) { + reset_finish_engine(engine); + intel_engine_signal_breadcrumbs(engine); + } + intel_gt_pm_put(i915); +} + +static void nop_submit_request(struct i915_request *request) +{ + struct intel_engine_cs *engine = request->engine; + unsigned long flags; + + GEM_TRACE("%s fence %llx:%lld -> -EIO\n", + engine->name, request->fence.context, request->fence.seqno); + dma_fence_set_error(&request->fence, -EIO); + + spin_lock_irqsave(&engine->active.lock, flags); + __i915_request_submit(request); + i915_request_mark_complete(request); + spin_unlock_irqrestore(&engine->active.lock, flags); + + intel_engine_queue_breadcrumbs(engine); +} + +static void __i915_gem_set_wedged(struct drm_i915_private *i915) +{ + struct i915_gpu_error *error = &i915->gpu_error; + struct intel_engine_cs *engine; + enum intel_engine_id id; + + if (test_bit(I915_WEDGED, &error->flags)) + return; + + if (GEM_SHOW_DEBUG() && !intel_engines_are_idle(i915)) { + struct drm_printer p = drm_debug_printer(__func__); + + for_each_engine(engine, i915, id) + intel_engine_dump(engine, &p, "%s\n", engine->name); + } + + GEM_TRACE("start\n"); + + /* + * First, stop submission to hw, but do not yet complete requests by + * rolling the global seqno forward (since this would complete requests + * for which we haven't set the fence error to EIO yet). + */ + reset_prepare(i915); + + /* Even if the GPU reset fails, it should still stop the engines */ + if (!INTEL_INFO(i915)->gpu_reset_clobbers_display) + intel_gpu_reset(i915, ALL_ENGINES); + + for_each_engine(engine, i915, id) { + engine->submit_request = nop_submit_request; + engine->schedule = NULL; + } + i915->caps.scheduler = 0; + + /* + * Make sure no request can slip through without getting completed by + * either this call here to intel_engine_write_global_seqno, or the one + * in nop_submit_request. + */ + synchronize_rcu_expedited(); + set_bit(I915_WEDGED, &error->flags); + + /* Mark all executing requests as skipped */ + for_each_engine(engine, i915, id) + engine->cancel_requests(engine); + + reset_finish(i915); + + GEM_TRACE("end\n"); +} + +void i915_gem_set_wedged(struct drm_i915_private *i915) +{ + struct i915_gpu_error *error = &i915->gpu_error; + intel_wakeref_t wakeref; + + mutex_lock(&error->wedge_mutex); + with_intel_runtime_pm(&i915->runtime_pm, wakeref) + __i915_gem_set_wedged(i915); + mutex_unlock(&error->wedge_mutex); +} + +static bool __i915_gem_unset_wedged(struct drm_i915_private *i915) +{ + struct i915_gpu_error *error = &i915->gpu_error; + struct i915_timeline *tl; + + if (!test_bit(I915_WEDGED, &error->flags)) + return true; + + if (!i915->gt.scratch) /* Never full initialised, recovery impossible */ + return false; + + GEM_TRACE("start\n"); + + /* + * Before unwedging, make sure that all pending operations + * are flushed and errored out - we may have requests waiting upon + * third party fences. 
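As a side note on nop_submit_request() and __i915_gem_set_wedged() above: "wedging" works by swapping the engine's submission hook for one that immediately completes each request with -EIO, so nothing new ever reaches the hardware. A tiny userspace sketch of that pointer-swap pattern follows, with entirely hypothetical types (struct fake_engine, set_wedged); it is an illustration of the idea, not the driver's locking or lifetime rules.

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

struct request {
	int error;
	bool completed;
};

struct fake_engine {
	void (*submit)(struct fake_engine *e, struct request *rq);
};

static void real_submit(struct fake_engine *e, struct request *rq)
{
	(void)e;
	rq->completed = true;		/* normally: hand the work to hardware */
}

static void nop_submit(struct fake_engine *e, struct request *rq)
{
	(void)e;
	rq->error = -EIO;		/* wedged: fail the request instead of running it */
	rq->completed = true;
}

static void set_wedged(struct fake_engine *e)
{
	e->submit = nop_submit;		/* from now on, submission only reports errors */
}

int main(void)
{
	struct fake_engine engine = { .submit = real_submit };
	struct request rq = { 0 };

	set_wedged(&engine);
	engine.submit(&engine, &rq);
	printf("completed=%d error=%d\n", rq.completed, rq.error);
	return 0;
}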
We marked all inflight requests as EIO, and + * every execbuf since returned EIO, for consistency we want all + * the currently pending requests to also be marked as EIO, which + * is done inside our nop_submit_request - and so we must wait. + * + * No more can be submitted until we reset the wedged bit. + */ + mutex_lock(&i915->gt.timelines.mutex); + list_for_each_entry(tl, &i915->gt.timelines.active_list, link) { + struct i915_request *rq; + + rq = i915_active_request_get_unlocked(&tl->last_request); + if (!rq) + continue; + + /* + * All internal dependencies (i915_requests) will have + * been flushed by the set-wedge, but we may be stuck waiting + * for external fences. These should all be capped to 10s + * (I915_FENCE_TIMEOUT) so this wait should not be unbounded + * in the worst case. + */ + dma_fence_default_wait(&rq->fence, false, MAX_SCHEDULE_TIMEOUT); + i915_request_put(rq); + } + mutex_unlock(&i915->gt.timelines.mutex); + + intel_gt_sanitize(i915, false); + + /* + * Undo nop_submit_request. We prevent all new i915 requests from + * being queued (by disallowing execbuf whilst wedged) so having + * waited for all active requests above, we know the system is idle + * and do not have to worry about a thread being inside + * engine->submit_request() as we swap over. So unlike installing + * the nop_submit_request on reset, we can do this from normal + * context and do not require stop_machine(). + */ + intel_engines_reset_default_submission(i915); + + GEM_TRACE("end\n"); + + smp_mb__before_atomic(); /* complete takeover before enabling execbuf */ + clear_bit(I915_WEDGED, &i915->gpu_error.flags); + + return true; +} + +bool i915_gem_unset_wedged(struct drm_i915_private *i915) +{ + struct i915_gpu_error *error = &i915->gpu_error; + bool result; + + mutex_lock(&error->wedge_mutex); + result = __i915_gem_unset_wedged(i915); + mutex_unlock(&error->wedge_mutex); + + return result; +} + +static int do_reset(struct drm_i915_private *i915, + intel_engine_mask_t stalled_mask) +{ + int err, i; + + gt_revoke(i915); + + err = intel_gpu_reset(i915, ALL_ENGINES); + for (i = 0; err && i < RESET_MAX_RETRIES; i++) { + msleep(10 * (i + 1)); + err = intel_gpu_reset(i915, ALL_ENGINES); + } + if (err) + return err; + + return gt_reset(i915, stalled_mask); +} + +/** + * i915_reset - reset chip after a hang + * @i915: #drm_i915_private to reset + * @stalled_mask: mask of the stalled engines with the guilty requests + * @reason: user error message for why we are resetting + * + * Reset the chip. Useful if a hang is detected. Marks the device as wedged + * on failure. + * + * Procedure is fairly simple: + * - reset the chip using the reset reg + * - re-init context state + * - re-init hardware status page + * - re-init ring buffer + * - re-init interrupt state + * - re-init display + */ +void i915_reset(struct drm_i915_private *i915, + intel_engine_mask_t stalled_mask, + const char *reason) +{ + struct i915_gpu_error *error = &i915->gpu_error; + int ret; + + GEM_TRACE("flags=%lx\n", error->flags); + + might_sleep(); + GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &error->flags)); + mutex_lock(&error->wedge_mutex); + + /* Clear any previous failed attempts at recovery. Time to try again. 
*/ + if (!__i915_gem_unset_wedged(i915)) + goto unlock; + + if (reason) + dev_notice(i915->drm.dev, "Resetting chip for %s\n", reason); + error->reset_count++; + + reset_prepare(i915); + + if (!intel_has_gpu_reset(i915)) { + if (i915_modparams.reset) + dev_err(i915->drm.dev, "GPU reset not supported\n"); + else + DRM_DEBUG_DRIVER("GPU reset disabled\n"); + goto error; + } + + if (INTEL_INFO(i915)->gpu_reset_clobbers_display) + intel_runtime_pm_disable_interrupts(i915); + + if (do_reset(i915, stalled_mask)) { + dev_err(i915->drm.dev, "Failed to reset chip\n"); + goto taint; + } + + if (INTEL_INFO(i915)->gpu_reset_clobbers_display) + intel_runtime_pm_enable_interrupts(i915); + + intel_overlay_reset(i915); + + /* + * Next we need to restore the context, but we don't use those + * yet either... + * + * Ring buffer needs to be re-initialized in the KMS case, or if X + * was running at the time of the reset (i.e. we weren't VT + * switched away). + */ + ret = i915_gem_init_hw(i915); + if (ret) { + DRM_ERROR("Failed to initialise HW following reset (%d)\n", + ret); + goto error; + } + + i915_queue_hangcheck(i915); + +finish: + reset_finish(i915); +unlock: + mutex_unlock(&error->wedge_mutex); + return; + +taint: + /* + * History tells us that if we cannot reset the GPU now, we + * never will. This then impacts everything that is run + * subsequently. On failing the reset, we mark the driver + * as wedged, preventing further execution on the GPU. + * We also want to go one step further and add a taint to the + * kernel so that any subsequent faults can be traced back to + * this failure. This is important for CI, where if the + * GPU/driver fails we would like to reboot and restart testing + * rather than continue on into oblivion. For everyone else, + * the system should still plod along, but they have been warned! + */ + add_taint_for_CI(TAINT_WARN); +error: + __i915_gem_set_wedged(i915); + goto finish; +} + +static inline int intel_gt_reset_engine(struct drm_i915_private *i915, + struct intel_engine_cs *engine) +{ + return intel_gpu_reset(i915, engine->mask); +} + +/** + * i915_reset_engine - reset GPU engine to recover from a hang + * @engine: engine to reset + * @msg: reason for GPU reset; or NULL for no dev_notice() + * + * Reset a specific GPU engine. Useful if a hang is detected. + * Returns zero on successful reset or otherwise an error code. + * + * Procedure is: + * - identifies the request that caused the hang and it is dropped + * - reset engine (which will force the engine to idle) + * - re-init/configure engine + */ +int i915_reset_engine(struct intel_engine_cs *engine, const char *msg) +{ + struct i915_gpu_error *error = &engine->i915->gpu_error; + int ret; + + GEM_TRACE("%s flags=%lx\n", engine->name, error->flags); + GEM_BUG_ON(!test_bit(I915_RESET_ENGINE + engine->id, &error->flags)); + + if (!intel_wakeref_active(&engine->wakeref)) + return 0; + + reset_prepare_engine(engine); + + if (msg) + dev_notice(engine->i915->drm.dev, + "Resetting %s for %s\n", engine->name, msg); + error->reset_engine_count[engine->id]++; + + if (!engine->i915->guc.execbuf_client) + ret = intel_gt_reset_engine(engine->i915, engine); + else + ret = intel_guc_reset_engine(&engine->i915->guc, engine); + if (ret) { + /* If we fail here, we expect to fallback to a global reset */ + DRM_DEBUG_DRIVER("%sFailed to reset %s, ret=%d\n", + engine->i915->guc.execbuf_client ? 
"GuC " : "", + engine->name, ret); + goto out; + } + + /* + * The request that caused the hang is stuck on elsp, we know the + * active request and can drop it, adjust head to skip the offending + * request to resume executing remaining requests in the queue. + */ + intel_engine_reset(engine, true); + + /* + * The engine and its registers (and workarounds in case of render) + * have been reset to their default values. Follow the init_ring + * process to program RING_MODE, HWSP and re-enable submission. + */ + ret = engine->resume(engine); + if (ret) + goto out; + +out: + intel_engine_cancel_stop_cs(engine); + reset_finish_engine(engine); + return ret; +} + +static void i915_reset_device(struct drm_i915_private *i915, + u32 engine_mask, + const char *reason) +{ + struct i915_gpu_error *error = &i915->gpu_error; + struct kobject *kobj = &i915->drm.primary->kdev->kobj; + char *error_event[] = { I915_ERROR_UEVENT "=1", NULL }; + char *reset_event[] = { I915_RESET_UEVENT "=1", NULL }; + char *reset_done_event[] = { I915_ERROR_UEVENT "=0", NULL }; + struct i915_wedge_me w; + + kobject_uevent_env(kobj, KOBJ_CHANGE, error_event); + + DRM_DEBUG_DRIVER("resetting chip\n"); + kobject_uevent_env(kobj, KOBJ_CHANGE, reset_event); + + /* Use a watchdog to ensure that our reset completes */ + i915_wedge_on_timeout(&w, i915, 5 * HZ) { + intel_prepare_reset(i915); + + /* Flush everyone using a resource about to be clobbered */ + synchronize_srcu_expedited(&error->reset_backoff_srcu); + + i915_reset(i915, engine_mask, reason); + + intel_finish_reset(i915); + } + + if (!test_bit(I915_WEDGED, &error->flags)) + kobject_uevent_env(kobj, KOBJ_CHANGE, reset_done_event); +} + +static void clear_register(struct intel_uncore *uncore, i915_reg_t reg) +{ + intel_uncore_rmw(uncore, reg, 0, 0); +} + +static void gen8_clear_engine_error_register(struct intel_engine_cs *engine) +{ + GEN6_RING_FAULT_REG_RMW(engine, RING_FAULT_VALID, 0); + GEN6_RING_FAULT_REG_POSTING_READ(engine); +} + +static void clear_error_registers(struct drm_i915_private *i915, + intel_engine_mask_t engine_mask) +{ + struct intel_uncore *uncore = &i915->uncore; + u32 eir; + + if (!IS_GEN(i915, 2)) + clear_register(uncore, PGTBL_ER); + + if (INTEL_GEN(i915) < 4) + clear_register(uncore, IPEIR(RENDER_RING_BASE)); + else + clear_register(uncore, IPEIR_I965); + + clear_register(uncore, EIR); + eir = intel_uncore_read(uncore, EIR); + if (eir) { + /* + * some errors might have become stuck, + * mask them. + */ + DRM_DEBUG_DRIVER("EIR stuck: 0x%08x, masking\n", eir); + rmw_set(uncore, EMR, eir); + intel_uncore_write(uncore, GEN2_IIR, + I915_MASTER_ERROR_INTERRUPT); + } + + if (INTEL_GEN(i915) >= 8) { + rmw_clear(uncore, GEN8_RING_FAULT_REG, RING_FAULT_VALID); + intel_uncore_posting_read(uncore, GEN8_RING_FAULT_REG); + } else if (INTEL_GEN(i915) >= 6) { + struct intel_engine_cs *engine; + enum intel_engine_id id; + + for_each_engine_masked(engine, i915, engine_mask, id) + gen8_clear_engine_error_register(engine); + } +} + +static void gen6_check_faults(struct drm_i915_private *dev_priv) +{ + struct intel_engine_cs *engine; + enum intel_engine_id id; + u32 fault; + + for_each_engine(engine, dev_priv, id) { + fault = GEN6_RING_FAULT_REG_READ(engine); + if (fault & RING_FAULT_VALID) { + DRM_DEBUG_DRIVER("Unexpected fault\n" + "\tAddr: 0x%08lx\n" + "\tAddress space: %s\n" + "\tSource ID: %d\n" + "\tType: %d\n", + fault & PAGE_MASK, + fault & RING_FAULT_GTTSEL_MASK ? 
"GGTT" : "PPGTT", + RING_FAULT_SRCID(fault), + RING_FAULT_FAULT_TYPE(fault)); + } + } +} + +static void gen8_check_faults(struct drm_i915_private *dev_priv) +{ + u32 fault = I915_READ(GEN8_RING_FAULT_REG); + + if (fault & RING_FAULT_VALID) { + u32 fault_data0, fault_data1; + u64 fault_addr; + + fault_data0 = I915_READ(GEN8_FAULT_TLB_DATA0); + fault_data1 = I915_READ(GEN8_FAULT_TLB_DATA1); + fault_addr = ((u64)(fault_data1 & FAULT_VA_HIGH_BITS) << 44) | + ((u64)fault_data0 << 12); + + DRM_DEBUG_DRIVER("Unexpected fault\n" + "\tAddr: 0x%08x_%08x\n" + "\tAddress space: %s\n" + "\tEngine ID: %d\n" + "\tSource ID: %d\n" + "\tType: %d\n", + upper_32_bits(fault_addr), + lower_32_bits(fault_addr), + fault_data1 & FAULT_GTT_SEL ? "GGTT" : "PPGTT", + GEN8_RING_FAULT_ENGINE_ID(fault), + RING_FAULT_SRCID(fault), + RING_FAULT_FAULT_TYPE(fault)); + } +} + +void i915_check_and_clear_faults(struct drm_i915_private *i915) +{ + /* From GEN8 onwards we only have one 'All Engine Fault Register' */ + if (INTEL_GEN(i915) >= 8) + gen8_check_faults(i915); + else if (INTEL_GEN(i915) >= 6) + gen6_check_faults(i915); + else + return; + + clear_error_registers(i915, ALL_ENGINES); +} + +/** + * i915_handle_error - handle a gpu error + * @i915: i915 device private + * @engine_mask: mask representing engines that are hung + * @flags: control flags + * @fmt: Error message format string + * + * Do some basic checking of register state at error time and + * dump it to the syslog. Also call i915_capture_error_state() to make + * sure we get a record and make it available in debugfs. Fire a uevent + * so userspace knows something bad happened (should trigger collection + * of a ring dump etc.). + */ +void i915_handle_error(struct drm_i915_private *i915, + intel_engine_mask_t engine_mask, + unsigned long flags, + const char *fmt, ...) +{ + struct i915_gpu_error *error = &i915->gpu_error; + struct intel_engine_cs *engine; + intel_wakeref_t wakeref; + intel_engine_mask_t tmp; + char error_msg[80]; + char *msg = NULL; + + if (fmt) { + va_list args; + + va_start(args, fmt); + vscnprintf(error_msg, sizeof(error_msg), fmt, args); + va_end(args); + + msg = error_msg; + } + + /* + * In most cases it's guaranteed that we get here with an RPM + * reference held, for example because there is a pending GPU + * request that won't finish until the reset is done. This + * isn't the case at least when we get here by doing a + * simulated reset via debugfs, so get an RPM reference. + */ + wakeref = intel_runtime_pm_get(&i915->runtime_pm); + + engine_mask &= INTEL_INFO(i915)->engine_mask; + + if (flags & I915_ERROR_CAPTURE) { + i915_capture_error_state(i915, engine_mask, msg); + clear_error_registers(i915, engine_mask); + } + + /* + * Try engine reset when available. We fall back to full reset if + * single reset fails. + */ + if (intel_has_reset_engine(i915) && !__i915_wedged(error)) { + for_each_engine_masked(engine, i915, engine_mask, tmp) { + BUILD_BUG_ON(I915_RESET_MODESET >= I915_RESET_ENGINE); + if (test_and_set_bit(I915_RESET_ENGINE + engine->id, + &error->flags)) + continue; + + if (i915_reset_engine(engine, msg) == 0) + engine_mask &= ~engine->mask; + + clear_bit(I915_RESET_ENGINE + engine->id, + &error->flags); + wake_up_bit(&error->flags, + I915_RESET_ENGINE + engine->id); + } + } + + if (!engine_mask) + goto out; + + /* Full reset needs the mutex, stop any other user trying to do so. 
*/ + if (test_and_set_bit(I915_RESET_BACKOFF, &error->flags)) { + wait_event(error->reset_queue, + !test_bit(I915_RESET_BACKOFF, &error->flags)); + goto out; /* piggy-back on the other reset */ + } + + /* Make sure i915_reset_trylock() sees the I915_RESET_BACKOFF */ + synchronize_rcu_expedited(); + + /* Prevent any other reset-engine attempt. */ + for_each_engine(engine, i915, tmp) { + while (test_and_set_bit(I915_RESET_ENGINE + engine->id, + &error->flags)) + wait_on_bit(&error->flags, + I915_RESET_ENGINE + engine->id, + TASK_UNINTERRUPTIBLE); + } + + i915_reset_device(i915, engine_mask, msg); + + for_each_engine(engine, i915, tmp) { + clear_bit(I915_RESET_ENGINE + engine->id, + &error->flags); + } + + clear_bit(I915_RESET_BACKOFF, &error->flags); + wake_up_all(&error->reset_queue); + +out: + intel_runtime_pm_put(&i915->runtime_pm, wakeref); +} + +int i915_reset_trylock(struct drm_i915_private *i915) +{ + struct i915_gpu_error *error = &i915->gpu_error; + int srcu; + + might_lock(&error->reset_backoff_srcu); + might_sleep(); + + rcu_read_lock(); + while (test_bit(I915_RESET_BACKOFF, &error->flags)) { + rcu_read_unlock(); + + if (wait_event_interruptible(error->reset_queue, + !test_bit(I915_RESET_BACKOFF, + &error->flags))) + return -EINTR; + + rcu_read_lock(); + } + srcu = srcu_read_lock(&error->reset_backoff_srcu); + rcu_read_unlock(); + + return srcu; +} + +void i915_reset_unlock(struct drm_i915_private *i915, int tag) +__releases(&i915->gpu_error.reset_backoff_srcu) +{ + struct i915_gpu_error *error = &i915->gpu_error; + + srcu_read_unlock(&error->reset_backoff_srcu, tag); +} + +int i915_terminally_wedged(struct drm_i915_private *i915) +{ + struct i915_gpu_error *error = &i915->gpu_error; + + might_sleep(); + + if (!__i915_wedged(error)) + return 0; + + /* Reset still in progress? Maybe we will recover? */ + if (!test_bit(I915_RESET_BACKOFF, &error->flags)) + return -EIO; + + /* XXX intel_reset_finish() still takes struct_mutex!!! */ + if (mutex_is_locked(&i915->drm.struct_mutex)) + return -EAGAIN; + + if (wait_event_interruptible(error->reset_queue, + !test_bit(I915_RESET_BACKOFF, + &error->flags))) + return -EINTR; + + return __i915_wedged(error) ? 
-EIO : 0; +} + +static void i915_wedge_me(struct work_struct *work) +{ + struct i915_wedge_me *w = container_of(work, typeof(*w), work.work); + + dev_err(w->i915->drm.dev, + "%s timed out, cancelling all in-flight rendering.\n", + w->name); + i915_gem_set_wedged(w->i915); +} + +void __i915_init_wedge(struct i915_wedge_me *w, + struct drm_i915_private *i915, + long timeout, + const char *name) +{ + w->i915 = i915; + w->name = name; + + INIT_DELAYED_WORK_ONSTACK(&w->work, i915_wedge_me); + schedule_delayed_work(&w->work, timeout); +} + +void __i915_fini_wedge(struct i915_wedge_me *w) +{ + cancel_delayed_work_sync(&w->work); + destroy_delayed_work_on_stack(&w->work); + w->i915 = NULL; +} + +#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) +#include "selftest_reset.c" +#endif diff --git a/drivers/gpu/drm/i915/gt/intel_reset.h b/drivers/gpu/drm/i915/gt/intel_reset.h new file mode 100644 index 000000000000..580ebdb59eca --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_reset.h @@ -0,0 +1,68 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2008-2018 Intel Corporation + */ + +#ifndef I915_RESET_H +#define I915_RESET_H + +#include <linux/compiler.h> +#include <linux/types.h> +#include <linux/srcu.h> + +#include "gt/intel_engine_types.h" + +struct drm_i915_private; +struct i915_request; +struct intel_engine_cs; +struct intel_guc; + +__printf(4, 5) +void i915_handle_error(struct drm_i915_private *i915, + intel_engine_mask_t engine_mask, + unsigned long flags, + const char *fmt, ...); +#define I915_ERROR_CAPTURE BIT(0) + +void i915_check_and_clear_faults(struct drm_i915_private *i915); + +void i915_reset(struct drm_i915_private *i915, + intel_engine_mask_t stalled_mask, + const char *reason); +int i915_reset_engine(struct intel_engine_cs *engine, + const char *reason); + +void i915_reset_request(struct i915_request *rq, bool guilty); + +int __must_check i915_reset_trylock(struct drm_i915_private *i915); +void i915_reset_unlock(struct drm_i915_private *i915, int tag); + +int i915_terminally_wedged(struct drm_i915_private *i915); + +bool intel_has_gpu_reset(struct drm_i915_private *i915); +bool intel_has_reset_engine(struct drm_i915_private *i915); + +int intel_gpu_reset(struct drm_i915_private *i915, + intel_engine_mask_t engine_mask); + +int intel_reset_guc(struct drm_i915_private *i915); + +struct i915_wedge_me { + struct delayed_work work; + struct drm_i915_private *i915; + const char *name; +}; + +void __i915_init_wedge(struct i915_wedge_me *w, + struct drm_i915_private *i915, + long timeout, + const char *name); +void __i915_fini_wedge(struct i915_wedge_me *w); + +#define i915_wedge_on_timeout(W, DEV, TIMEOUT) \ + for (__i915_init_wedge((W), (DEV), (TIMEOUT), __func__); \ + (W)->i915; \ + __i915_fini_wedge((W))) + +#endif /* I915_RESET_H */ diff --git a/drivers/gpu/drm/i915/gt/intel_ringbuffer.c b/drivers/gpu/drm/i915/gt/intel_ringbuffer.c new file mode 100644 index 000000000000..c6023bc9452d --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_ringbuffer.c @@ -0,0 +1,2301 @@ +/* + * Copyright © 2008-2010 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above 
copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * Authors: + * Eric Anholt <eric@anholt.net> + * Zou Nan hai <nanhai.zou@intel.com> + * Xiang Hai hao<haihao.xiang@intel.com> + * + */ + +#include <linux/log2.h> + +#include <drm/i915_drm.h> + +#include "gem/i915_gem_context.h" + +#include "i915_drv.h" +#include "i915_gem_render_state.h" +#include "i915_trace.h" +#include "intel_context.h" +#include "intel_reset.h" +#include "intel_workarounds.h" + +/* Rough estimate of the typical request size, performing a flush, + * set-context and then emitting the batch. + */ +#define LEGACY_REQUEST_SIZE 200 + +unsigned int intel_ring_update_space(struct intel_ring *ring) +{ + unsigned int space; + + space = __intel_ring_space(ring->head, ring->emit, ring->size); + + ring->space = space; + return space; +} + +static int +gen2_render_ring_flush(struct i915_request *rq, u32 mode) +{ + unsigned int num_store_dw; + u32 cmd, *cs; + + cmd = MI_FLUSH; + num_store_dw = 0; + if (mode & EMIT_INVALIDATE) + cmd |= MI_READ_FLUSH; + if (mode & EMIT_FLUSH) + num_store_dw = 4; + + cs = intel_ring_begin(rq, 2 + 3 * num_store_dw); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + *cs++ = cmd; + while (num_store_dw--) { + *cs++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL; + *cs++ = i915_scratch_offset(rq->i915); + *cs++ = 0; + } + *cs++ = MI_FLUSH | MI_NO_WRITE_FLUSH; + + intel_ring_advance(rq, cs); + + return 0; +} + +static int +gen4_render_ring_flush(struct i915_request *rq, u32 mode) +{ + u32 cmd, *cs; + int i; + + /* + * read/write caches: + * + * I915_GEM_DOMAIN_RENDER is always invalidated, but is + * only flushed if MI_NO_WRITE_FLUSH is unset. On 965, it is + * also flushed at 2d versus 3d pipeline switches. + * + * read-only caches: + * + * I915_GEM_DOMAIN_SAMPLER is flushed on pre-965 if + * MI_READ_FLUSH is set, and is always flushed on 965. + * + * I915_GEM_DOMAIN_COMMAND may not exist? + * + * I915_GEM_DOMAIN_INSTRUCTION, which exists on 965, is + * invalidated when MI_EXE_FLUSH is set. + * + * I915_GEM_DOMAIN_VERTEX, which exists on 965, is + * invalidated with every MI_FLUSH. + * + * TLBs: + * + * On 965, TLBs associated with I915_GEM_DOMAIN_COMMAND + * and I915_GEM_DOMAIN_CPU in are invalidated at PTE write and + * I915_GEM_DOMAIN_RENDER and I915_GEM_DOMAIN_SAMPLER + * are flushed at any MI_FLUSH. + */ + + cmd = MI_FLUSH; + if (mode & EMIT_INVALIDATE) { + cmd |= MI_EXE_FLUSH; + if (IS_G4X(rq->i915) || IS_GEN(rq->i915, 5)) + cmd |= MI_INVALIDATE_ISP; + } + + i = 2; + if (mode & EMIT_INVALIDATE) + i += 20; + + cs = intel_ring_begin(rq, i); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + *cs++ = cmd; + + /* + * A random delay to let the CS invalidate take effect? Without this + * delay, the GPU relocation path fails as the CS does not see + * the updated contents. Just as important, if we apply the flushes + * to the EMIT_FLUSH branch (i.e. 
immediately after the relocation + * write and before the invalidate on the next batch), the relocations + * still fail. This implies that is a delay following invalidation + * that is required to reset the caches as opposed to a delay to + * ensure the memory is written. + */ + if (mode & EMIT_INVALIDATE) { + *cs++ = GFX_OP_PIPE_CONTROL(4) | PIPE_CONTROL_QW_WRITE; + *cs++ = i915_scratch_offset(rq->i915) | PIPE_CONTROL_GLOBAL_GTT; + *cs++ = 0; + *cs++ = 0; + + for (i = 0; i < 12; i++) + *cs++ = MI_FLUSH; + + *cs++ = GFX_OP_PIPE_CONTROL(4) | PIPE_CONTROL_QW_WRITE; + *cs++ = i915_scratch_offset(rq->i915) | PIPE_CONTROL_GLOBAL_GTT; + *cs++ = 0; + *cs++ = 0; + } + + *cs++ = cmd; + + intel_ring_advance(rq, cs); + + return 0; +} + +/* + * Emits a PIPE_CONTROL with a non-zero post-sync operation, for + * implementing two workarounds on gen6. From section 1.4.7.1 + * "PIPE_CONTROL" of the Sandy Bridge PRM volume 2 part 1: + * + * [DevSNB-C+{W/A}] Before any depth stall flush (including those + * produced by non-pipelined state commands), software needs to first + * send a PIPE_CONTROL with no bits set except Post-Sync Operation != + * 0. + * + * [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush Enable + * =1, a PIPE_CONTROL with any non-zero post-sync-op is required. + * + * And the workaround for these two requires this workaround first: + * + * [Dev-SNB{W/A}]: Pipe-control with CS-stall bit set must be sent + * BEFORE the pipe-control with a post-sync op and no write-cache + * flushes. + * + * And this last workaround is tricky because of the requirements on + * that bit. From section 1.4.7.2.3 "Stall" of the Sandy Bridge PRM + * volume 2 part 1: + * + * "1 of the following must also be set: + * - Render Target Cache Flush Enable ([12] of DW1) + * - Depth Cache Flush Enable ([0] of DW1) + * - Stall at Pixel Scoreboard ([1] of DW1) + * - Depth Stall ([13] of DW1) + * - Post-Sync Operation ([13] of DW1) + * - Notify Enable ([8] of DW1)" + * + * The cache flushes require the workaround flush that triggered this + * one, so we can't use it. Depth stall would trigger the same. + * Post-sync nonzero is what triggered this second workaround, so we + * can't use that one either. Notify enable is IRQs, which aren't + * really our business. That leaves only stall at scoreboard. + */ +static int +gen6_emit_post_sync_nonzero_flush(struct i915_request *rq) +{ + u32 scratch_addr = i915_scratch_offset(rq->i915) + 2 * CACHELINE_BYTES; + u32 *cs; + + cs = intel_ring_begin(rq, 6); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + *cs++ = GFX_OP_PIPE_CONTROL(5); + *cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD; + *cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT; + *cs++ = 0; /* low dword */ + *cs++ = 0; /* high dword */ + *cs++ = MI_NOOP; + intel_ring_advance(rq, cs); + + cs = intel_ring_begin(rq, 6); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + *cs++ = GFX_OP_PIPE_CONTROL(5); + *cs++ = PIPE_CONTROL_QW_WRITE; + *cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT; + *cs++ = 0; + *cs++ = 0; + *cs++ = MI_NOOP; + intel_ring_advance(rq, cs); + + return 0; +} + +static int +gen6_render_ring_flush(struct i915_request *rq, u32 mode) +{ + u32 scratch_addr = i915_scratch_offset(rq->i915) + 2 * CACHELINE_BYTES; + u32 *cs, flags = 0; + int ret; + + /* Force SNB workarounds for PIPE_CONTROL flushes */ + ret = gen6_emit_post_sync_nonzero_flush(rq); + if (ret) + return ret; + + /* Just flush everything. 
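As a side note on the emitters above: every flush and breadcrumb routine in this file reserves a run of dwords with intel_ring_begin(), writes them through a cursor, and then advances the ring. The stripped-down sketch below shows only that cursor style; the opcode macros and emit_flush() are invented for the example and do not claim to be the hardware's real command encoding.

#include <stdint.h>
#include <stdio.h>

#define FAKE_PIPE_CONTROL(len)	(0x7a000000u | ((len) - 2))
#define FAKE_QW_WRITE		(1u << 14)

static uint32_t *emit_flush(uint32_t *cs, uint32_t scratch_addr)
{
	*cs++ = FAKE_PIPE_CONTROL(4);	/* header: command plus payload length */
	*cs++ = FAKE_QW_WRITE;		/* flags dword */
	*cs++ = scratch_addr;		/* post-sync write target */
	*cs++ = 0;
	return cs;			/* caller advances the ring by the same amount */
}

int main(void)
{
	uint32_t buf[8];
	uint32_t *end = emit_flush(buf, 0x1000);

	printf("emitted %td dwords\n", end - buf);
	return 0;
}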
Experiments have shown that reducing the + * number of bits based on the write domains has little performance + * impact. + */ + if (mode & EMIT_FLUSH) { + flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; + flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; + /* + * Ensure that any following seqno writes only happen + * when the render cache is indeed flushed. + */ + flags |= PIPE_CONTROL_CS_STALL; + } + if (mode & EMIT_INVALIDATE) { + flags |= PIPE_CONTROL_TLB_INVALIDATE; + flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; + flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; + flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; + flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; + flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; + /* + * TLB invalidate requires a post-sync write. + */ + flags |= PIPE_CONTROL_QW_WRITE | PIPE_CONTROL_CS_STALL; + } + + cs = intel_ring_begin(rq, 4); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + *cs++ = GFX_OP_PIPE_CONTROL(4); + *cs++ = flags; + *cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT; + *cs++ = 0; + intel_ring_advance(rq, cs); + + return 0; +} + +static u32 *gen6_rcs_emit_breadcrumb(struct i915_request *rq, u32 *cs) +{ + /* First we do the gen6_emit_post_sync_nonzero_flush w/a */ + *cs++ = GFX_OP_PIPE_CONTROL(4); + *cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD; + *cs++ = 0; + *cs++ = 0; + + *cs++ = GFX_OP_PIPE_CONTROL(4); + *cs++ = PIPE_CONTROL_QW_WRITE; + *cs++ = i915_scratch_offset(rq->i915) | PIPE_CONTROL_GLOBAL_GTT; + *cs++ = 0; + + /* Finally we can flush and with it emit the breadcrumb */ + *cs++ = GFX_OP_PIPE_CONTROL(4); + *cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | + PIPE_CONTROL_DEPTH_CACHE_FLUSH | + PIPE_CONTROL_DC_FLUSH_ENABLE | + PIPE_CONTROL_QW_WRITE | + PIPE_CONTROL_CS_STALL); + *cs++ = rq->timeline->hwsp_offset | PIPE_CONTROL_GLOBAL_GTT; + *cs++ = rq->fence.seqno; + + *cs++ = MI_USER_INTERRUPT; + *cs++ = MI_NOOP; + + rq->tail = intel_ring_offset(rq, cs); + assert_ring_tail_valid(rq->ring, rq->tail); + + return cs; +} + +static int +gen7_render_ring_cs_stall_wa(struct i915_request *rq) +{ + u32 *cs; + + cs = intel_ring_begin(rq, 4); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + *cs++ = GFX_OP_PIPE_CONTROL(4); + *cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD; + *cs++ = 0; + *cs++ = 0; + intel_ring_advance(rq, cs); + + return 0; +} + +static int +gen7_render_ring_flush(struct i915_request *rq, u32 mode) +{ + u32 scratch_addr = i915_scratch_offset(rq->i915) + 2 * CACHELINE_BYTES; + u32 *cs, flags = 0; + + /* + * Ensure that any following seqno writes only happen when the render + * cache is indeed flushed. + * + * Workaround: 4th PIPE_CONTROL command (except the ones with only + * read-cache invalidate bits set) must have the CS_STALL bit set. We + * don't try to be clever and just set it unconditionally. + */ + flags |= PIPE_CONTROL_CS_STALL; + + /* Just flush everything. Experiments have shown that reducing the + * number of bits based on the write domains has little performance + * impact. 
+ */ + if (mode & EMIT_FLUSH) { + flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; + flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; + flags |= PIPE_CONTROL_DC_FLUSH_ENABLE; + flags |= PIPE_CONTROL_FLUSH_ENABLE; + } + if (mode & EMIT_INVALIDATE) { + flags |= PIPE_CONTROL_TLB_INVALIDATE; + flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; + flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; + flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; + flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; + flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; + flags |= PIPE_CONTROL_MEDIA_STATE_CLEAR; + /* + * TLB invalidate requires a post-sync write. + */ + flags |= PIPE_CONTROL_QW_WRITE; + flags |= PIPE_CONTROL_GLOBAL_GTT_IVB; + + flags |= PIPE_CONTROL_STALL_AT_SCOREBOARD; + + /* Workaround: we must issue a pipe_control with CS-stall bit + * set before a pipe_control command that has the state cache + * invalidate bit set. */ + gen7_render_ring_cs_stall_wa(rq); + } + + cs = intel_ring_begin(rq, 4); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + *cs++ = GFX_OP_PIPE_CONTROL(4); + *cs++ = flags; + *cs++ = scratch_addr; + *cs++ = 0; + intel_ring_advance(rq, cs); + + return 0; +} + +static u32 *gen7_rcs_emit_breadcrumb(struct i915_request *rq, u32 *cs) +{ + *cs++ = GFX_OP_PIPE_CONTROL(4); + *cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | + PIPE_CONTROL_DEPTH_CACHE_FLUSH | + PIPE_CONTROL_DC_FLUSH_ENABLE | + PIPE_CONTROL_FLUSH_ENABLE | + PIPE_CONTROL_QW_WRITE | + PIPE_CONTROL_GLOBAL_GTT_IVB | + PIPE_CONTROL_CS_STALL); + *cs++ = rq->timeline->hwsp_offset; + *cs++ = rq->fence.seqno; + + *cs++ = MI_USER_INTERRUPT; + *cs++ = MI_NOOP; + + rq->tail = intel_ring_offset(rq, cs); + assert_ring_tail_valid(rq->ring, rq->tail); + + return cs; +} + +static u32 *gen6_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs) +{ + GEM_BUG_ON(rq->timeline->hwsp_ggtt != rq->engine->status_page.vma); + GEM_BUG_ON(offset_in_page(rq->timeline->hwsp_offset) != I915_GEM_HWS_SEQNO_ADDR); + + *cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX; + *cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT; + *cs++ = rq->fence.seqno; + + *cs++ = MI_USER_INTERRUPT; + + rq->tail = intel_ring_offset(rq, cs); + assert_ring_tail_valid(rq->ring, rq->tail); + + return cs; +} + +#define GEN7_XCS_WA 32 +static u32 *gen7_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs) +{ + int i; + + GEM_BUG_ON(rq->timeline->hwsp_ggtt != rq->engine->status_page.vma); + GEM_BUG_ON(offset_in_page(rq->timeline->hwsp_offset) != I915_GEM_HWS_SEQNO_ADDR); + + *cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX; + *cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT; + *cs++ = rq->fence.seqno; + + for (i = 0; i < GEN7_XCS_WA; i++) { + *cs++ = MI_STORE_DWORD_INDEX; + *cs++ = I915_GEM_HWS_SEQNO_ADDR; + *cs++ = rq->fence.seqno; + } + + *cs++ = MI_FLUSH_DW; + *cs++ = 0; + *cs++ = 0; + + *cs++ = MI_USER_INTERRUPT; + *cs++ = MI_NOOP; + + rq->tail = intel_ring_offset(rq, cs); + assert_ring_tail_valid(rq->ring, rq->tail); + + return cs; +} +#undef GEN7_XCS_WA + +static void set_hwstam(struct intel_engine_cs *engine, u32 mask) +{ + /* + * Keep the render interrupt unmasked as this papers over + * lost interrupts following a reset. 
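As a side note on the *_emit_breadcrumb() helpers above: they write rq->fence.seqno into the status page, and whether a given breadcrumb has been reached is then decided with wraparound-safe 32-bit serial-number arithmetic. A self-contained illustration of that comparison follows; seqno_passed here is a hypothetical stand-in rather than the driver's own helper.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* True if a is at or after b, even across u32 wraparound. */
static bool seqno_passed(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) >= 0;
}

int main(void)
{
	assert(seqno_passed(5, 3));
	assert(!seqno_passed(3, 5));
	/* still correct across the 2^32 boundary */
	assert(seqno_passed(2, 0xfffffffeu));
	return 0;
}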
+ */ + if (engine->class == RENDER_CLASS) { + if (INTEL_GEN(engine->i915) >= 6) + mask &= ~BIT(0); + else + mask &= ~I915_USER_INTERRUPT; + } + + intel_engine_set_hwsp_writemask(engine, mask); +} + +static void set_hws_pga(struct intel_engine_cs *engine, phys_addr_t phys) +{ + struct drm_i915_private *dev_priv = engine->i915; + u32 addr; + + addr = lower_32_bits(phys); + if (INTEL_GEN(dev_priv) >= 4) + addr |= (phys >> 28) & 0xf0; + + I915_WRITE(HWS_PGA, addr); +} + +static struct page *status_page(struct intel_engine_cs *engine) +{ + struct drm_i915_gem_object *obj = engine->status_page.vma->obj; + + GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj)); + return sg_page(obj->mm.pages->sgl); +} + +static void ring_setup_phys_status_page(struct intel_engine_cs *engine) +{ + set_hws_pga(engine, PFN_PHYS(page_to_pfn(status_page(engine)))); + set_hwstam(engine, ~0u); +} + +static void set_hwsp(struct intel_engine_cs *engine, u32 offset) +{ + struct drm_i915_private *dev_priv = engine->i915; + i915_reg_t hwsp; + + /* + * The ring status page addresses are no longer next to the rest of + * the ring registers as of gen7. + */ + if (IS_GEN(dev_priv, 7)) { + switch (engine->id) { + /* + * No more rings exist on Gen7. Default case is only to shut up + * gcc switch check warning. + */ + default: + GEM_BUG_ON(engine->id); + /* fallthrough */ + case RCS0: + hwsp = RENDER_HWS_PGA_GEN7; + break; + case BCS0: + hwsp = BLT_HWS_PGA_GEN7; + break; + case VCS0: + hwsp = BSD_HWS_PGA_GEN7; + break; + case VECS0: + hwsp = VEBOX_HWS_PGA_GEN7; + break; + } + } else if (IS_GEN(dev_priv, 6)) { + hwsp = RING_HWS_PGA_GEN6(engine->mmio_base); + } else { + hwsp = RING_HWS_PGA(engine->mmio_base); + } + + I915_WRITE(hwsp, offset); + POSTING_READ(hwsp); +} + +static void flush_cs_tlb(struct intel_engine_cs *engine) +{ + struct drm_i915_private *dev_priv = engine->i915; + + if (!IS_GEN_RANGE(dev_priv, 6, 7)) + return; + + /* ring should be idle before issuing a sync flush*/ + WARN_ON((ENGINE_READ(engine, RING_MI_MODE) & MODE_IDLE) == 0); + + ENGINE_WRITE(engine, RING_INSTPM, + _MASKED_BIT_ENABLE(INSTPM_TLB_INVALIDATE | + INSTPM_SYNC_FLUSH)); + if (intel_wait_for_register(engine->uncore, + RING_INSTPM(engine->mmio_base), + INSTPM_SYNC_FLUSH, 0, + 1000)) + DRM_ERROR("%s: wait for SyncFlush to complete for TLB invalidation timed out\n", + engine->name); +} + +static void ring_setup_status_page(struct intel_engine_cs *engine) +{ + set_hwsp(engine, i915_ggtt_offset(engine->status_page.vma)); + set_hwstam(engine, ~0u); + + flush_cs_tlb(engine); +} + +static bool stop_ring(struct intel_engine_cs *engine) +{ + struct drm_i915_private *dev_priv = engine->i915; + + if (INTEL_GEN(dev_priv) > 2) { + ENGINE_WRITE(engine, + RING_MI_MODE, _MASKED_BIT_ENABLE(STOP_RING)); + if (intel_wait_for_register(engine->uncore, + RING_MI_MODE(engine->mmio_base), + MODE_IDLE, + MODE_IDLE, + 1000)) { + DRM_ERROR("%s : timed out trying to stop ring\n", + engine->name); + + /* + * Sometimes we observe that the idle flag is not + * set even though the ring is empty. So double + * check before giving up. 
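As a side note on stop_ring() and flush_cs_tlb() above: both spin on a register until a bit settles or a timeout expires. Below is a generic userspace version of that bounded poll loop, under the assumption that a plain pointer stands in for the MMIO register; poll_until is a hypothetical helper, not the driver's intel_wait_for_register().

#include <errno.h>
#include <stdint.h>
#include <unistd.h>

static int poll_until(const volatile uint32_t *reg, uint32_t mask,
		      uint32_t value, unsigned int timeout_ms)
{
	unsigned int waited_us = 0;

	while ((*reg & mask) != value) {
		if (waited_us >= timeout_ms * 1000)
			return -ETIMEDOUT;
		usleep(100);		/* back off between reads */
		waited_us += 100;
	}
	return 0;
}

int main(void)
{
	volatile uint32_t fake_reg = 0x200;	/* the "idle" bit is already set */

	return poll_until(&fake_reg, 0x200, 0x200, 1000) ? 1 : 0;
}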
+ */ + if (ENGINE_READ(engine, RING_HEAD) != + ENGINE_READ(engine, RING_TAIL)) + return false; + } + } + + ENGINE_WRITE(engine, RING_HEAD, ENGINE_READ(engine, RING_TAIL)); + + ENGINE_WRITE(engine, RING_HEAD, 0); + ENGINE_WRITE(engine, RING_TAIL, 0); + + /* The ring must be empty before it is disabled */ + ENGINE_WRITE(engine, RING_CTL, 0); + + return (ENGINE_READ(engine, RING_HEAD) & HEAD_ADDR) == 0; +} + +static int xcs_resume(struct intel_engine_cs *engine) +{ + struct drm_i915_private *dev_priv = engine->i915; + struct intel_ring *ring = engine->buffer; + int ret = 0; + + GEM_TRACE("%s: ring:{HEAD:%04x, TAIL:%04x}\n", + engine->name, ring->head, ring->tail); + + intel_uncore_forcewake_get(engine->uncore, FORCEWAKE_ALL); + + if (!stop_ring(engine)) { + /* G45 ring initialization often fails to reset head to zero */ + DRM_DEBUG_DRIVER("%s head not reset to zero " + "ctl %08x head %08x tail %08x start %08x\n", + engine->name, + ENGINE_READ(engine, RING_CTL), + ENGINE_READ(engine, RING_HEAD), + ENGINE_READ(engine, RING_TAIL), + ENGINE_READ(engine, RING_START)); + + if (!stop_ring(engine)) { + DRM_ERROR("failed to set %s head to zero " + "ctl %08x head %08x tail %08x start %08x\n", + engine->name, + ENGINE_READ(engine, RING_CTL), + ENGINE_READ(engine, RING_HEAD), + ENGINE_READ(engine, RING_TAIL), + ENGINE_READ(engine, RING_START)); + ret = -EIO; + goto out; + } + } + + if (HWS_NEEDS_PHYSICAL(dev_priv)) + ring_setup_phys_status_page(engine); + else + ring_setup_status_page(engine); + + intel_engine_reset_breadcrumbs(engine); + + /* Enforce ordering by reading HEAD register back */ + ENGINE_READ(engine, RING_HEAD); + + /* Initialize the ring. This must happen _after_ we've cleared the ring + * registers with the above sequence (the readback of the HEAD registers + * also enforces ordering), otherwise the hw might lose the new ring + * register values. */ + ENGINE_WRITE(engine, RING_START, i915_ggtt_offset(ring->vma)); + + /* WaClearRingBufHeadRegAtInit:ctg,elk */ + if (ENGINE_READ(engine, RING_HEAD)) + DRM_DEBUG_DRIVER("%s initialization failed [head=%08x], fudging\n", + engine->name, ENGINE_READ(engine, RING_HEAD)); + + /* Check that the ring offsets point within the ring! */ + GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->head)); + GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail)); + intel_ring_update_space(ring); + + /* First wake the ring up to an empty/idle ring */ + ENGINE_WRITE(engine, RING_HEAD, ring->head); + ENGINE_WRITE(engine, RING_TAIL, ring->head); + ENGINE_POSTING_READ(engine, RING_TAIL); + + ENGINE_WRITE(engine, RING_CTL, RING_CTL_SIZE(ring->size) | RING_VALID); + + /* If the head is still not zero, the ring is dead */ + if (intel_wait_for_register(engine->uncore, + RING_CTL(engine->mmio_base), + RING_VALID, RING_VALID, + 50)) { + DRM_ERROR("%s initialization failed " + "ctl %08x (valid? 
%d) head %08x [%08x] tail %08x [%08x] start %08x [expected %08x]\n", + engine->name, + ENGINE_READ(engine, RING_CTL), + ENGINE_READ(engine, RING_CTL) & RING_VALID, + ENGINE_READ(engine, RING_HEAD), ring->head, + ENGINE_READ(engine, RING_TAIL), ring->tail, + ENGINE_READ(engine, RING_START), + i915_ggtt_offset(ring->vma)); + ret = -EIO; + goto out; + } + + if (INTEL_GEN(dev_priv) > 2) + ENGINE_WRITE(engine, + RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING)); + + /* Now awake, let it get started */ + if (ring->tail != ring->head) { + ENGINE_WRITE(engine, RING_TAIL, ring->tail); + ENGINE_POSTING_READ(engine, RING_TAIL); + } + + /* Papering over lost _interrupts_ immediately following the restart */ + intel_engine_queue_breadcrumbs(engine); +out: + intel_uncore_forcewake_put(engine->uncore, FORCEWAKE_ALL); + + return ret; +} + +static void reset_prepare(struct intel_engine_cs *engine) +{ + intel_engine_stop_cs(engine); +} + +static void reset_ring(struct intel_engine_cs *engine, bool stalled) +{ + struct i915_request *pos, *rq; + unsigned long flags; + u32 head; + + rq = NULL; + spin_lock_irqsave(&engine->active.lock, flags); + list_for_each_entry(pos, &engine->active.requests, sched.link) { + if (!i915_request_completed(pos)) { + rq = pos; + break; + } + } + + /* + * The guilty request will get skipped on a hung engine. + * + * Users of client default contexts do not rely on logical + * state preserved between batches so it is safe to execute + * queued requests following the hang. Non default contexts + * rely on preserved state, so skipping a batch loses the + * evolution of the state and it needs to be considered corrupted. + * Executing more queued batches on top of corrupted state is + * risky. But we take the risk by trying to advance through + * the queued requests in order to make the client behaviour + * more predictable around resets, by not throwing away random + * amount of batches it has prepared for execution. Sophisticated + * clients can use gem_reset_stats_ioctl and dma fence status + * (exported via sync_file info ioctl on explicit fences) to observe + * when it loses the context state and should rebuild accordingly. + * + * The context ban, and ultimately the client ban, mechanism are safety + * valves if client submission ends up resulting in nothing more than + * subsequent hangs. + */ + + if (rq) { + /* + * Try to restore the logical GPU state to match the + * continuation of the request queue. If we skip the + * context/PD restore, then the next request may try to execute + * assuming that its context is valid and loaded on the GPU and + * so may try to access invalid memory, prompting repeated GPU + * hangs. + * + * If the request was guilty, we still restore the logical + * state in case the next request requires it (e.g. the + * aliasing ppgtt), but skip over the hung batch. + * + * If the request was innocent, we try to replay the request + * with the restored context. 
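As a side note on reset_ring() above: it walks the active request list for the first request that has not completed and rewinds the ring head to it, falling back to the current tail when everything already finished. The same scan-and-rewind idea is shown below over a plain array, with hypothetical types.

#include <stdbool.h>
#include <stdio.h>

struct req {
	unsigned int head;
	bool completed;
};

static unsigned int rewind_head(const struct req *reqs, int count,
				unsigned int tail)
{
	int i;

	for (i = 0; i < count; i++)
		if (!reqs[i].completed)
			return reqs[i].head;	/* restart from the hung request */

	return tail;				/* nothing pending: park at the tail */
}

int main(void)
{
	struct req reqs[] = {
		{ .head = 0x100, .completed = true },
		{ .head = 0x180, .completed = false },	/* first incomplete */
		{ .head = 0x1c0, .completed = false },
	};

	printf("new head: 0x%x\n", rewind_head(reqs, 3, 0x200));
	return 0;
}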
+ */ + i915_reset_request(rq, stalled); + + GEM_BUG_ON(rq->ring != engine->buffer); + head = rq->head; + } else { + head = engine->buffer->tail; + } + engine->buffer->head = intel_ring_wrap(engine->buffer, head); + + spin_unlock_irqrestore(&engine->active.lock, flags); +} + +static void reset_finish(struct intel_engine_cs *engine) +{ +} + +static int intel_rcs_ctx_init(struct i915_request *rq) +{ + int ret; + + ret = intel_engine_emit_ctx_wa(rq); + if (ret != 0) + return ret; + + ret = i915_gem_render_state_emit(rq); + if (ret) + return ret; + + return 0; +} + +static int rcs_resume(struct intel_engine_cs *engine) +{ + struct drm_i915_private *dev_priv = engine->i915; + + /* + * Disable CONSTANT_BUFFER before it is loaded from the context + * image. For as it is loaded, it is executed and the stored + * address may no longer be valid, leading to a GPU hang. + * + * This imposes the requirement that userspace reload their + * CONSTANT_BUFFER on every batch, fortunately a requirement + * they are already accustomed to from before contexts were + * enabled. + */ + if (IS_GEN(dev_priv, 4)) + I915_WRITE(ECOSKPD, + _MASKED_BIT_ENABLE(ECO_CONSTANT_BUFFER_SR_DISABLE)); + + /* WaTimedSingleVertexDispatch:cl,bw,ctg,elk,ilk,snb */ + if (IS_GEN_RANGE(dev_priv, 4, 6)) + I915_WRITE(MI_MODE, _MASKED_BIT_ENABLE(VS_TIMER_DISPATCH)); + + /* We need to disable the AsyncFlip performance optimisations in order + * to use MI_WAIT_FOR_EVENT within the CS. It should already be + * programmed to '1' on all products. + * + * WaDisableAsyncFlipPerfMode:snb,ivb,hsw,vlv + */ + if (IS_GEN_RANGE(dev_priv, 6, 7)) + I915_WRITE(MI_MODE, _MASKED_BIT_ENABLE(ASYNC_FLIP_PERF_DISABLE)); + + /* Required for the hardware to program scanline values for waiting */ + /* WaEnableFlushTlbInvalidationMode:snb */ + if (IS_GEN(dev_priv, 6)) + I915_WRITE(GFX_MODE, + _MASKED_BIT_ENABLE(GFX_TLB_INVALIDATE_EXPLICIT)); + + /* WaBCSVCSTlbInvalidationMode:ivb,vlv,hsw */ + if (IS_GEN(dev_priv, 7)) + I915_WRITE(GFX_MODE_GEN7, + _MASKED_BIT_ENABLE(GFX_TLB_INVALIDATE_EXPLICIT) | + _MASKED_BIT_ENABLE(GFX_REPLAY_MODE)); + + if (IS_GEN(dev_priv, 6)) { + /* From the Sandybridge PRM, volume 1 part 3, page 24: + * "If this bit is set, STCunit will have LRA as replacement + * policy. [...] This bit must be reset. LRA replacement + * policy is not supported." + */ + I915_WRITE(CACHE_MODE_0, + _MASKED_BIT_DISABLE(CM0_STC_EVICT_DISABLE_LRA_SNB)); + } + + if (IS_GEN_RANGE(dev_priv, 6, 7)) + I915_WRITE(INSTPM, _MASKED_BIT_ENABLE(INSTPM_FORCE_ORDERING)); + + return xcs_resume(engine); +} + +static void cancel_requests(struct intel_engine_cs *engine) +{ + struct i915_request *request; + unsigned long flags; + + spin_lock_irqsave(&engine->active.lock, flags); + + /* Mark all submitted requests as skipped. 
*/ + list_for_each_entry(request, &engine->active.requests, sched.link) { + if (!i915_request_signaled(request)) + dma_fence_set_error(&request->fence, -EIO); + + i915_request_mark_complete(request); + } + + /* Remaining _unready_ requests will be nop'ed when submitted */ + + spin_unlock_irqrestore(&engine->active.lock, flags); +} + +static void i9xx_submit_request(struct i915_request *request) +{ + i915_request_submit(request); + + ENGINE_WRITE(request->engine, RING_TAIL, + intel_ring_set_tail(request->ring, request->tail)); +} + +static u32 *i9xx_emit_breadcrumb(struct i915_request *rq, u32 *cs) +{ + GEM_BUG_ON(rq->timeline->hwsp_ggtt != rq->engine->status_page.vma); + GEM_BUG_ON(offset_in_page(rq->timeline->hwsp_offset) != I915_GEM_HWS_SEQNO_ADDR); + + *cs++ = MI_FLUSH; + + *cs++ = MI_STORE_DWORD_INDEX; + *cs++ = I915_GEM_HWS_SEQNO_ADDR; + *cs++ = rq->fence.seqno; + + *cs++ = MI_USER_INTERRUPT; + *cs++ = MI_NOOP; + + rq->tail = intel_ring_offset(rq, cs); + assert_ring_tail_valid(rq->ring, rq->tail); + + return cs; +} + +#define GEN5_WA_STORES 8 /* must be at least 1! */ +static u32 *gen5_emit_breadcrumb(struct i915_request *rq, u32 *cs) +{ + int i; + + GEM_BUG_ON(rq->timeline->hwsp_ggtt != rq->engine->status_page.vma); + GEM_BUG_ON(offset_in_page(rq->timeline->hwsp_offset) != I915_GEM_HWS_SEQNO_ADDR); + + *cs++ = MI_FLUSH; + + BUILD_BUG_ON(GEN5_WA_STORES < 1); + for (i = 0; i < GEN5_WA_STORES; i++) { + *cs++ = MI_STORE_DWORD_INDEX; + *cs++ = I915_GEM_HWS_SEQNO_ADDR; + *cs++ = rq->fence.seqno; + } + + *cs++ = MI_USER_INTERRUPT; + + rq->tail = intel_ring_offset(rq, cs); + assert_ring_tail_valid(rq->ring, rq->tail); + + return cs; +} +#undef GEN5_WA_STORES + +static void +gen5_irq_enable(struct intel_engine_cs *engine) +{ + gen5_enable_gt_irq(engine->i915, engine->irq_enable_mask); +} + +static void +gen5_irq_disable(struct intel_engine_cs *engine) +{ + gen5_disable_gt_irq(engine->i915, engine->irq_enable_mask); +} + +static void +i9xx_irq_enable(struct intel_engine_cs *engine) +{ + engine->i915->irq_mask &= ~engine->irq_enable_mask; + intel_uncore_write(engine->uncore, GEN2_IMR, engine->i915->irq_mask); + intel_uncore_posting_read_fw(engine->uncore, GEN2_IMR); +} + +static void +i9xx_irq_disable(struct intel_engine_cs *engine) +{ + engine->i915->irq_mask |= engine->irq_enable_mask; + intel_uncore_write(engine->uncore, GEN2_IMR, engine->i915->irq_mask); +} + +static void +i8xx_irq_enable(struct intel_engine_cs *engine) +{ + struct drm_i915_private *i915 = engine->i915; + + i915->irq_mask &= ~engine->irq_enable_mask; + intel_uncore_write16(&i915->uncore, GEN2_IMR, i915->irq_mask); + ENGINE_POSTING_READ16(engine, RING_IMR); +} + +static void +i8xx_irq_disable(struct intel_engine_cs *engine) +{ + struct drm_i915_private *i915 = engine->i915; + + i915->irq_mask |= engine->irq_enable_mask; + intel_uncore_write16(&i915->uncore, GEN2_IMR, i915->irq_mask); +} + +static int +bsd_ring_flush(struct i915_request *rq, u32 mode) +{ + u32 *cs; + + cs = intel_ring_begin(rq, 2); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + *cs++ = MI_FLUSH; + *cs++ = MI_NOOP; + intel_ring_advance(rq, cs); + return 0; +} + +static void +gen6_irq_enable(struct intel_engine_cs *engine) +{ + ENGINE_WRITE(engine, RING_IMR, + ~(engine->irq_enable_mask | engine->irq_keep_mask)); + + /* Flush/delay to ensure the RING_IMR is active before the GT IMR */ + ENGINE_POSTING_READ(engine, RING_IMR); + + gen5_enable_gt_irq(engine->i915, engine->irq_enable_mask); +} + +static void +gen6_irq_disable(struct intel_engine_cs *engine) +{ + 
ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask); + gen5_disable_gt_irq(engine->i915, engine->irq_enable_mask); +} + +static void +hsw_vebox_irq_enable(struct intel_engine_cs *engine) +{ + ENGINE_WRITE(engine, RING_IMR, ~engine->irq_enable_mask); + + /* Flush/delay to ensure the RING_IMR is active before the GT IMR */ + ENGINE_POSTING_READ(engine, RING_IMR); + + gen6_unmask_pm_irq(engine->i915, engine->irq_enable_mask); +} + +static void +hsw_vebox_irq_disable(struct intel_engine_cs *engine) +{ + ENGINE_WRITE(engine, RING_IMR, ~0); + gen6_mask_pm_irq(engine->i915, engine->irq_enable_mask); +} + +static int +i965_emit_bb_start(struct i915_request *rq, + u64 offset, u32 length, + unsigned int dispatch_flags) +{ + u32 *cs; + + cs = intel_ring_begin(rq, 2); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + *cs++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT | (dispatch_flags & + I915_DISPATCH_SECURE ? 0 : MI_BATCH_NON_SECURE_I965); + *cs++ = offset; + intel_ring_advance(rq, cs); + + return 0; +} + +/* Just userspace ABI convention to limit the wa batch bo to a resonable size */ +#define I830_BATCH_LIMIT SZ_256K +#define I830_TLB_ENTRIES (2) +#define I830_WA_SIZE max(I830_TLB_ENTRIES*4096, I830_BATCH_LIMIT) +static int +i830_emit_bb_start(struct i915_request *rq, + u64 offset, u32 len, + unsigned int dispatch_flags) +{ + u32 *cs, cs_offset = i915_scratch_offset(rq->i915); + + GEM_BUG_ON(rq->i915->gt.scratch->size < I830_WA_SIZE); + + cs = intel_ring_begin(rq, 6); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + /* Evict the invalid PTE TLBs */ + *cs++ = COLOR_BLT_CMD | BLT_WRITE_RGBA; + *cs++ = BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | 4096; + *cs++ = I830_TLB_ENTRIES << 16 | 4; /* load each page */ + *cs++ = cs_offset; + *cs++ = 0xdeadbeef; + *cs++ = MI_NOOP; + intel_ring_advance(rq, cs); + + if ((dispatch_flags & I915_DISPATCH_PINNED) == 0) { + if (len > I830_BATCH_LIMIT) + return -ENOSPC; + + cs = intel_ring_begin(rq, 6 + 2); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + /* Blit the batch (which has now all relocs applied) to the + * stable batch scratch bo area (so that the CS never + * stumbles over its tlb invalidation bug) ... + */ + *cs++ = SRC_COPY_BLT_CMD | BLT_WRITE_RGBA; + *cs++ = BLT_DEPTH_32 | BLT_ROP_SRC_COPY | 4096; + *cs++ = DIV_ROUND_UP(len, 4096) << 16 | 4096; + *cs++ = cs_offset; + *cs++ = 4096; + *cs++ = offset; + + *cs++ = MI_FLUSH; + *cs++ = MI_NOOP; + intel_ring_advance(rq, cs); + + /* ... and execute it. */ + offset = cs_offset; + } + + cs = intel_ring_begin(rq, 2); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + *cs++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT; + *cs++ = offset | (dispatch_flags & I915_DISPATCH_SECURE ? 0 : + MI_BATCH_NON_SECURE); + intel_ring_advance(rq, cs); + + return 0; +} + +static int +i915_emit_bb_start(struct i915_request *rq, + u64 offset, u32 len, + unsigned int dispatch_flags) +{ + u32 *cs; + + cs = intel_ring_begin(rq, 2); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + *cs++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT; + *cs++ = offset | (dispatch_flags & I915_DISPATCH_SECURE ? 0 : + MI_BATCH_NON_SECURE); + intel_ring_advance(rq, cs); + + return 0; +} + +int intel_ring_pin(struct intel_ring *ring) +{ + struct i915_vma *vma = ring->vma; + enum i915_map_type map = i915_coherent_map_type(vma->vm->i915); + unsigned int flags; + void *addr; + int ret; + + GEM_BUG_ON(ring->vaddr); + + ret = i915_timeline_pin(ring->timeline); + if (ret) + return ret; + + flags = PIN_GLOBAL; + + /* Ring wraparound at offset 0 sometimes hangs. No idea why. 
*/ + flags |= PIN_OFFSET_BIAS | i915_ggtt_pin_bias(vma); + + if (vma->obj->stolen) + flags |= PIN_MAPPABLE; + else + flags |= PIN_HIGH; + + ret = i915_vma_pin(vma, 0, 0, flags); + if (unlikely(ret)) + goto unpin_timeline; + + if (i915_vma_is_map_and_fenceable(vma)) + addr = (void __force *)i915_vma_pin_iomap(vma); + else + addr = i915_gem_object_pin_map(vma->obj, map); + if (IS_ERR(addr)) { + ret = PTR_ERR(addr); + goto unpin_ring; + } + + vma->obj->pin_global++; + + ring->vaddr = addr; + return 0; + +unpin_ring: + i915_vma_unpin(vma); +unpin_timeline: + i915_timeline_unpin(ring->timeline); + return ret; +} + +void intel_ring_reset(struct intel_ring *ring, u32 tail) +{ + GEM_BUG_ON(!intel_ring_offset_valid(ring, tail)); + + ring->tail = tail; + ring->head = tail; + ring->emit = tail; + intel_ring_update_space(ring); +} + +void intel_ring_unpin(struct intel_ring *ring) +{ + GEM_BUG_ON(!ring->vma); + GEM_BUG_ON(!ring->vaddr); + + /* Discard any unused bytes beyond that submitted to hw. */ + intel_ring_reset(ring, ring->tail); + + if (i915_vma_is_map_and_fenceable(ring->vma)) + i915_vma_unpin_iomap(ring->vma); + else + i915_gem_object_unpin_map(ring->vma->obj); + ring->vaddr = NULL; + + ring->vma->obj->pin_global--; + i915_vma_unpin(ring->vma); + + i915_timeline_unpin(ring->timeline); +} + +static struct i915_vma * +intel_ring_create_vma(struct drm_i915_private *dev_priv, int size) +{ + struct i915_address_space *vm = &dev_priv->ggtt.vm; + struct drm_i915_gem_object *obj; + struct i915_vma *vma; + + obj = i915_gem_object_create_stolen(dev_priv, size); + if (!obj) + obj = i915_gem_object_create_internal(dev_priv, size); + if (IS_ERR(obj)) + return ERR_CAST(obj); + + /* + * Mark ring buffers as read-only from GPU side (so no stray overwrites) + * if supported by the platform's GGTT. + */ + if (vm->has_read_only) + i915_gem_object_set_readonly(obj); + + vma = i915_vma_instance(obj, vm, NULL); + if (IS_ERR(vma)) + goto err; + + return vma; + +err: + i915_gem_object_put(obj); + return vma; +} + +struct intel_ring * +intel_engine_create_ring(struct intel_engine_cs *engine, + struct i915_timeline *timeline, + int size) +{ + struct intel_ring *ring; + struct i915_vma *vma; + + GEM_BUG_ON(!is_power_of_2(size)); + GEM_BUG_ON(RING_CTL_SIZE(size) & ~RING_NR_PAGES); + + ring = kzalloc(sizeof(*ring), GFP_KERNEL); + if (!ring) + return ERR_PTR(-ENOMEM); + + kref_init(&ring->ref); + INIT_LIST_HEAD(&ring->request_list); + ring->timeline = i915_timeline_get(timeline); + + ring->size = size; + /* Workaround an erratum on the i830 which causes a hang if + * the TAIL pointer points to within the last 2 cachelines + * of the buffer. 
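As a side note on intel_engine_create_ring() above: it insists on a power-of-two ring size and shrinks effective_size on i830/i845 so the tail never lands in the last two cachelines. With a power-of-two size, free space in such a ring can be computed with simple wraparound masking; the expression below is a common formulation and only a sketch, not necessarily the driver's exact arithmetic (ring_space and reserve are invented for the example).

#include <stdio.h>

static unsigned int ring_space(unsigned int head, unsigned int tail,
			       unsigned int size, unsigned int reserve)
{
	/* size must be a power of two for the mask to be valid */
	return (head - tail - reserve) & (size - 1);
}

int main(void)
{
	/* empty ring: head == tail, almost the whole ring is usable */
	printf("%u\n", ring_space(0, 0, 4096, 64));
	/* tail wrapped past head: only the gap up to head remains */
	printf("%u\n", ring_space(256, 4032, 4096, 64));
	return 0;
}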
+ */ + ring->effective_size = size; + if (IS_I830(engine->i915) || IS_I845G(engine->i915)) + ring->effective_size -= 2 * CACHELINE_BYTES; + + intel_ring_update_space(ring); + + vma = intel_ring_create_vma(engine->i915, size); + if (IS_ERR(vma)) { + kfree(ring); + return ERR_CAST(vma); + } + ring->vma = vma; + + return ring; +} + +void intel_ring_free(struct kref *ref) +{ + struct intel_ring *ring = container_of(ref, typeof(*ring), ref); + + i915_vma_close(ring->vma); + i915_vma_put(ring->vma); + + i915_timeline_put(ring->timeline); + kfree(ring); +} + +static void __ring_context_fini(struct intel_context *ce) +{ + GEM_BUG_ON(i915_gem_object_is_active(ce->state->obj)); + i915_gem_object_put(ce->state->obj); +} + +static void ring_context_destroy(struct kref *ref) +{ + struct intel_context *ce = container_of(ref, typeof(*ce), ref); + + GEM_BUG_ON(intel_context_is_pinned(ce)); + + if (ce->state) + __ring_context_fini(ce); + + intel_context_free(ce); +} + +static int __context_pin_ppgtt(struct i915_gem_context *ctx) +{ + struct i915_address_space *vm; + int err = 0; + + vm = ctx->vm ?: &ctx->i915->mm.aliasing_ppgtt->vm; + if (vm) + err = gen6_ppgtt_pin(i915_vm_to_ppgtt((vm))); + + return err; +} + +static void __context_unpin_ppgtt(struct i915_gem_context *ctx) +{ + struct i915_address_space *vm; + + vm = ctx->vm ?: &ctx->i915->mm.aliasing_ppgtt->vm; + if (vm) + gen6_ppgtt_unpin(i915_vm_to_ppgtt(vm)); +} + +static void ring_context_unpin(struct intel_context *ce) +{ + __context_unpin_ppgtt(ce->gem_context); +} + +static struct i915_vma * +alloc_context_vma(struct intel_engine_cs *engine) +{ + struct drm_i915_private *i915 = engine->i915; + struct drm_i915_gem_object *obj; + struct i915_vma *vma; + int err; + + obj = i915_gem_object_create_shmem(i915, engine->context_size); + if (IS_ERR(obj)) + return ERR_CAST(obj); + + /* + * Try to make the context utilize L3 as well as LLC. + * + * On VLV we don't have L3 controls in the PTEs so we + * shouldn't touch the cache level, especially as that + * would make the object snooped which might have a + * negative performance impact. + * + * Snooping is required on non-llc platforms in execlist + * mode, but since all GGTT accesses use PAT entry 0 we + * get snooping anyway regardless of cache_level. + * + * This is only applicable for Ivy Bridge devices since + * later platforms don't have L3 control bits in the PTE. 
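+ * Hence the IS_IVYBRIDGE() check below.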
+ */ + if (IS_IVYBRIDGE(i915)) + i915_gem_object_set_cache_coherency(obj, I915_CACHE_L3_LLC); + + if (engine->default_state) { + void *defaults, *vaddr; + + vaddr = i915_gem_object_pin_map(obj, I915_MAP_WB); + if (IS_ERR(vaddr)) { + err = PTR_ERR(vaddr); + goto err_obj; + } + + defaults = i915_gem_object_pin_map(engine->default_state, + I915_MAP_WB); + if (IS_ERR(defaults)) { + err = PTR_ERR(defaults); + goto err_map; + } + + memcpy(vaddr, defaults, engine->context_size); + i915_gem_object_unpin_map(engine->default_state); + + i915_gem_object_flush_map(obj); + i915_gem_object_unpin_map(obj); + } + + vma = i915_vma_instance(obj, &i915->ggtt.vm, NULL); + if (IS_ERR(vma)) { + err = PTR_ERR(vma); + goto err_obj; + } + + return vma; + +err_map: + i915_gem_object_unpin_map(obj); +err_obj: + i915_gem_object_put(obj); + return ERR_PTR(err); +} + +static int ring_context_pin(struct intel_context *ce) +{ + struct intel_engine_cs *engine = ce->engine; + int err; + + /* One ringbuffer to rule them all */ + GEM_BUG_ON(!engine->buffer); + ce->ring = engine->buffer; + + if (!ce->state && engine->context_size) { + struct i915_vma *vma; + + vma = alloc_context_vma(engine); + if (IS_ERR(vma)) + return PTR_ERR(vma); + + ce->state = vma; + } + + err = intel_context_active_acquire(ce, PIN_HIGH); + if (err) + return err; + + err = __context_pin_ppgtt(ce->gem_context); + if (err) + goto err_active; + + return 0; + +err_active: + intel_context_active_release(ce); + return err; +} + +static void ring_context_reset(struct intel_context *ce) +{ + intel_ring_reset(ce->ring, 0); +} + +static const struct intel_context_ops ring_context_ops = { + .pin = ring_context_pin, + .unpin = ring_context_unpin, + + .enter = intel_context_enter_engine, + .exit = intel_context_exit_engine, + + .reset = ring_context_reset, + .destroy = ring_context_destroy, +}; + +static int load_pd_dir(struct i915_request *rq, const struct i915_ppgtt *ppgtt) +{ + const struct intel_engine_cs * const engine = rq->engine; + u32 *cs; + + cs = intel_ring_begin(rq, 6); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + *cs++ = MI_LOAD_REGISTER_IMM(1); + *cs++ = i915_mmio_reg_offset(RING_PP_DIR_DCLV(engine->mmio_base)); + *cs++ = PP_DIR_DCLV_2G; + + *cs++ = MI_LOAD_REGISTER_IMM(1); + *cs++ = i915_mmio_reg_offset(RING_PP_DIR_BASE(engine->mmio_base)); + *cs++ = ppgtt->pd->base.ggtt_offset << 10; + + intel_ring_advance(rq, cs); + + return 0; +} + +static int flush_pd_dir(struct i915_request *rq) +{ + const struct intel_engine_cs * const engine = rq->engine; + u32 *cs; + + cs = intel_ring_begin(rq, 4); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + /* Stall until the page table load is complete */ + *cs++ = MI_STORE_REGISTER_MEM | MI_SRM_LRM_GLOBAL_GTT; + *cs++ = i915_mmio_reg_offset(RING_PP_DIR_BASE(engine->mmio_base)); + *cs++ = i915_scratch_offset(rq->i915); + *cs++ = MI_NOOP; + + intel_ring_advance(rq, cs); + return 0; +} + +static inline int mi_set_context(struct i915_request *rq, u32 flags) +{ + struct drm_i915_private *i915 = rq->i915; + struct intel_engine_cs *engine = rq->engine; + enum intel_engine_id id; + const int num_engines = + IS_HSW_GT1(i915) ? 
RUNTIME_INFO(i915)->num_engines - 1 : 0; + bool force_restore = false; + int len; + u32 *cs; + + flags |= MI_MM_SPACE_GTT; + if (IS_HASWELL(i915)) + /* These flags are for resource streamer on HSW+ */ + flags |= HSW_MI_RS_SAVE_STATE_EN | HSW_MI_RS_RESTORE_STATE_EN; + else + /* We need to save the extended state for powersaving modes */ + flags |= MI_SAVE_EXT_STATE_EN | MI_RESTORE_EXT_STATE_EN; + + len = 4; + if (IS_GEN(i915, 7)) + len += 2 + (num_engines ? 4 * num_engines + 6 : 0); + else if (IS_GEN(i915, 5)) + len += 2; + if (flags & MI_FORCE_RESTORE) { + GEM_BUG_ON(flags & MI_RESTORE_INHIBIT); + flags &= ~MI_FORCE_RESTORE; + force_restore = true; + len += 2; + } + + cs = intel_ring_begin(rq, len); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + /* WaProgramMiArbOnOffAroundMiSetContext:ivb,vlv,hsw,bdw,chv */ + if (IS_GEN(i915, 7)) { + *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; + if (num_engines) { + struct intel_engine_cs *signaller; + + *cs++ = MI_LOAD_REGISTER_IMM(num_engines); + for_each_engine(signaller, i915, id) { + if (signaller == engine) + continue; + + *cs++ = i915_mmio_reg_offset( + RING_PSMI_CTL(signaller->mmio_base)); + *cs++ = _MASKED_BIT_ENABLE( + GEN6_PSMI_SLEEP_MSG_DISABLE); + } + } + } else if (IS_GEN(i915, 5)) { + /* + * This w/a is only listed for pre-production ilk a/b steppings, + * but is also mentioned for programming the powerctx. To be + * safe, just apply the workaround; we do not use SyncFlush so + * this should never take effect and so be a no-op! + */ + *cs++ = MI_SUSPEND_FLUSH | MI_SUSPEND_FLUSH_EN; + } + + if (force_restore) { + /* + * The HW doesn't handle being told to restore the current + * context very well. Quite often it likes goes to go off and + * sulk, especially when it is meant to be reloading PP_DIR. + * A very simple fix to force the reload is to simply switch + * away from the current context and back again. + * + * Note that the kernel_context will contain random state + * following the INHIBIT_RESTORE. We accept this since we + * never use the kernel_context state; it is merely a + * placeholder we use to flush other contexts. + */ + *cs++ = MI_SET_CONTEXT; + *cs++ = i915_ggtt_offset(engine->kernel_context->state) | + MI_MM_SPACE_GTT | + MI_RESTORE_INHIBIT; + } + + *cs++ = MI_NOOP; + *cs++ = MI_SET_CONTEXT; + *cs++ = i915_ggtt_offset(rq->hw_context->state) | flags; + /* + * w/a: MI_SET_CONTEXT must always be followed by MI_NOOP + * WaMiSetContext_Hang:snb,ivb,vlv + */ + *cs++ = MI_NOOP; + + if (IS_GEN(i915, 7)) { + if (num_engines) { + struct intel_engine_cs *signaller; + i915_reg_t last_reg = {}; /* keep gcc quiet */ + + *cs++ = MI_LOAD_REGISTER_IMM(num_engines); + for_each_engine(signaller, i915, id) { + if (signaller == engine) + continue; + + last_reg = RING_PSMI_CTL(signaller->mmio_base); + *cs++ = i915_mmio_reg_offset(last_reg); + *cs++ = _MASKED_BIT_DISABLE( + GEN6_PSMI_SLEEP_MSG_DISABLE); + } + + /* Insert a delay before the next switch! 
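+ * The delay is implemented below as an MI_STORE_REGISTER_MEM of the
+ * last PSMI register written above into the scratch page - effectively
+ * a posting read that ensures the LRI writes have landed before
+ * arbitration is re-enabled.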
*/ + *cs++ = MI_STORE_REGISTER_MEM | MI_SRM_LRM_GLOBAL_GTT; + *cs++ = i915_mmio_reg_offset(last_reg); + *cs++ = i915_scratch_offset(rq->i915); + *cs++ = MI_NOOP; + } + *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; + } else if (IS_GEN(i915, 5)) { + *cs++ = MI_SUSPEND_FLUSH; + } + + intel_ring_advance(rq, cs); + + return 0; +} + +static int remap_l3(struct i915_request *rq, int slice) +{ + u32 *cs, *remap_info = rq->i915->l3_parity.remap_info[slice]; + int i; + + if (!remap_info) + return 0; + + cs = intel_ring_begin(rq, GEN7_L3LOG_SIZE/4 * 2 + 2); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + /* + * Note: We do not worry about the concurrent register cacheline hang + * here because no other code should access these registers other than + * at initialization time. + */ + *cs++ = MI_LOAD_REGISTER_IMM(GEN7_L3LOG_SIZE/4); + for (i = 0; i < GEN7_L3LOG_SIZE/4; i++) { + *cs++ = i915_mmio_reg_offset(GEN7_L3LOG(slice, i)); + *cs++ = remap_info[i]; + } + *cs++ = MI_NOOP; + intel_ring_advance(rq, cs); + + return 0; +} + +static int switch_context(struct i915_request *rq) +{ + struct intel_engine_cs *engine = rq->engine; + struct i915_gem_context *ctx = rq->gem_context; + struct i915_address_space *vm = + ctx->vm ?: &rq->i915->mm.aliasing_ppgtt->vm; + unsigned int unwind_mm = 0; + u32 hw_flags = 0; + int ret, i; + + GEM_BUG_ON(HAS_EXECLISTS(rq->i915)); + + if (vm) { + struct i915_ppgtt *ppgtt = i915_vm_to_ppgtt(vm); + int loops; + + /* + * Baytail takes a little more convincing that it really needs + * to reload the PD between contexts. It is not just a little + * longer, as adding more stalls after the load_pd_dir (i.e. + * adding a long loop around flush_pd_dir) is not as effective + * as reloading the PD umpteen times. 32 is derived from + * experimentation (gem_exec_parallel/fds) and has no good + * explanation. + */ + loops = 1; + if (engine->id == BCS0 && IS_VALLEYVIEW(engine->i915)) + loops = 32; + + do { + ret = load_pd_dir(rq, ppgtt); + if (ret) + goto err; + } while (--loops); + + if (ppgtt->pd_dirty_engines & engine->mask) { + unwind_mm = engine->mask; + ppgtt->pd_dirty_engines &= ~unwind_mm; + hw_flags = MI_FORCE_RESTORE; + } + } + + if (rq->hw_context->state) { + GEM_BUG_ON(engine->id != RCS0); + + /* + * The kernel context(s) is treated as pure scratch and is not + * expected to retain any state (as we sacrifice it during + * suspend and on resume it may be corrupted). This is ok, + * as nothing actually executes using the kernel context; it + * is purely used for flushing user contexts. + */ + if (i915_gem_context_is_kernel(ctx)) + hw_flags = MI_RESTORE_INHIBIT; + + ret = mi_set_context(rq, hw_flags); + if (ret) + goto err_mm; + } + + if (vm) { + ret = engine->emit_flush(rq, EMIT_INVALIDATE); + if (ret) + goto err_mm; + + ret = flush_pd_dir(rq); + if (ret) + goto err_mm; + + /* + * Not only do we need a full barrier (post-sync write) after + * invalidating the TLBs, but we need to wait a little bit + * longer. Whether this is merely delaying us, or the + * subsequent flush is a key part of serialising with the + * post-sync op, this extra pass appears vital before a + * mm switch! 
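+ * Hence the sequence below: a second invalidation pass followed by a
+ * full flush before any user payload runs on the new page tables.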
+ */ + ret = engine->emit_flush(rq, EMIT_INVALIDATE); + if (ret) + goto err_mm; + + ret = engine->emit_flush(rq, EMIT_FLUSH); + if (ret) + goto err_mm; + } + + if (ctx->remap_slice) { + for (i = 0; i < MAX_L3_SLICES; i++) { + if (!(ctx->remap_slice & BIT(i))) + continue; + + ret = remap_l3(rq, i); + if (ret) + goto err_mm; + } + + ctx->remap_slice = 0; + } + + return 0; + +err_mm: + if (unwind_mm) + i915_vm_to_ppgtt(vm)->pd_dirty_engines |= unwind_mm; +err: + return ret; +} + +static int ring_request_alloc(struct i915_request *request) +{ + int ret; + + GEM_BUG_ON(!intel_context_is_pinned(request->hw_context)); + GEM_BUG_ON(request->timeline->has_initial_breadcrumb); + + /* + * Flush enough space to reduce the likelihood of waiting after + * we start building the request - in which case we will just + * have to repeat work. + */ + request->reserved_space += LEGACY_REQUEST_SIZE; + + /* Unconditionally invalidate GPU caches and TLBs. */ + ret = request->engine->emit_flush(request, EMIT_INVALIDATE); + if (ret) + return ret; + + ret = switch_context(request); + if (ret) + return ret; + + request->reserved_space -= LEGACY_REQUEST_SIZE; + return 0; +} + +static noinline int wait_for_space(struct intel_ring *ring, unsigned int bytes) +{ + struct i915_request *target; + long timeout; + + if (intel_ring_update_space(ring) >= bytes) + return 0; + + GEM_BUG_ON(list_empty(&ring->request_list)); + list_for_each_entry(target, &ring->request_list, ring_link) { + /* Would completion of this request free enough space? */ + if (bytes <= __intel_ring_space(target->postfix, + ring->emit, ring->size)) + break; + } + + if (WARN_ON(&target->ring_link == &ring->request_list)) + return -ENOSPC; + + timeout = i915_request_wait(target, + I915_WAIT_INTERRUPTIBLE, + MAX_SCHEDULE_TIMEOUT); + if (timeout < 0) + return timeout; + + i915_request_retire_upto(target); + + intel_ring_update_space(ring); + GEM_BUG_ON(ring->space < bytes); + return 0; +} + +u32 *intel_ring_begin(struct i915_request *rq, unsigned int num_dwords) +{ + struct intel_ring *ring = rq->ring; + const unsigned int remain_usable = ring->effective_size - ring->emit; + const unsigned int bytes = num_dwords * sizeof(u32); + unsigned int need_wrap = 0; + unsigned int total_bytes; + u32 *cs; + + /* Packets must be qword aligned. */ + GEM_BUG_ON(num_dwords & 1); + + total_bytes = bytes + rq->reserved_space; + GEM_BUG_ON(total_bytes > ring->effective_size); + + if (unlikely(total_bytes > remain_usable)) { + const int remain_actual = ring->size - ring->emit; + + if (bytes > remain_usable) { + /* + * Not enough space for the basic request. So need to + * flush out the remainder and then wait for + * base + reserved. + */ + total_bytes += remain_actual; + need_wrap = remain_actual | 1; + } else { + /* + * The base request will fit but the reserved space + * falls off the end. So we don't need an immediate + * wrap and only need to effectively wait for the + * reserved size from the start of ringbuffer. + */ + total_bytes = rq->reserved_space + remain_actual; + } + } + + if (unlikely(total_bytes > ring->space)) { + int ret; + + /* + * Space is reserved in the ringbuffer for finalising the + * request, as that cannot be allowed to fail. During request + * finalisation, reserved_space is set to 0 to stop the + * overallocation and the assumption is that then we never need + * to wait (which has the risk of failing with EINTR). + * + * See also i915_request_alloc() and i915_request_add(). 
+ */ + GEM_BUG_ON(!rq->reserved_space); + + ret = wait_for_space(ring, total_bytes); + if (unlikely(ret)) + return ERR_PTR(ret); + } + + if (unlikely(need_wrap)) { + need_wrap &= ~1; + GEM_BUG_ON(need_wrap > ring->space); + GEM_BUG_ON(ring->emit + need_wrap > ring->size); + GEM_BUG_ON(!IS_ALIGNED(need_wrap, sizeof(u64))); + + /* Fill the tail with MI_NOOP */ + memset64(ring->vaddr + ring->emit, 0, need_wrap / sizeof(u64)); + ring->space -= need_wrap; + ring->emit = 0; + } + + GEM_BUG_ON(ring->emit > ring->size - bytes); + GEM_BUG_ON(ring->space < bytes); + cs = ring->vaddr + ring->emit; + GEM_DEBUG_EXEC(memset32(cs, POISON_INUSE, bytes / sizeof(*cs))); + ring->emit += bytes; + ring->space -= bytes; + + return cs; +} + +/* Align the ring tail to a cacheline boundary */ +int intel_ring_cacheline_align(struct i915_request *rq) +{ + int num_dwords; + void *cs; + + num_dwords = (rq->ring->emit & (CACHELINE_BYTES - 1)) / sizeof(u32); + if (num_dwords == 0) + return 0; + + num_dwords = CACHELINE_DWORDS - num_dwords; + GEM_BUG_ON(num_dwords & 1); + + cs = intel_ring_begin(rq, num_dwords); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + memset64(cs, (u64)MI_NOOP << 32 | MI_NOOP, num_dwords / 2); + intel_ring_advance(rq, cs); + + GEM_BUG_ON(rq->ring->emit & (CACHELINE_BYTES - 1)); + return 0; +} + +static void gen6_bsd_submit_request(struct i915_request *request) +{ + struct intel_uncore *uncore = request->engine->uncore; + + intel_uncore_forcewake_get(uncore, FORCEWAKE_ALL); + + /* Every tail move must follow the sequence below */ + + /* Disable notification that the ring is IDLE. The GT + * will then assume that it is busy and bring it out of rc6. + */ + intel_uncore_write_fw(uncore, GEN6_BSD_SLEEP_PSMI_CONTROL, + _MASKED_BIT_ENABLE(GEN6_BSD_SLEEP_MSG_DISABLE)); + + /* Clear the context id. Here be magic! */ + intel_uncore_write64_fw(uncore, GEN6_BSD_RNCID, 0x0); + + /* Wait for the ring not to be idle, i.e. for it to wake up. */ + if (__intel_wait_for_register_fw(uncore, + GEN6_BSD_SLEEP_PSMI_CONTROL, + GEN6_BSD_SLEEP_INDICATOR, + 0, + 1000, 0, NULL)) + DRM_ERROR("timed out waiting for the BSD ring to wake up\n"); + + /* Now that the ring is fully powered up, update the tail */ + i9xx_submit_request(request); + + /* Let the ring send IDLE messages to the GT again, + * and so let it sleep to conserve power when idle. + */ + intel_uncore_write_fw(uncore, GEN6_BSD_SLEEP_PSMI_CONTROL, + _MASKED_BIT_DISABLE(GEN6_BSD_SLEEP_MSG_DISABLE)); + + intel_uncore_forcewake_put(uncore, FORCEWAKE_ALL); +} + +static int mi_flush_dw(struct i915_request *rq, u32 flags) +{ + u32 cmd, *cs; + + cs = intel_ring_begin(rq, 4); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + cmd = MI_FLUSH_DW; + + /* + * We always require a command barrier so that subsequent + * commands, such as breadcrumb interrupts, are strictly ordered + * wrt the contents of the write cache being flushed to memory + * (and thus being coherent from the CPU). + */ + cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW; + + /* + * Bspec vol 1c.3 - blitter engine command streamer: + * "If ENABLED, all TLBs will be invalidated once the flush + * operation is complete. This bit is only valid when the + * Post-Sync Operation field is a value of 1h or 3h." + */ + cmd |= flags; + + *cs++ = cmd; + *cs++ = I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT; + *cs++ = 0; + *cs++ = MI_NOOP; + + intel_ring_advance(rq, cs); + + return 0; +} + +static int gen6_flush_dw(struct i915_request *rq, u32 mode, u32 invflags) +{ + return mi_flush_dw(rq, mode & EMIT_INVALIDATE ? 
invflags : 0); +} + +static int gen6_bsd_ring_flush(struct i915_request *rq, u32 mode) +{ + return gen6_flush_dw(rq, mode, MI_INVALIDATE_TLB | MI_INVALIDATE_BSD); +} + +static int +hsw_emit_bb_start(struct i915_request *rq, + u64 offset, u32 len, + unsigned int dispatch_flags) +{ + u32 *cs; + + cs = intel_ring_begin(rq, 2); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + *cs++ = MI_BATCH_BUFFER_START | (dispatch_flags & I915_DISPATCH_SECURE ? + 0 : MI_BATCH_PPGTT_HSW | MI_BATCH_NON_SECURE_HSW); + /* bit0-7 is the length on GEN6+ */ + *cs++ = offset; + intel_ring_advance(rq, cs); + + return 0; +} + +static int +gen6_emit_bb_start(struct i915_request *rq, + u64 offset, u32 len, + unsigned int dispatch_flags) +{ + u32 *cs; + + cs = intel_ring_begin(rq, 2); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + *cs++ = MI_BATCH_BUFFER_START | (dispatch_flags & I915_DISPATCH_SECURE ? + 0 : MI_BATCH_NON_SECURE_I965); + /* bit0-7 is the length on GEN6+ */ + *cs++ = offset; + intel_ring_advance(rq, cs); + + return 0; +} + +/* Blitter support (SandyBridge+) */ + +static int gen6_ring_flush(struct i915_request *rq, u32 mode) +{ + return gen6_flush_dw(rq, mode, MI_INVALIDATE_TLB); +} + +static void i9xx_set_default_submission(struct intel_engine_cs *engine) +{ + engine->submit_request = i9xx_submit_request; + engine->cancel_requests = cancel_requests; + + engine->park = NULL; + engine->unpark = NULL; +} + +static void gen6_bsd_set_default_submission(struct intel_engine_cs *engine) +{ + i9xx_set_default_submission(engine); + engine->submit_request = gen6_bsd_submit_request; +} + +static void ring_destroy(struct intel_engine_cs *engine) +{ + struct drm_i915_private *dev_priv = engine->i915; + + WARN_ON(INTEL_GEN(dev_priv) > 2 && + (ENGINE_READ(engine, RING_MI_MODE) & MODE_IDLE) == 0); + + intel_ring_unpin(engine->buffer); + intel_ring_put(engine->buffer); + + intel_engine_cleanup_common(engine); + kfree(engine); +} + +static void setup_irq(struct intel_engine_cs *engine) +{ + struct drm_i915_private *i915 = engine->i915; + + if (INTEL_GEN(i915) >= 6) { + engine->irq_enable = gen6_irq_enable; + engine->irq_disable = gen6_irq_disable; + } else if (INTEL_GEN(i915) >= 5) { + engine->irq_enable = gen5_irq_enable; + engine->irq_disable = gen5_irq_disable; + } else if (INTEL_GEN(i915) >= 3) { + engine->irq_enable = i9xx_irq_enable; + engine->irq_disable = i9xx_irq_disable; + } else { + engine->irq_enable = i8xx_irq_enable; + engine->irq_disable = i8xx_irq_disable; + } +} + +static void setup_common(struct intel_engine_cs *engine) +{ + struct drm_i915_private *i915 = engine->i915; + + /* gen8+ are only supported with execlists */ + GEM_BUG_ON(INTEL_GEN(i915) >= 8); + + setup_irq(engine); + + engine->destroy = ring_destroy; + + engine->resume = xcs_resume; + engine->reset.prepare = reset_prepare; + engine->reset.reset = reset_ring; + engine->reset.finish = reset_finish; + + engine->cops = &ring_context_ops; + engine->request_alloc = ring_request_alloc; + + /* + * Using a global execution timeline; the previous final breadcrumb is + * equivalent to our next initial bread so we can elide + * engine->emit_init_breadcrumb(). 
+ */ + engine->emit_fini_breadcrumb = i9xx_emit_breadcrumb; + if (IS_GEN(i915, 5)) + engine->emit_fini_breadcrumb = gen5_emit_breadcrumb; + + engine->set_default_submission = i9xx_set_default_submission; + + if (INTEL_GEN(i915) >= 6) + engine->emit_bb_start = gen6_emit_bb_start; + else if (INTEL_GEN(i915) >= 4) + engine->emit_bb_start = i965_emit_bb_start; + else if (IS_I830(i915) || IS_I845G(i915)) + engine->emit_bb_start = i830_emit_bb_start; + else + engine->emit_bb_start = i915_emit_bb_start; +} + +static void setup_rcs(struct intel_engine_cs *engine) +{ + struct drm_i915_private *i915 = engine->i915; + + if (HAS_L3_DPF(i915)) + engine->irq_keep_mask = GT_RENDER_L3_PARITY_ERROR_INTERRUPT; + + engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT; + + if (INTEL_GEN(i915) >= 7) { + engine->init_context = intel_rcs_ctx_init; + engine->emit_flush = gen7_render_ring_flush; + engine->emit_fini_breadcrumb = gen7_rcs_emit_breadcrumb; + } else if (IS_GEN(i915, 6)) { + engine->init_context = intel_rcs_ctx_init; + engine->emit_flush = gen6_render_ring_flush; + engine->emit_fini_breadcrumb = gen6_rcs_emit_breadcrumb; + } else if (IS_GEN(i915, 5)) { + engine->emit_flush = gen4_render_ring_flush; + } else { + if (INTEL_GEN(i915) < 4) + engine->emit_flush = gen2_render_ring_flush; + else + engine->emit_flush = gen4_render_ring_flush; + engine->irq_enable_mask = I915_USER_INTERRUPT; + } + + if (IS_HASWELL(i915)) + engine->emit_bb_start = hsw_emit_bb_start; + + engine->resume = rcs_resume; +} + +static void setup_vcs(struct intel_engine_cs *engine) +{ + struct drm_i915_private *i915 = engine->i915; + + if (INTEL_GEN(i915) >= 6) { + /* gen6 bsd needs a special wa for tail updates */ + if (IS_GEN(i915, 6)) + engine->set_default_submission = gen6_bsd_set_default_submission; + engine->emit_flush = gen6_bsd_ring_flush; + engine->irq_enable_mask = GT_BSD_USER_INTERRUPT; + + if (IS_GEN(i915, 6)) + engine->emit_fini_breadcrumb = gen6_xcs_emit_breadcrumb; + else + engine->emit_fini_breadcrumb = gen7_xcs_emit_breadcrumb; + } else { + engine->emit_flush = bsd_ring_flush; + if (IS_GEN(i915, 5)) + engine->irq_enable_mask = ILK_BSD_USER_INTERRUPT; + else + engine->irq_enable_mask = I915_BSD_USER_INTERRUPT; + } +} + +static void setup_bcs(struct intel_engine_cs *engine) +{ + struct drm_i915_private *i915 = engine->i915; + + engine->emit_flush = gen6_ring_flush; + engine->irq_enable_mask = GT_BLT_USER_INTERRUPT; + + if (IS_GEN(i915, 6)) + engine->emit_fini_breadcrumb = gen6_xcs_emit_breadcrumb; + else + engine->emit_fini_breadcrumb = gen7_xcs_emit_breadcrumb; +} + +static void setup_vecs(struct intel_engine_cs *engine) +{ + struct drm_i915_private *i915 = engine->i915; + + GEM_BUG_ON(INTEL_GEN(i915) < 7); + + engine->emit_flush = gen6_ring_flush; + engine->irq_enable_mask = PM_VEBOX_USER_INTERRUPT; + engine->irq_enable = hsw_vebox_irq_enable; + engine->irq_disable = hsw_vebox_irq_disable; + + engine->emit_fini_breadcrumb = gen7_xcs_emit_breadcrumb; +} + +int intel_ring_submission_setup(struct intel_engine_cs *engine) +{ + setup_common(engine); + + switch (engine->class) { + case RENDER_CLASS: + setup_rcs(engine); + break; + case VIDEO_DECODE_CLASS: + setup_vcs(engine); + break; + case COPY_ENGINE_CLASS: + setup_bcs(engine); + break; + case VIDEO_ENHANCEMENT_CLASS: + setup_vecs(engine); + break; + default: + MISSING_CASE(engine->class); + return -ENODEV; + } + + return 0; +} + +int intel_ring_submission_init(struct intel_engine_cs *engine) +{ + struct i915_timeline *timeline; + struct intel_ring *ring; + int err; + + 
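+ /*
+ * Setup order below: create a timeline backed by the engine's status
+ * page, create and pin a 32 page ring on it, then initialise the
+ * common engine state.
+ */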
+ timeline = i915_timeline_create(engine->i915, engine->status_page.vma);
+ if (IS_ERR(timeline)) {
+ err = PTR_ERR(timeline);
+ goto err;
+ }
+ GEM_BUG_ON(timeline->has_initial_breadcrumb);
+
+ ring = intel_engine_create_ring(engine, timeline, 32 * PAGE_SIZE);
+ i915_timeline_put(timeline);
+ if (IS_ERR(ring)) {
+ err = PTR_ERR(ring);
+ goto err;
+ }
+
+ err = intel_ring_pin(ring);
+ if (err)
+ goto err_ring;
+
+ GEM_BUG_ON(engine->buffer);
+ engine->buffer = ring;
+
+ err = intel_engine_init_common(engine);
+ if (err)
+ goto err_unpin;
+
+ GEM_BUG_ON(ring->timeline->hwsp_ggtt != engine->status_page.vma);
+
+ return 0;
+
+err_unpin:
+ intel_ring_unpin(ring);
+err_ring:
+ intel_ring_put(ring);
+err:
+ intel_engine_cleanup_common(engine);
+ return err;
+}
diff --git a/drivers/gpu/drm/i915/gt/intel_sseu.c b/drivers/gpu/drm/i915/gt/intel_sseu.c
new file mode 100644
index 000000000000..a0756f006f5f
--- /dev/null
+++ b/drivers/gpu/drm/i915/gt/intel_sseu.c
@@ -0,0 +1,159 @@
+/*
+ * SPDX-License-Identifier: MIT
+ *
+ * Copyright © 2019 Intel Corporation
+ */
+
+#include "i915_drv.h"
+#include "intel_lrc_reg.h"
+#include "intel_sseu.h"
+
+unsigned int
+intel_sseu_subslice_total(const struct sseu_dev_info *sseu)
+{
+ unsigned int i, total = 0;
+
+ for (i = 0; i < ARRAY_SIZE(sseu->subslice_mask); i++)
+ total += hweight8(sseu->subslice_mask[i]);
+
+ return total;
+}
+
+unsigned int
+intel_sseu_subslices_per_slice(const struct sseu_dev_info *sseu, u8 slice)
+{
+ return hweight8(sseu->subslice_mask[slice]);
+}
+
+u32 intel_sseu_make_rpcs(struct drm_i915_private *i915,
+ const struct intel_sseu *req_sseu)
+{
+ const struct sseu_dev_info *sseu = &RUNTIME_INFO(i915)->sseu;
+ bool subslice_pg = sseu->has_subslice_pg;
+ struct intel_sseu ctx_sseu;
+ u8 slices, subslices;
+ u32 rpcs = 0;
+
+ /*
+ * No explicit RPCS request is needed to ensure full
+ * slice/subslice/EU enablement prior to Gen9.
+ */
+ if (INTEL_GEN(i915) < 9)
+ return 0;
+
+ /*
+ * If i915/perf is active, we want a stable powergating configuration
+ * on the system.
+ *
+ * We could choose full enablement, but on ICL we know there are use
+ * cases which disable slices for functional as well as performance
+ * reasons. So in this case we select a known stable subset.
+ */
+ if (!i915->perf.oa.exclusive_stream) {
+ ctx_sseu = *req_sseu;
+ } else {
+ ctx_sseu = intel_sseu_from_device_info(sseu);
+
+ if (IS_GEN(i915, 11)) {
+ /*
+ * We only need the subslice count, so it doesn't matter
+ * which ones we select - just keep the lowest half of
+ * all available subslices per slice.
+ */
+ ctx_sseu.subslice_mask =
+ ~(~0 << (hweight8(ctx_sseu.subslice_mask) / 2));
+ ctx_sseu.slice_mask = 0x1;
+ }
+ }
+
+ slices = hweight8(ctx_sseu.slice_mask);
+ subslices = hweight8(ctx_sseu.subslice_mask);
+
+ /*
+ * Since the SScount bitfield in GEN8_R_PWR_CLK_STATE is only three bits
+ * wide and Icelake has up to eight subslices, special programming is
+ * needed in order to correctly enable all subslices.
+ *
+ * According to documentation software must consider the configuration
+ * as 2x4x8 and hardware will translate this to 1x8x8.
+ *
+ * Furthermore, even though SScount is three bits, the maximum documented
+ * value for it is four. From this some rules/restrictions follow:
+ *
+ * 1.
+ * If enabled subslice count is greater than four, two whole slices must
+ * be enabled instead.
+ *
+ * 2.
+ * When more than one slice is enabled, hardware ignores the subslice
+ * count altogether.
+ * + * From these restrictions it follows that it is not possible to enable + * a count of subslices between the SScount maximum of four restriction, + * and the maximum available number on a particular SKU. Either all + * subslices are enabled, or a count between one and four on the first + * slice. + */ + if (IS_GEN(i915, 11) && + slices == 1 && + subslices > min_t(u8, 4, hweight8(sseu->subslice_mask[0]) / 2)) { + GEM_BUG_ON(subslices & 1); + + subslice_pg = false; + slices *= 2; + } + + /* + * Starting in Gen9, render power gating can leave + * slice/subslice/EU in a partially enabled state. We + * must make an explicit request through RPCS for full + * enablement. + */ + if (sseu->has_slice_pg) { + u32 mask, val = slices; + + if (INTEL_GEN(i915) >= 11) { + mask = GEN11_RPCS_S_CNT_MASK; + val <<= GEN11_RPCS_S_CNT_SHIFT; + } else { + mask = GEN8_RPCS_S_CNT_MASK; + val <<= GEN8_RPCS_S_CNT_SHIFT; + } + + GEM_BUG_ON(val & ~mask); + val &= mask; + + rpcs |= GEN8_RPCS_ENABLE | GEN8_RPCS_S_CNT_ENABLE | val; + } + + if (subslice_pg) { + u32 val = subslices; + + val <<= GEN8_RPCS_SS_CNT_SHIFT; + + GEM_BUG_ON(val & ~GEN8_RPCS_SS_CNT_MASK); + val &= GEN8_RPCS_SS_CNT_MASK; + + rpcs |= GEN8_RPCS_ENABLE | GEN8_RPCS_SS_CNT_ENABLE | val; + } + + if (sseu->has_eu_pg) { + u32 val; + + val = ctx_sseu.min_eus_per_subslice << GEN8_RPCS_EU_MIN_SHIFT; + GEM_BUG_ON(val & ~GEN8_RPCS_EU_MIN_MASK); + val &= GEN8_RPCS_EU_MIN_MASK; + + rpcs |= val; + + val = ctx_sseu.max_eus_per_subslice << GEN8_RPCS_EU_MAX_SHIFT; + GEM_BUG_ON(val & ~GEN8_RPCS_EU_MAX_MASK); + val &= GEN8_RPCS_EU_MAX_MASK; + + rpcs |= val; + + rpcs |= GEN8_RPCS_ENABLE; + } + + return rpcs; +} diff --git a/drivers/gpu/drm/i915/gt/intel_sseu.h b/drivers/gpu/drm/i915/gt/intel_sseu.h new file mode 100644 index 000000000000..b50d0401a4e2 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_sseu.h @@ -0,0 +1,75 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2019 Intel Corporation + */ + +#ifndef __INTEL_SSEU_H__ +#define __INTEL_SSEU_H__ + +#include <linux/types.h> +#include <linux/kernel.h> + +struct drm_i915_private; + +#define GEN_MAX_SLICES (6) /* CNL upper bound */ +#define GEN_MAX_SUBSLICES (8) /* ICL upper bound */ +#define GEN_SSEU_STRIDE(max_entries) DIV_ROUND_UP(max_entries, BITS_PER_BYTE) + +struct sseu_dev_info { + u8 slice_mask; + u8 subslice_mask[GEN_MAX_SLICES]; + u16 eu_total; + u8 eu_per_subslice; + u8 min_eu_in_pool; + /* For each slice, which subslice(s) has(have) 7 EUs (bitfield)? */ + u8 subslice_7eu[3]; + u8 has_slice_pg:1; + u8 has_subslice_pg:1; + u8 has_eu_pg:1; + + /* Topology fields */ + u8 max_slices; + u8 max_subslices; + u8 max_eus_per_subslice; + + /* We don't have more than 8 eus per subslice at the moment and as we + * store eus enabled using bits, no need to multiply by eus per + * subslice. + */ + u8 eu_mask[GEN_MAX_SLICES * GEN_MAX_SUBSLICES]; +}; + +/* + * Powergating configuration for a particular (context,engine). 
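+ *
+ * The masks select how many slices/subslices may be enabled, while the
+ * EU bounds feed the RPCS min/max fields (see intel_sseu_make_rpcs()).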
+ */
+struct intel_sseu {
+ u8 slice_mask;
+ u8 subslice_mask;
+ u8 min_eus_per_subslice;
+ u8 max_eus_per_subslice;
+};
+
+static inline struct intel_sseu
+intel_sseu_from_device_info(const struct sseu_dev_info *sseu)
+{
+ struct intel_sseu value = {
+ .slice_mask = sseu->slice_mask,
+ .subslice_mask = sseu->subslice_mask[0],
+ .min_eus_per_subslice = sseu->max_eus_per_subslice,
+ .max_eus_per_subslice = sseu->max_eus_per_subslice,
+ };
+
+ return value;
+}
+
+unsigned int
+intel_sseu_subslice_total(const struct sseu_dev_info *sseu);
+
+unsigned int
+intel_sseu_subslices_per_slice(const struct sseu_dev_info *sseu, u8 slice);
+
+u32 intel_sseu_make_rpcs(struct drm_i915_private *i915,
+ const struct intel_sseu *req_sseu);
+
+#endif /* __INTEL_SSEU_H__ */
diff --git a/drivers/gpu/drm/i915/gt/intel_workarounds.c b/drivers/gpu/drm/i915/gt/intel_workarounds.c
new file mode 100644
index 000000000000..15e90fd2cfdc
--- /dev/null
+++ b/drivers/gpu/drm/i915/gt/intel_workarounds.c
@@ -0,0 +1,1478 @@
+/*
+ * SPDX-License-Identifier: MIT
+ *
+ * Copyright © 2014-2018 Intel Corporation
+ */
+
+#include "i915_drv.h"
+#include "intel_context.h"
+#include "intel_workarounds.h"
+
+/**
+ * DOC: Hardware workarounds
+ *
+ * This file is intended as a central place to implement most [1]_ of the
+ * required workarounds for hardware to work as originally intended. They fall
+ * into five basic categories depending on how/when they are applied:
+ *
+ * - Workarounds that touch registers that are saved/restored to/from the HW
+ * context image. The list is emitted (via Load Register Immediate commands)
+ * every time a new context is created.
+ * - GT workarounds. The list of these WAs is applied whenever these registers
+ * revert to default values (on GPU reset, suspend/resume [2]_, etc.).
+ * - Display workarounds. The list is applied during display clock-gating
+ * initialization.
+ * - Workarounds that whitelist a privileged register, so that UMDs can manage
+ * them directly. This is just a special case of an MMIO workaround (as we
+ * write the list of these to-be-whitelisted registers to some special HW
+ * registers).
+ * - Workaround batchbuffers, that get executed automatically by the hardware
+ * on every HW context restore.
+ *
+ * .. [1] Please notice that there are other WAs that, due to their nature,
+ * cannot be applied from a central place. Those are peppered around the rest
+ * of the code, as needed.
+ *
+ * .. [2] Technically, some registers are power-context saved & restored, so they
+ * survive a suspend/resume. In practice, writing them again is not too
+ * costly and simplifies things. We can revisit this in the future.
+ *
+ * Layout
+ * ~~~~~~
+ *
+ * Keep things in this file ordered by WA type, as per the above (context, GT,
+ * display, register whitelist, batchbuffer). Then, inside each type, keep the
+ * following order:
+ *
+ * - Infrastructure functions and macros
+ * - WAs per platform in standard gen/chrono order
+ * - Public functions to init or apply the given workaround type.
+ */
+
+static void wa_init_start(struct i915_wa_list *wal, const char *name)
+{
+ wal->name = name;
+}
+
+#define WA_LIST_CHUNK (1 << 4)
+
+static void wa_init_finish(struct i915_wa_list *wal)
+{
+ /* Trim unused entries.
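+ * _wa_add() grows the list in WA_LIST_CHUNK sized steps, so reallocate
+ * it to the exact count and give the slack back to the allocator.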
*/ + if (!IS_ALIGNED(wal->count, WA_LIST_CHUNK)) { + struct i915_wa *list = kmemdup(wal->list, + wal->count * sizeof(*list), + GFP_KERNEL); + + if (list) { + kfree(wal->list); + wal->list = list; + } + } + + if (!wal->count) + return; + + DRM_DEBUG_DRIVER("Initialized %u %s workarounds\n", + wal->wa_count, wal->name); +} + +static void _wa_add(struct i915_wa_list *wal, const struct i915_wa *wa) +{ + unsigned int addr = i915_mmio_reg_offset(wa->reg); + unsigned int start = 0, end = wal->count; + const unsigned int grow = WA_LIST_CHUNK; + struct i915_wa *wa_; + + GEM_BUG_ON(!is_power_of_2(grow)); + + if (IS_ALIGNED(wal->count, grow)) { /* Either uninitialized or full. */ + struct i915_wa *list; + + list = kmalloc_array(ALIGN(wal->count + 1, grow), sizeof(*wa), + GFP_KERNEL); + if (!list) { + DRM_ERROR("No space for workaround init!\n"); + return; + } + + if (wal->list) + memcpy(list, wal->list, sizeof(*wa) * wal->count); + + wal->list = list; + } + + while (start < end) { + unsigned int mid = start + (end - start) / 2; + + if (i915_mmio_reg_offset(wal->list[mid].reg) < addr) { + start = mid + 1; + } else if (i915_mmio_reg_offset(wal->list[mid].reg) > addr) { + end = mid; + } else { + wa_ = &wal->list[mid]; + + if ((wa->mask & ~wa_->mask) == 0) { + DRM_ERROR("Discarding overwritten w/a for reg %04x (mask: %08x, value: %08x)\n", + i915_mmio_reg_offset(wa_->reg), + wa_->mask, wa_->val); + + wa_->val &= ~wa->mask; + } + + wal->wa_count++; + wa_->val |= wa->val; + wa_->mask |= wa->mask; + wa_->read |= wa->read; + return; + } + } + + wal->wa_count++; + wa_ = &wal->list[wal->count++]; + *wa_ = *wa; + + while (wa_-- > wal->list) { + GEM_BUG_ON(i915_mmio_reg_offset(wa_[0].reg) == + i915_mmio_reg_offset(wa_[1].reg)); + if (i915_mmio_reg_offset(wa_[1].reg) > + i915_mmio_reg_offset(wa_[0].reg)) + break; + + swap(wa_[1], wa_[0]); + } +} + +static void +wa_write_masked_or(struct i915_wa_list *wal, i915_reg_t reg, u32 mask, + u32 val) +{ + struct i915_wa wa = { + .reg = reg, + .mask = mask, + .val = val, + .read = mask, + }; + + _wa_add(wal, &wa); +} + +static void +wa_masked_en(struct i915_wa_list *wal, i915_reg_t reg, u32 val) +{ + wa_write_masked_or(wal, reg, val, _MASKED_BIT_ENABLE(val)); +} + +static void +wa_write(struct i915_wa_list *wal, i915_reg_t reg, u32 val) +{ + wa_write_masked_or(wal, reg, ~0, val); +} + +static void +wa_write_or(struct i915_wa_list *wal, i915_reg_t reg, u32 val) +{ + wa_write_masked_or(wal, reg, val, val); +} + +static void +ignore_wa_write_or(struct i915_wa_list *wal, i915_reg_t reg, u32 mask, u32 val) +{ + struct i915_wa wa = { + .reg = reg, + .mask = mask, + .val = val, + /* Bonkers HW, skip verifying */ + }; + + _wa_add(wal, &wa); +} + +#define WA_SET_BIT_MASKED(addr, mask) \ + wa_write_masked_or(wal, (addr), (mask), _MASKED_BIT_ENABLE(mask)) + +#define WA_CLR_BIT_MASKED(addr, mask) \ + wa_write_masked_or(wal, (addr), (mask), _MASKED_BIT_DISABLE(mask)) + +#define WA_SET_FIELD_MASKED(addr, mask, value) \ + wa_write_masked_or(wal, (addr), (mask), _MASKED_FIELD((mask), (value))) + +static void gen8_ctx_workarounds_init(struct intel_engine_cs *engine, + struct i915_wa_list *wal) +{ + WA_SET_BIT_MASKED(INSTPM, INSTPM_FORCE_ORDERING); + + /* WaDisableAsyncFlipPerfMode:bdw,chv */ + WA_SET_BIT_MASKED(MI_MODE, ASYNC_FLIP_PERF_DISABLE); + + /* WaDisablePartialInstShootdown:bdw,chv */ + WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN, + PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE); + + /* Use Force Non-Coherent whenever executing a 3D context. 
This is a + * workaround for for a possible hang in the unlikely event a TLB + * invalidation occurs during a PSD flush. + */ + /* WaForceEnableNonCoherent:bdw,chv */ + /* WaHdcDisableFetchWhenMasked:bdw,chv */ + WA_SET_BIT_MASKED(HDC_CHICKEN0, + HDC_DONOT_FETCH_MEM_WHEN_MASKED | + HDC_FORCE_NON_COHERENT); + + /* From the Haswell PRM, Command Reference: Registers, CACHE_MODE_0: + * "The Hierarchical Z RAW Stall Optimization allows non-overlapping + * polygons in the same 8x4 pixel/sample area to be processed without + * stalling waiting for the earlier ones to write to Hierarchical Z + * buffer." + * + * This optimization is off by default for BDW and CHV; turn it on. + */ + WA_CLR_BIT_MASKED(CACHE_MODE_0_GEN7, HIZ_RAW_STALL_OPT_DISABLE); + + /* Wa4x4STCOptimizationDisable:bdw,chv */ + WA_SET_BIT_MASKED(CACHE_MODE_1, GEN8_4x4_STC_OPTIMIZATION_DISABLE); + + /* + * BSpec recommends 8x4 when MSAA is used, + * however in practice 16x4 seems fastest. + * + * Note that PS/WM thread counts depend on the WIZ hashing + * disable bit, which we don't touch here, but it's good + * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM). + */ + WA_SET_FIELD_MASKED(GEN7_GT_MODE, + GEN6_WIZ_HASHING_MASK, + GEN6_WIZ_HASHING_16x4); +} + +static void bdw_ctx_workarounds_init(struct intel_engine_cs *engine, + struct i915_wa_list *wal) +{ + struct drm_i915_private *i915 = engine->i915; + + gen8_ctx_workarounds_init(engine, wal); + + /* WaDisableThreadStallDopClockGating:bdw (pre-production) */ + WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN, STALL_DOP_GATING_DISABLE); + + /* WaDisableDopClockGating:bdw + * + * Also see the related UCGTCL1 write in broadwell_init_clock_gating() + * to disable EUTC clock gating. + */ + WA_SET_BIT_MASKED(GEN7_ROW_CHICKEN2, + DOP_CLOCK_GATING_DISABLE); + + WA_SET_BIT_MASKED(HALF_SLICE_CHICKEN3, + GEN8_SAMPLER_POWER_BYPASS_DIS); + + WA_SET_BIT_MASKED(HDC_CHICKEN0, + /* WaForceContextSaveRestoreNonCoherent:bdw */ + HDC_FORCE_CONTEXT_SAVE_RESTORE_NON_COHERENT | + /* WaDisableFenceDestinationToSLM:bdw (pre-prod) */ + (IS_BDW_GT3(i915) ? HDC_FENCE_DEST_SLM_DISABLE : 0)); +} + +static void chv_ctx_workarounds_init(struct intel_engine_cs *engine, + struct i915_wa_list *wal) +{ + gen8_ctx_workarounds_init(engine, wal); + + /* WaDisableThreadStallDopClockGating:chv */ + WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN, STALL_DOP_GATING_DISABLE); + + /* Improve HiZ throughput on CHV. */ + WA_SET_BIT_MASKED(HIZ_CHICKEN, CHV_HZ_8X8_MODE_IN_1X); +} + +static void gen9_ctx_workarounds_init(struct intel_engine_cs *engine, + struct i915_wa_list *wal) +{ + struct drm_i915_private *i915 = engine->i915; + + if (HAS_LLC(i915)) { + /* WaCompressedResourceSamplerPbeMediaNewHashMode:skl,kbl + * + * Must match Display Engine. See + * WaCompressedResourceDisplayNewHashMode. 
+ */ + WA_SET_BIT_MASKED(COMMON_SLICE_CHICKEN2, + GEN9_PBE_COMPRESSED_HASH_SELECTION); + WA_SET_BIT_MASKED(GEN9_HALF_SLICE_CHICKEN7, + GEN9_SAMPLER_HASH_COMPRESSED_READ_ADDR); + } + + /* WaClearFlowControlGpgpuContextSave:skl,bxt,kbl,glk,cfl */ + /* WaDisablePartialInstShootdown:skl,bxt,kbl,glk,cfl */ + WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN, + FLOW_CONTROL_ENABLE | + PARTIAL_INSTRUCTION_SHOOTDOWN_DISABLE); + + /* Syncing dependencies between camera and graphics:skl,bxt,kbl */ + if (!IS_COFFEELAKE(i915)) + WA_SET_BIT_MASKED(HALF_SLICE_CHICKEN3, + GEN9_DISABLE_OCL_OOB_SUPPRESS_LOGIC); + + /* WaEnableYV12BugFixInHalfSliceChicken7:skl,bxt,kbl,glk,cfl */ + /* WaEnableSamplerGPGPUPreemptionSupport:skl,bxt,kbl,cfl */ + WA_SET_BIT_MASKED(GEN9_HALF_SLICE_CHICKEN7, + GEN9_ENABLE_YV12_BUGFIX | + GEN9_ENABLE_GPGPU_PREEMPTION); + + /* Wa4x4STCOptimizationDisable:skl,bxt,kbl,glk,cfl */ + /* WaDisablePartialResolveInVc:skl,bxt,kbl,cfl */ + WA_SET_BIT_MASKED(CACHE_MODE_1, + GEN8_4x4_STC_OPTIMIZATION_DISABLE | + GEN9_PARTIAL_RESOLVE_IN_VC_DISABLE); + + /* WaCcsTlbPrefetchDisable:skl,bxt,kbl,glk,cfl */ + WA_CLR_BIT_MASKED(GEN9_HALF_SLICE_CHICKEN5, + GEN9_CCS_TLB_PREFETCH_ENABLE); + + /* WaForceContextSaveRestoreNonCoherent:skl,bxt,kbl,cfl */ + WA_SET_BIT_MASKED(HDC_CHICKEN0, + HDC_FORCE_CONTEXT_SAVE_RESTORE_NON_COHERENT | + HDC_FORCE_CSR_NON_COHERENT_OVR_DISABLE); + + /* WaForceEnableNonCoherent and WaDisableHDCInvalidation are + * both tied to WaForceContextSaveRestoreNonCoherent + * in some hsds for skl. We keep the tie for all gen9. The + * documentation is a bit hazy and so we want to get common behaviour, + * even though there is no clear evidence we would need both on kbl/bxt. + * This area has been source of system hangs so we play it safe + * and mimic the skl regardless of what bspec says. + * + * Use Force Non-Coherent whenever executing a 3D context. This + * is a workaround for a possible hang in the unlikely event + * a TLB invalidation occurs during a PSD flush. + */ + + /* WaForceEnableNonCoherent:skl,bxt,kbl,cfl */ + WA_SET_BIT_MASKED(HDC_CHICKEN0, + HDC_FORCE_NON_COHERENT); + + /* WaDisableSamplerPowerBypassForSOPingPong:skl,bxt,kbl,cfl */ + if (IS_SKYLAKE(i915) || IS_KABYLAKE(i915) || IS_COFFEELAKE(i915)) + WA_SET_BIT_MASKED(HALF_SLICE_CHICKEN3, + GEN8_SAMPLER_POWER_BYPASS_DIS); + + /* WaDisableSTUnitPowerOptimization:skl,bxt,kbl,glk,cfl */ + WA_SET_BIT_MASKED(HALF_SLICE_CHICKEN2, GEN8_ST_PO_DISABLE); + + /* + * Supporting preemption with fine-granularity requires changes in the + * batch buffer programming. Since we can't break old userspace, we + * need to set our default preemption level to safe value. Userspace is + * still able to use more fine-grained preemption levels, since in + * WaEnablePreemptionGranularityControlByUMD we're whitelisting the + * per-ctx register. As such, WaDisable{3D,GPGPU}MidCmdPreemption are + * not real HW workarounds, but merely a way to start using preemption + * while maintaining old contract with userspace. 
+ */ + + /* WaDisable3DMidCmdPreemption:skl,bxt,glk,cfl,[cnl] */ + WA_CLR_BIT_MASKED(GEN8_CS_CHICKEN1, GEN9_PREEMPT_3D_OBJECT_LEVEL); + + /* WaDisableGPGPUMidCmdPreemption:skl,bxt,blk,cfl,[cnl] */ + WA_SET_FIELD_MASKED(GEN8_CS_CHICKEN1, + GEN9_PREEMPT_GPGPU_LEVEL_MASK, + GEN9_PREEMPT_GPGPU_COMMAND_LEVEL); + + /* WaClearHIZ_WM_CHICKEN3:bxt,glk */ + if (IS_GEN9_LP(i915)) + WA_SET_BIT_MASKED(GEN9_WM_CHICKEN3, GEN9_FACTOR_IN_CLR_VAL_HIZ); +} + +static void skl_tune_iz_hashing(struct intel_engine_cs *engine, + struct i915_wa_list *wal) +{ + struct drm_i915_private *i915 = engine->i915; + u8 vals[3] = { 0, 0, 0 }; + unsigned int i; + + for (i = 0; i < 3; i++) { + u8 ss; + + /* + * Only consider slices where one, and only one, subslice has 7 + * EUs + */ + if (!is_power_of_2(RUNTIME_INFO(i915)->sseu.subslice_7eu[i])) + continue; + + /* + * subslice_7eu[i] != 0 (because of the check above) and + * ss_max == 4 (maximum number of subslices possible per slice) + * + * -> 0 <= ss <= 3; + */ + ss = ffs(RUNTIME_INFO(i915)->sseu.subslice_7eu[i]) - 1; + vals[i] = 3 - ss; + } + + if (vals[0] == 0 && vals[1] == 0 && vals[2] == 0) + return; + + /* Tune IZ hashing. See intel_device_info_runtime_init() */ + WA_SET_FIELD_MASKED(GEN7_GT_MODE, + GEN9_IZ_HASHING_MASK(2) | + GEN9_IZ_HASHING_MASK(1) | + GEN9_IZ_HASHING_MASK(0), + GEN9_IZ_HASHING(2, vals[2]) | + GEN9_IZ_HASHING(1, vals[1]) | + GEN9_IZ_HASHING(0, vals[0])); +} + +static void skl_ctx_workarounds_init(struct intel_engine_cs *engine, + struct i915_wa_list *wal) +{ + gen9_ctx_workarounds_init(engine, wal); + skl_tune_iz_hashing(engine, wal); +} + +static void bxt_ctx_workarounds_init(struct intel_engine_cs *engine, + struct i915_wa_list *wal) +{ + gen9_ctx_workarounds_init(engine, wal); + + /* WaDisableThreadStallDopClockGating:bxt */ + WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN, + STALL_DOP_GATING_DISABLE); + + /* WaToEnableHwFixForPushConstHWBug:bxt */ + WA_SET_BIT_MASKED(COMMON_SLICE_CHICKEN2, + GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION); +} + +static void kbl_ctx_workarounds_init(struct intel_engine_cs *engine, + struct i915_wa_list *wal) +{ + struct drm_i915_private *i915 = engine->i915; + + gen9_ctx_workarounds_init(engine, wal); + + /* WaToEnableHwFixForPushConstHWBug:kbl */ + if (IS_KBL_REVID(i915, KBL_REVID_C0, REVID_FOREVER)) + WA_SET_BIT_MASKED(COMMON_SLICE_CHICKEN2, + GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION); + + /* WaDisableSbeCacheDispatchPortSharing:kbl */ + WA_SET_BIT_MASKED(GEN7_HALF_SLICE_CHICKEN1, + GEN7_SBE_SS_CACHE_DISPATCH_PORT_SHARING_DISABLE); +} + +static void glk_ctx_workarounds_init(struct intel_engine_cs *engine, + struct i915_wa_list *wal) +{ + gen9_ctx_workarounds_init(engine, wal); + + /* WaToEnableHwFixForPushConstHWBug:glk */ + WA_SET_BIT_MASKED(COMMON_SLICE_CHICKEN2, + GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION); +} + +static void cfl_ctx_workarounds_init(struct intel_engine_cs *engine, + struct i915_wa_list *wal) +{ + gen9_ctx_workarounds_init(engine, wal); + + /* WaToEnableHwFixForPushConstHWBug:cfl */ + WA_SET_BIT_MASKED(COMMON_SLICE_CHICKEN2, + GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION); + + /* WaDisableSbeCacheDispatchPortSharing:cfl */ + WA_SET_BIT_MASKED(GEN7_HALF_SLICE_CHICKEN1, + GEN7_SBE_SS_CACHE_DISPATCH_PORT_SHARING_DISABLE); +} + +static void cnl_ctx_workarounds_init(struct intel_engine_cs *engine, + struct i915_wa_list *wal) +{ + struct drm_i915_private *i915 = engine->i915; + + /* WaForceContextSaveRestoreNonCoherent:cnl */ + WA_SET_BIT_MASKED(CNL_HDC_CHICKEN0, + HDC_FORCE_CONTEXT_SAVE_RESTORE_NON_COHERENT); + + /* 
WaThrottleEUPerfToAvoidTDBackPressure:cnl(pre-prod) */
+ if (IS_CNL_REVID(i915, CNL_REVID_B0, CNL_REVID_B0))
+ WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN, THROTTLE_12_5);
+
+ /* WaDisableReplayBufferBankArbitrationOptimization:cnl */
+ WA_SET_BIT_MASKED(COMMON_SLICE_CHICKEN2,
+ GEN8_SBE_DISABLE_REPLAY_BUF_OPTIMIZATION);
+
+ /* WaDisableEnhancedSBEVertexCaching:cnl (pre-prod) */
+ if (IS_CNL_REVID(i915, 0, CNL_REVID_B0))
+ WA_SET_BIT_MASKED(COMMON_SLICE_CHICKEN2,
+ GEN8_CSC2_SBE_VUE_CACHE_CONSERVATIVE);
+
+ /* WaPushConstantDereferenceHoldDisable:cnl */
+ WA_SET_BIT_MASKED(GEN7_ROW_CHICKEN2, PUSH_CONSTANT_DEREF_DISABLE);
+
+ /* FtrEnableFastAnisoL1BankingFix:cnl */
+ WA_SET_BIT_MASKED(HALF_SLICE_CHICKEN3, CNL_FAST_ANISO_L1_BANKING_FIX);
+
+ /* WaDisable3DMidCmdPreemption:cnl */
+ WA_CLR_BIT_MASKED(GEN8_CS_CHICKEN1, GEN9_PREEMPT_3D_OBJECT_LEVEL);
+
+ /* WaDisableGPGPUMidCmdPreemption:cnl */
+ WA_SET_FIELD_MASKED(GEN8_CS_CHICKEN1,
+ GEN9_PREEMPT_GPGPU_LEVEL_MASK,
+ GEN9_PREEMPT_GPGPU_COMMAND_LEVEL);
+
+ /* WaDisableEarlyEOT:cnl */
+ WA_SET_BIT_MASKED(GEN8_ROW_CHICKEN, DISABLE_EARLY_EOT);
+}
+
+static void icl_ctx_workarounds_init(struct intel_engine_cs *engine,
+ struct i915_wa_list *wal)
+{
+ struct drm_i915_private *i915 = engine->i915;
+
+ /* WaDisableBankHangMode:icl */
+ wa_write(wal,
+ GEN8_L3CNTLREG,
+ intel_uncore_read(engine->uncore, GEN8_L3CNTLREG) |
+ GEN8_ERRDETBCTRL);
+
+ /* Wa_1604370585:icl (pre-prod)
+ * Formerly known as WaPushConstantDereferenceHoldDisable
+ */
+ if (IS_ICL_REVID(i915, ICL_REVID_A0, ICL_REVID_B0))
+ WA_SET_BIT_MASKED(GEN7_ROW_CHICKEN2,
+ PUSH_CONSTANT_DEREF_DISABLE);
+
+ /* WaForceEnableNonCoherent:icl
+ * This is not the same workaround as in early Gen9 platforms, where
+ * lacking this could cause system hangs, but coherency performance
+ * overhead is high and only a few compute workloads really need it
+ * (the register is whitelisted in hardware now, so UMDs can opt in
+ * for coherency if they have a good reason).
+ */ + WA_SET_BIT_MASKED(ICL_HDC_MODE, HDC_FORCE_NON_COHERENT); + + /* Wa_2006611047:icl (pre-prod) + * Formerly known as WaDisableImprovedTdlClkGating + */ + if (IS_ICL_REVID(i915, ICL_REVID_A0, ICL_REVID_A0)) + WA_SET_BIT_MASKED(GEN7_ROW_CHICKEN2, + GEN11_TDL_CLOCK_GATING_FIX_DISABLE); + + /* Wa_2006665173:icl (pre-prod) */ + if (IS_ICL_REVID(i915, ICL_REVID_A0, ICL_REVID_A0)) + WA_SET_BIT_MASKED(GEN11_COMMON_SLICE_CHICKEN3, + GEN11_BLEND_EMB_FIX_DISABLE_IN_RCC); + + /* WaEnableFloatBlendOptimization:icl */ + wa_write_masked_or(wal, + GEN10_CACHE_MODE_SS, + 0, /* write-only, so skip validation */ + _MASKED_BIT_ENABLE(FLOAT_BLEND_OPTIMIZATION_ENABLE)); + + /* WaDisableGPGPUMidThreadPreemption:icl */ + WA_SET_FIELD_MASKED(GEN8_CS_CHICKEN1, + GEN9_PREEMPT_GPGPU_LEVEL_MASK, + GEN9_PREEMPT_GPGPU_THREAD_GROUP_LEVEL); + + /* allow headerless messages for preemptible GPGPU context */ + WA_SET_BIT_MASKED(GEN10_SAMPLER_MODE, + GEN11_SAMPLER_ENABLE_HEADLESS_MSG); +} + +static void +__intel_engine_init_ctx_wa(struct intel_engine_cs *engine, + struct i915_wa_list *wal, + const char *name) +{ + struct drm_i915_private *i915 = engine->i915; + + if (engine->class != RENDER_CLASS) + return; + + wa_init_start(wal, name); + + if (IS_GEN(i915, 11)) + icl_ctx_workarounds_init(engine, wal); + else if (IS_CANNONLAKE(i915)) + cnl_ctx_workarounds_init(engine, wal); + else if (IS_COFFEELAKE(i915)) + cfl_ctx_workarounds_init(engine, wal); + else if (IS_GEMINILAKE(i915)) + glk_ctx_workarounds_init(engine, wal); + else if (IS_KABYLAKE(i915)) + kbl_ctx_workarounds_init(engine, wal); + else if (IS_BROXTON(i915)) + bxt_ctx_workarounds_init(engine, wal); + else if (IS_SKYLAKE(i915)) + skl_ctx_workarounds_init(engine, wal); + else if (IS_CHERRYVIEW(i915)) + chv_ctx_workarounds_init(engine, wal); + else if (IS_BROADWELL(i915)) + bdw_ctx_workarounds_init(engine, wal); + else if (INTEL_GEN(i915) < 8) + return; + else + MISSING_CASE(INTEL_GEN(i915)); + + wa_init_finish(wal); +} + +void intel_engine_init_ctx_wa(struct intel_engine_cs *engine) +{ + __intel_engine_init_ctx_wa(engine, &engine->ctx_wa_list, "context"); +} + +int intel_engine_emit_ctx_wa(struct i915_request *rq) +{ + struct i915_wa_list *wal = &rq->engine->ctx_wa_list; + struct i915_wa *wa; + unsigned int i; + u32 *cs; + int ret; + + if (wal->count == 0) + return 0; + + ret = rq->engine->emit_flush(rq, EMIT_BARRIER); + if (ret) + return ret; + + cs = intel_ring_begin(rq, (wal->count * 2 + 2)); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + *cs++ = MI_LOAD_REGISTER_IMM(wal->count); + for (i = 0, wa = wal->list; i < wal->count; i++, wa++) { + *cs++ = i915_mmio_reg_offset(wa->reg); + *cs++ = wa->val; + } + *cs++ = MI_NOOP; + + intel_ring_advance(rq, cs); + + ret = rq->engine->emit_flush(rq, EMIT_BARRIER); + if (ret) + return ret; + + return 0; +} + +static void +gen9_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal) +{ + /* WaDisableKillLogic:bxt,skl,kbl */ + if (!IS_COFFEELAKE(i915)) + wa_write_or(wal, + GAM_ECOCHK, + ECOCHK_DIS_TLB); + + if (HAS_LLC(i915)) { + /* WaCompressedResourceSamplerPbeMediaNewHashMode:skl,kbl + * + * Must match Display Engine. See + * WaCompressedResourceDisplayNewHashMode. 
+ */ + wa_write_or(wal, + MMCD_MISC_CTRL, + MMCD_PCLA | MMCD_HOTSPOT_EN); + } + + /* WaDisableHDCInvalidation:skl,bxt,kbl,cfl */ + wa_write_or(wal, + GAM_ECOCHK, + BDW_DISABLE_HDC_INVALIDATION); +} + +static void +skl_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal) +{ + gen9_gt_workarounds_init(i915, wal); + + /* WaDisableGafsUnitClkGating:skl */ + wa_write_or(wal, + GEN7_UCGCTL4, + GEN8_EU_GAUNIT_CLOCK_GATE_DISABLE); + + /* WaInPlaceDecompressionHang:skl */ + if (IS_SKL_REVID(i915, SKL_REVID_H0, REVID_FOREVER)) + wa_write_or(wal, + GEN9_GAMT_ECO_REG_RW_IA, + GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS); +} + +static void +bxt_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal) +{ + gen9_gt_workarounds_init(i915, wal); + + /* WaInPlaceDecompressionHang:bxt */ + wa_write_or(wal, + GEN9_GAMT_ECO_REG_RW_IA, + GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS); +} + +static void +kbl_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal) +{ + gen9_gt_workarounds_init(i915, wal); + + /* WaDisableDynamicCreditSharing:kbl */ + if (IS_KBL_REVID(i915, 0, KBL_REVID_B0)) + wa_write_or(wal, + GAMT_CHKN_BIT_REG, + GAMT_CHKN_DISABLE_DYNAMIC_CREDIT_SHARING); + + /* WaDisableGafsUnitClkGating:kbl */ + wa_write_or(wal, + GEN7_UCGCTL4, + GEN8_EU_GAUNIT_CLOCK_GATE_DISABLE); + + /* WaInPlaceDecompressionHang:kbl */ + wa_write_or(wal, + GEN9_GAMT_ECO_REG_RW_IA, + GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS); +} + +static void +glk_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal) +{ + gen9_gt_workarounds_init(i915, wal); +} + +static void +cfl_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal) +{ + gen9_gt_workarounds_init(i915, wal); + + /* WaDisableGafsUnitClkGating:cfl */ + wa_write_or(wal, + GEN7_UCGCTL4, + GEN8_EU_GAUNIT_CLOCK_GATE_DISABLE); + + /* WaInPlaceDecompressionHang:cfl */ + wa_write_or(wal, + GEN9_GAMT_ECO_REG_RW_IA, + GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS); +} + +static void +wa_init_mcr(struct drm_i915_private *i915, struct i915_wa_list *wal) +{ + const struct sseu_dev_info *sseu = &RUNTIME_INFO(i915)->sseu; + u32 mcr_slice_subslice_mask; + + /* + * WaProgramMgsrForL3BankSpecificMmioReads: cnl,icl + * L3Banks could be fused off in single slice scenario. If that is + * the case, we might need to program MCR select to a valid L3Bank + * by default, to make sure we correctly read certain registers + * later on (in the range 0xB100 - 0xB3FF). + * This might be incompatible with + * WaProgramMgsrForCorrectSliceSpecificMmioReads. + * Fortunately, this should not happen in production hardware, so + * we only assert that this is the case (instead of implementing + * something more complex that requires checking the range of every + * MMIO read). 
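+ * Note that is_power_of_2() on the slice mask below is the single
+ * slice check: exactly one bit set means only one slice is enabled.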
+ */ + if (INTEL_GEN(i915) >= 10 && + is_power_of_2(sseu->slice_mask)) { + /* + * read FUSE3 for enabled L3 Bank IDs, if L3 Bank matches + * enabled subslice, no need to redirect MCR packet + */ + u32 slice = fls(sseu->slice_mask); + u32 fuse3 = + intel_uncore_read(&i915->uncore, GEN10_MIRROR_FUSE3); + u8 ss_mask = sseu->subslice_mask[slice]; + + u8 enabled_mask = (ss_mask | ss_mask >> + GEN10_L3BANK_PAIR_COUNT) & GEN10_L3BANK_MASK; + u8 disabled_mask = fuse3 & GEN10_L3BANK_MASK; + + /* + * Production silicon should have matched L3Bank and + * subslice enabled + */ + WARN_ON((enabled_mask & disabled_mask) != enabled_mask); + } + + if (INTEL_GEN(i915) >= 11) + mcr_slice_subslice_mask = GEN11_MCR_SLICE_MASK | + GEN11_MCR_SUBSLICE_MASK; + else + mcr_slice_subslice_mask = GEN8_MCR_SLICE_MASK | + GEN8_MCR_SUBSLICE_MASK; + /* + * WaProgramMgsrForCorrectSliceSpecificMmioReads:cnl,icl + * Before any MMIO read into slice/subslice specific registers, MCR + * packet control register needs to be programmed to point to any + * enabled s/ss pair. Otherwise, incorrect values will be returned. + * This means each subsequent MMIO read will be forwarded to an + * specific s/ss combination, but this is OK since these registers + * are consistent across s/ss in almost all cases. In the rare + * occasions, such as INSTDONE, where this value is dependent + * on s/ss combo, the read should be done with read_subslice_reg. + */ + wa_write_masked_or(wal, + GEN8_MCR_SELECTOR, + mcr_slice_subslice_mask, + intel_calculate_mcr_s_ss_select(i915)); +} + +static void +cnl_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal) +{ + wa_init_mcr(i915, wal); + + /* WaDisableI2mCycleOnWRPort:cnl (pre-prod) */ + if (IS_CNL_REVID(i915, CNL_REVID_B0, CNL_REVID_B0)) + wa_write_or(wal, + GAMT_CHKN_BIT_REG, + GAMT_CHKN_DISABLE_I2M_CYCLE_ON_WR_PORT); + + /* WaInPlaceDecompressionHang:cnl */ + wa_write_or(wal, + GEN9_GAMT_ECO_REG_RW_IA, + GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS); +} + +static void +icl_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal) +{ + wa_init_mcr(i915, wal); + + /* WaInPlaceDecompressionHang:icl */ + wa_write_or(wal, + GEN9_GAMT_ECO_REG_RW_IA, + GAMT_ECO_ENABLE_IN_PLACE_DECOMPRESS); + + /* WaModifyGamTlbPartitioning:icl */ + wa_write_masked_or(wal, + GEN11_GACB_PERF_CTRL, + GEN11_HASH_CTRL_MASK, + GEN11_HASH_CTRL_BIT0 | GEN11_HASH_CTRL_BIT4); + + /* Wa_1405766107:icl + * Formerly known as WaCL2SFHalfMaxAlloc + */ + wa_write_or(wal, + GEN11_LSN_UNSLCVC, + GEN11_LSN_UNSLCVC_GAFS_HALF_SF_MAXALLOC | + GEN11_LSN_UNSLCVC_GAFS_HALF_CL2_MAXALLOC); + + /* Wa_220166154:icl + * Formerly known as WaDisCtxReload + */ + wa_write_or(wal, + GEN8_GAMW_ECO_DEV_RW_IA, + GAMW_ECO_DEV_CTX_RELOAD_DISABLE); + + /* Wa_1405779004:icl (pre-prod) */ + if (IS_ICL_REVID(i915, ICL_REVID_A0, ICL_REVID_A0)) + wa_write_or(wal, + SLICE_UNIT_LEVEL_CLKGATE, + MSCUNIT_CLKGATE_DIS); + + /* Wa_1406680159:icl */ + wa_write_or(wal, + SUBSLICE_UNIT_LEVEL_CLKGATE, + GWUNIT_CLKGATE_DIS); + + /* Wa_1406838659:icl (pre-prod) */ + if (IS_ICL_REVID(i915, ICL_REVID_A0, ICL_REVID_B0)) + wa_write_or(wal, + INF_UNIT_LEVEL_CLKGATE, + CGPSF_CLKGATE_DIS); + + /* Wa_1406463099:icl + * Formerly known as WaGamTlbPendError + */ + wa_write_or(wal, + GAMT_CHKN_BIT_REG, + GAMT_CHKN_DISABLE_L3_COH_PIPE); +} + +static void +gt_init_workarounds(struct drm_i915_private *i915, struct i915_wa_list *wal) +{ + if (IS_GEN(i915, 11)) + icl_gt_workarounds_init(i915, wal); + else if (IS_CANNONLAKE(i915)) + cnl_gt_workarounds_init(i915, wal); 
+ else if (IS_COFFEELAKE(i915)) + cfl_gt_workarounds_init(i915, wal); + else if (IS_GEMINILAKE(i915)) + glk_gt_workarounds_init(i915, wal); + else if (IS_KABYLAKE(i915)) + kbl_gt_workarounds_init(i915, wal); + else if (IS_BROXTON(i915)) + bxt_gt_workarounds_init(i915, wal); + else if (IS_SKYLAKE(i915)) + skl_gt_workarounds_init(i915, wal); + else if (INTEL_GEN(i915) <= 8) + return; + else + MISSING_CASE(INTEL_GEN(i915)); +} + +void intel_gt_init_workarounds(struct drm_i915_private *i915) +{ + struct i915_wa_list *wal = &i915->gt_wa_list; + + wa_init_start(wal, "GT"); + gt_init_workarounds(i915, wal); + wa_init_finish(wal); +} + +static enum forcewake_domains +wal_get_fw_for_rmw(struct intel_uncore *uncore, const struct i915_wa_list *wal) +{ + enum forcewake_domains fw = 0; + struct i915_wa *wa; + unsigned int i; + + for (i = 0, wa = wal->list; i < wal->count; i++, wa++) + fw |= intel_uncore_forcewake_for_reg(uncore, + wa->reg, + FW_REG_READ | + FW_REG_WRITE); + + return fw; +} + +static bool +wa_verify(const struct i915_wa *wa, u32 cur, const char *name, const char *from) +{ + if ((cur ^ wa->val) & wa->read) { + DRM_ERROR("%s workaround lost on %s! (%x=%x/%x, expected %x, mask=%x)\n", + name, from, i915_mmio_reg_offset(wa->reg), + cur, cur & wa->read, + wa->val, wa->mask); + + return false; + } + + return true; +} + +static void +wa_list_apply(struct intel_uncore *uncore, const struct i915_wa_list *wal) +{ + enum forcewake_domains fw; + unsigned long flags; + struct i915_wa *wa; + unsigned int i; + + if (!wal->count) + return; + + fw = wal_get_fw_for_rmw(uncore, wal); + + spin_lock_irqsave(&uncore->lock, flags); + intel_uncore_forcewake_get__locked(uncore, fw); + + for (i = 0, wa = wal->list; i < wal->count; i++, wa++) { + intel_uncore_rmw_fw(uncore, wa->reg, wa->mask, wa->val); + if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) + wa_verify(wa, + intel_uncore_read_fw(uncore, wa->reg), + wal->name, "application"); + } + + intel_uncore_forcewake_put__locked(uncore, fw); + spin_unlock_irqrestore(&uncore->lock, flags); +} + +void intel_gt_apply_workarounds(struct drm_i915_private *i915) +{ + wa_list_apply(&i915->uncore, &i915->gt_wa_list); +} + +static bool wa_list_verify(struct intel_uncore *uncore, + const struct i915_wa_list *wal, + const char *from) +{ + struct i915_wa *wa; + unsigned int i; + bool ok = true; + + for (i = 0, wa = wal->list; i < wal->count; i++, wa++) + ok &= wa_verify(wa, + intel_uncore_read(uncore, wa->reg), + wal->name, from); + + return ok; +} + +bool intel_gt_verify_workarounds(struct drm_i915_private *i915, + const char *from) +{ + return wa_list_verify(&i915->uncore, &i915->gt_wa_list, from); +} + +static void +whitelist_reg_ext(struct i915_wa_list *wal, i915_reg_t reg, u32 flags) +{ + struct i915_wa wa = { + .reg = reg + }; + + if (GEM_DEBUG_WARN_ON(wal->count >= RING_MAX_NONPRIV_SLOTS)) + return; + + wa.reg.reg |= flags; + _wa_add(wal, &wa); +} + +static void +whitelist_reg(struct i915_wa_list *wal, i915_reg_t reg) +{ + whitelist_reg_ext(wal, reg, RING_FORCE_TO_NONPRIV_RW); +} + +static void gen9_whitelist_build(struct i915_wa_list *w) +{ + /* WaVFEStateAfterPipeControlwithMediaStateClear:skl,bxt,glk,cfl */ + whitelist_reg(w, GEN9_CTX_PREEMPT_REG); + + /* WaEnablePreemptionGranularityControlByUMD:skl,bxt,kbl,cfl,[cnl] */ + whitelist_reg(w, GEN8_CS_CHICKEN1); + + /* WaAllowUMDToModifyHDCChicken1:skl,bxt,kbl,glk,cfl */ + whitelist_reg(w, GEN8_HDC_CHICKEN1); +} + +static void skl_whitelist_build(struct intel_engine_cs *engine) +{ + struct i915_wa_list *w = 
&engine->whitelist; + + if (engine->class != RENDER_CLASS) + return; + + gen9_whitelist_build(w); + + /* WaDisableLSQCROPERFforOCL:skl */ + whitelist_reg(w, GEN8_L3SQCREG4); +} + +static void bxt_whitelist_build(struct intel_engine_cs *engine) +{ + if (engine->class != RENDER_CLASS) + return; + + gen9_whitelist_build(&engine->whitelist); +} + +static void kbl_whitelist_build(struct intel_engine_cs *engine) +{ + struct i915_wa_list *w = &engine->whitelist; + + if (engine->class != RENDER_CLASS) + return; + + gen9_whitelist_build(w); + + /* WaDisableLSQCROPERFforOCL:kbl */ + whitelist_reg(w, GEN8_L3SQCREG4); +} + +static void glk_whitelist_build(struct intel_engine_cs *engine) +{ + struct i915_wa_list *w = &engine->whitelist; + + if (engine->class != RENDER_CLASS) + return; + + gen9_whitelist_build(w); + + /* WA #0862: Userspace has to set "Barrier Mode" to avoid hangs. */ + whitelist_reg(w, GEN9_SLICE_COMMON_ECO_CHICKEN1); +} + +static void cfl_whitelist_build(struct intel_engine_cs *engine) +{ + if (engine->class != RENDER_CLASS) + return; + + gen9_whitelist_build(&engine->whitelist); +} + +static void cnl_whitelist_build(struct intel_engine_cs *engine) +{ + struct i915_wa_list *w = &engine->whitelist; + + if (engine->class != RENDER_CLASS) + return; + + /* WaEnablePreemptionGranularityControlByUMD:cnl */ + whitelist_reg(w, GEN8_CS_CHICKEN1); +} + +static void icl_whitelist_build(struct intel_engine_cs *engine) +{ + struct i915_wa_list *w = &engine->whitelist; + + switch (engine->class) { + case RENDER_CLASS: + /* WaAllowUMDToModifyHalfSliceChicken7:icl */ + whitelist_reg(w, GEN9_HALF_SLICE_CHICKEN7); + + /* WaAllowUMDToModifySamplerMode:icl */ + whitelist_reg(w, GEN10_SAMPLER_MODE); + + /* WaEnableStateCacheRedirectToCS:icl */ + whitelist_reg(w, GEN9_SLICE_COMMON_ECO_CHICKEN1); + break; + + case VIDEO_DECODE_CLASS: + /* hucStatusRegOffset */ + whitelist_reg_ext(w, _MMIO(0x2000 + engine->mmio_base), + RING_FORCE_TO_NONPRIV_RD); + /* hucUKernelHdrInfoRegOffset */ + whitelist_reg_ext(w, _MMIO(0x2014 + engine->mmio_base), + RING_FORCE_TO_NONPRIV_RD); + /* hucStatus2RegOffset */ + whitelist_reg_ext(w, _MMIO(0x23B0 + engine->mmio_base), + RING_FORCE_TO_NONPRIV_RD); + break; + + default: + break; + } +} + +void intel_engine_init_whitelist(struct intel_engine_cs *engine) +{ + struct drm_i915_private *i915 = engine->i915; + struct i915_wa_list *w = &engine->whitelist; + + wa_init_start(w, "whitelist"); + + if (IS_GEN(i915, 11)) + icl_whitelist_build(engine); + else if (IS_CANNONLAKE(i915)) + cnl_whitelist_build(engine); + else if (IS_COFFEELAKE(i915)) + cfl_whitelist_build(engine); + else if (IS_GEMINILAKE(i915)) + glk_whitelist_build(engine); + else if (IS_KABYLAKE(i915)) + kbl_whitelist_build(engine); + else if (IS_BROXTON(i915)) + bxt_whitelist_build(engine); + else if (IS_SKYLAKE(i915)) + skl_whitelist_build(engine); + else if (INTEL_GEN(i915) <= 8) + return; + else + MISSING_CASE(INTEL_GEN(i915)); + + wa_init_finish(w); +} + +void intel_engine_apply_whitelist(struct intel_engine_cs *engine) +{ + const struct i915_wa_list *wal = &engine->whitelist; + struct intel_uncore *uncore = engine->uncore; + const u32 base = engine->mmio_base; + struct i915_wa *wa; + unsigned int i; + + if (!wal->count) + return; + + for (i = 0, wa = wal->list; i < wal->count; i++, wa++) + intel_uncore_write(uncore, + RING_FORCE_TO_NONPRIV(base, i), + i915_mmio_reg_offset(wa->reg)); + + /* And clear the rest just in case of garbage */ + for (; i < RING_MAX_NONPRIV_SLOTS; i++) + intel_uncore_write(uncore, + 
RING_FORCE_TO_NONPRIV(base, i), + i915_mmio_reg_offset(RING_NOPID(base))); +} + +static void +rcs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal) +{ + struct drm_i915_private *i915 = engine->i915; + + if (IS_GEN(i915, 11)) { + /* This is not a Wa. Enable for better image quality */ + wa_masked_en(wal, + _3D_CHICKEN3, + _3D_CHICKEN3_AA_LINE_QUALITY_FIX_ENABLE); + + /* WaPipelineFlushCoherentLines:icl */ + ignore_wa_write_or(wal, + GEN8_L3SQCREG4, + GEN8_LQSC_FLUSH_COHERENT_LINES, + GEN8_LQSC_FLUSH_COHERENT_LINES); + + /* + * Wa_1405543622:icl + * Formerly known as WaGAPZPriorityScheme + */ + wa_write_or(wal, + GEN8_GARBCNTL, + GEN11_ARBITRATION_PRIO_ORDER_MASK); + + /* + * Wa_1604223664:icl + * Formerly known as WaL3BankAddressHashing + */ + wa_write_masked_or(wal, + GEN8_GARBCNTL, + GEN11_HASH_CTRL_EXCL_MASK, + GEN11_HASH_CTRL_EXCL_BIT0); + wa_write_masked_or(wal, + GEN11_GLBLINVL, + GEN11_BANK_HASH_ADDR_EXCL_MASK, + GEN11_BANK_HASH_ADDR_EXCL_BIT0); + + /* + * Wa_1405733216:icl + * Formerly known as WaDisableCleanEvicts + */ + ignore_wa_write_or(wal, + GEN8_L3SQCREG4, + GEN11_LQSC_CLEAN_EVICT_DISABLE, + GEN11_LQSC_CLEAN_EVICT_DISABLE); + + /* WaForwardProgressSoftReset:icl */ + wa_write_or(wal, + GEN10_SCRATCH_LNCF2, + PMFLUSHDONE_LNICRSDROP | + PMFLUSH_GAPL3UNBLOCK | + PMFLUSHDONE_LNEBLK); + + /* Wa_1406609255:icl (pre-prod) */ + if (IS_ICL_REVID(i915, ICL_REVID_A0, ICL_REVID_B0)) + wa_write_or(wal, + GEN7_SARCHKMD, + GEN7_DISABLE_DEMAND_PREFETCH | + GEN7_DISABLE_SAMPLER_PREFETCH); + } + + if (IS_GEN_RANGE(i915, 9, 11)) { + /* FtrPerCtxtPreemptionGranularityControl:skl,bxt,kbl,cfl,cnl,icl */ + wa_masked_en(wal, + GEN7_FF_SLICE_CS_CHICKEN1, + GEN9_FFSC_PERCTX_PREEMPT_CTRL); + } + + if (IS_SKYLAKE(i915) || IS_KABYLAKE(i915) || IS_COFFEELAKE(i915)) { + /* WaEnableGapsTsvCreditFix:skl,kbl,cfl */ + wa_write_or(wal, + GEN8_GARBCNTL, + GEN9_GAPS_TSV_CREDIT_DISABLE); + } + + if (IS_BROXTON(i915)) { + /* WaDisablePooledEuLoadBalancingFix:bxt */ + wa_masked_en(wal, + FF_SLICE_CS_CHICKEN2, + GEN9_POOLED_EU_LOAD_BALANCING_FIX_DISABLE); + } + + if (IS_GEN(i915, 9)) { + /* WaContextSwitchWithConcurrentTLBInvalidate:skl,bxt,kbl,glk,cfl */ + wa_masked_en(wal, + GEN9_CSFE_CHICKEN1_RCS, + GEN9_PREEMPT_GPGPU_SYNC_SWITCH_DISABLE); + + /* WaEnableLbsSlaRetryTimerDecrement:skl,bxt,kbl,glk,cfl */ + wa_write_or(wal, + BDW_SCRATCH1, + GEN9_LBS_SLA_RETRY_TIMER_DECREMENT_ENABLE); + + /* WaProgramL3SqcReg1DefaultForPerf:bxt,glk */ + if (IS_GEN9_LP(i915)) + wa_write_masked_or(wal, + GEN8_L3SQCREG1, + L3_PRIO_CREDITS_MASK, + L3_GENERAL_PRIO_CREDITS(62) | + L3_HIGH_PRIO_CREDITS(2)); + + /* WaOCLCoherentLineFlush:skl,bxt,kbl,cfl */ + wa_write_or(wal, + GEN8_L3SQCREG4, + GEN8_LQSC_FLUSH_COHERENT_LINES); + } +} + +static void +xcs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal) +{ + struct drm_i915_private *i915 = engine->i915; + + /* WaKBLVECSSemaphoreWaitPoll:kbl */ + if (IS_KBL_REVID(i915, KBL_REVID_A0, KBL_REVID_E0)) { + wa_write(wal, + RING_SEMA_WAIT_POLL(engine->mmio_base), + 1); + } +} + +static void +engine_init_workarounds(struct intel_engine_cs *engine, struct i915_wa_list *wal) +{ + if (I915_SELFTEST_ONLY(INTEL_GEN(engine->i915) < 8)) + return; + + if (engine->id == RCS0) + rcs_engine_wa_init(engine, wal); + else + xcs_engine_wa_init(engine, wal); +} + +void intel_engine_init_workarounds(struct intel_engine_cs *engine) +{ + struct i915_wa_list *wal = &engine->wa_list; + + if (GEM_WARN_ON(INTEL_GEN(engine->i915) < 8)) + return; + + wa_init_start(wal,
engine->name); + engine_init_workarounds(engine, wal); + wa_init_finish(wal); +} + +void intel_engine_apply_workarounds(struct intel_engine_cs *engine) +{ + wa_list_apply(engine->uncore, &engine->wa_list); +} + +static struct i915_vma * +create_scratch(struct i915_address_space *vm, int count) +{ + struct drm_i915_gem_object *obj; + struct i915_vma *vma; + unsigned int size; + int err; + + size = round_up(count * sizeof(u32), PAGE_SIZE); + obj = i915_gem_object_create_internal(vm->i915, size); + if (IS_ERR(obj)) + return ERR_CAST(obj); + + i915_gem_object_set_cache_coherency(obj, I915_CACHE_LLC); + + vma = i915_vma_instance(obj, vm, NULL); + if (IS_ERR(vma)) { + err = PTR_ERR(vma); + goto err_obj; + } + + err = i915_vma_pin(vma, 0, 0, + i915_vma_is_ggtt(vma) ? PIN_GLOBAL : PIN_USER); + if (err) + goto err_obj; + + return vma; + +err_obj: + i915_gem_object_put(obj); + return ERR_PTR(err); +} + +static int +wa_list_srm(struct i915_request *rq, + const struct i915_wa_list *wal, + struct i915_vma *vma) +{ + const struct i915_wa *wa; + unsigned int i; + u32 srm, *cs; + + srm = MI_STORE_REGISTER_MEM | MI_SRM_LRM_GLOBAL_GTT; + if (INTEL_GEN(rq->i915) >= 8) + srm++; + + cs = intel_ring_begin(rq, 4 * wal->count); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + for (i = 0, wa = wal->list; i < wal->count; i++, wa++) { + *cs++ = srm; + *cs++ = i915_mmio_reg_offset(wa->reg); + *cs++ = i915_ggtt_offset(vma) + sizeof(u32) * i; + *cs++ = 0; + } + intel_ring_advance(rq, cs); + + return 0; +} + +static int engine_wa_list_verify(struct intel_context *ce, + const struct i915_wa_list * const wal, + const char *from) +{ + const struct i915_wa *wa; + struct i915_request *rq; + struct i915_vma *vma; + unsigned int i; + u32 *results; + int err; + + if (!wal->count) + return 0; + + vma = create_scratch(&ce->engine->i915->ggtt.vm, wal->count); + if (IS_ERR(vma)) + return PTR_ERR(vma); + + rq = intel_context_create_request(ce); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto err_vma; + } + + err = wa_list_srm(rq, wal, vma); + if (err) + goto err_vma; + + i915_request_add(rq); + if (i915_request_wait(rq, 0, HZ / 5) < 0) { + err = -ETIME; + goto err_vma; + } + + results = i915_gem_object_pin_map(vma->obj, I915_MAP_WB); + if (IS_ERR(results)) { + err = PTR_ERR(results); + goto err_vma; + } + + err = 0; + for (i = 0, wa = wal->list; i < wal->count; i++, wa++) + if (!wa_verify(wa, results[i], wal->name, from)) + err = -ENXIO; + + i915_gem_object_unpin_map(vma->obj); + +err_vma: + i915_vma_unpin(vma); + i915_vma_put(vma); + return err; +} + +int intel_engine_verify_workarounds(struct intel_engine_cs *engine, + const char *from) +{ + return engine_wa_list_verify(engine->kernel_context, + &engine->wa_list, + from); +} + +#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) +#include "selftest_workarounds.c" +#endif diff --git a/drivers/gpu/drm/i915/gt/intel_workarounds.h b/drivers/gpu/drm/i915/gt/intel_workarounds.h new file mode 100644 index 000000000000..3761a6ee58bb --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_workarounds.h @@ -0,0 +1,40 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2014-2018 Intel Corporation + */ + +#ifndef _INTEL_WORKAROUNDS_H_ +#define _INTEL_WORKAROUNDS_H_ + +#include <linux/slab.h> + +#include "intel_workarounds_types.h" + +struct drm_i915_private; +struct i915_request; +struct intel_engine_cs; + +static inline void intel_wa_list_free(struct i915_wa_list *wal) +{ + kfree(wal->list); + memset(wal, 0, sizeof(*wal)); +} + +void intel_engine_init_ctx_wa(struct intel_engine_cs *engine); +int 
intel_engine_emit_ctx_wa(struct i915_request *rq); + +void intel_gt_init_workarounds(struct drm_i915_private *i915); +void intel_gt_apply_workarounds(struct drm_i915_private *i915); +bool intel_gt_verify_workarounds(struct drm_i915_private *i915, + const char *from); + +void intel_engine_init_whitelist(struct intel_engine_cs *engine); +void intel_engine_apply_whitelist(struct intel_engine_cs *engine); + +void intel_engine_init_workarounds(struct intel_engine_cs *engine); +void intel_engine_apply_workarounds(struct intel_engine_cs *engine); +int intel_engine_verify_workarounds(struct intel_engine_cs *engine, + const char *from); + +#endif diff --git a/drivers/gpu/drm/i915/gt/intel_workarounds_types.h b/drivers/gpu/drm/i915/gt/intel_workarounds_types.h new file mode 100644 index 000000000000..42ac1fb99572 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/intel_workarounds_types.h @@ -0,0 +1,28 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2014-2018 Intel Corporation + */ + +#ifndef __INTEL_WORKAROUNDS_TYPES_H__ +#define __INTEL_WORKAROUNDS_TYPES_H__ + +#include <linux/types.h> + +#include "i915_reg.h" + +struct i915_wa { + i915_reg_t reg; + u32 mask; + u32 val; + u32 read; +}; + +struct i915_wa_list { + const char *name; + struct i915_wa *list; + unsigned int count; + unsigned int wa_count; +}; + +#endif /* __INTEL_WORKAROUNDS_TYPES_H__ */ diff --git a/drivers/gpu/drm/i915/gt/mock_engine.c b/drivers/gpu/drm/i915/gt/mock_engine.c new file mode 100644 index 000000000000..086801b51441 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/mock_engine.c @@ -0,0 +1,340 @@ +/* + * Copyright © 2016 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ * + */ + +#include "gem/i915_gem_context.h" + +#include "i915_drv.h" +#include "intel_context.h" +#include "intel_engine_pm.h" + +#include "mock_engine.h" +#include "selftests/mock_request.h" + +struct mock_ring { + struct intel_ring base; + struct i915_timeline timeline; +}; + +static void mock_timeline_pin(struct i915_timeline *tl) +{ + tl->pin_count++; +} + +static void mock_timeline_unpin(struct i915_timeline *tl) +{ + GEM_BUG_ON(!tl->pin_count); + tl->pin_count--; +} + +static struct intel_ring *mock_ring(struct intel_engine_cs *engine) +{ + const unsigned long sz = PAGE_SIZE / 2; + struct mock_ring *ring; + + ring = kzalloc(sizeof(*ring) + sz, GFP_KERNEL); + if (!ring) + return NULL; + + if (i915_timeline_init(engine->i915, &ring->timeline, NULL)) { + kfree(ring); + return NULL; + } + + kref_init(&ring->base.ref); + ring->base.size = sz; + ring->base.effective_size = sz; + ring->base.vaddr = (void *)(ring + 1); + ring->base.timeline = &ring->timeline; + + INIT_LIST_HEAD(&ring->base.request_list); + intel_ring_update_space(&ring->base); + + return &ring->base; +} + +static void mock_ring_free(struct intel_ring *base) +{ + struct mock_ring *ring = container_of(base, typeof(*ring), base); + + i915_timeline_fini(&ring->timeline); + kfree(ring); +} + +static struct i915_request *first_request(struct mock_engine *engine) +{ + return list_first_entry_or_null(&engine->hw_queue, + struct i915_request, + mock.link); +} + +static void advance(struct i915_request *request) +{ + list_del_init(&request->mock.link); + i915_request_mark_complete(request); + GEM_BUG_ON(!i915_request_completed(request)); + + intel_engine_queue_breadcrumbs(request->engine); +} + +static void hw_delay_complete(struct timer_list *t) +{ + struct mock_engine *engine = from_timer(engine, t, hw_delay); + struct i915_request *request; + unsigned long flags; + + spin_lock_irqsave(&engine->hw_lock, flags); + + /* Timer fired, first request is complete */ + request = first_request(engine); + if (request) + advance(request); + + /* + * Also immediately signal any subsequent 0-delay requests, but + * requeue the timer for the next delayed request. 
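+ * The mock queue is strictly FIFO, so requests are always completed in submission order.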
+ */ + while ((request = first_request(engine))) { + if (request->mock.delay) { + mod_timer(&engine->hw_delay, + jiffies + request->mock.delay); + break; + } + + advance(request); + } + + spin_unlock_irqrestore(&engine->hw_lock, flags); +} + +static void mock_context_unpin(struct intel_context *ce) +{ + mock_timeline_unpin(ce->ring->timeline); +} + +static void mock_context_destroy(struct kref *ref) +{ + struct intel_context *ce = container_of(ref, typeof(*ce), ref); + + GEM_BUG_ON(intel_context_is_pinned(ce)); + + if (ce->ring) + mock_ring_free(ce->ring); + + intel_context_free(ce); +} + +static int mock_context_pin(struct intel_context *ce) +{ + int ret; + + if (!ce->ring) { + ce->ring = mock_ring(ce->engine); + if (!ce->ring) + return -ENOMEM; + } + + ret = intel_context_active_acquire(ce, PIN_HIGH); + if (ret) + return ret; + + mock_timeline_pin(ce->ring->timeline); + return 0; +} + +static const struct intel_context_ops mock_context_ops = { + .pin = mock_context_pin, + .unpin = mock_context_unpin, + + .enter = intel_context_enter_engine, + .exit = intel_context_exit_engine, + + .destroy = mock_context_destroy, +}; + +static int mock_request_alloc(struct i915_request *request) +{ + INIT_LIST_HEAD(&request->mock.link); + request->mock.delay = 0; + + return 0; +} + +static int mock_emit_flush(struct i915_request *request, + unsigned int flags) +{ + return 0; +} + +static u32 *mock_emit_breadcrumb(struct i915_request *request, u32 *cs) +{ + return cs; +} + +static void mock_submit_request(struct i915_request *request) +{ + struct mock_engine *engine = + container_of(request->engine, typeof(*engine), base); + unsigned long flags; + + i915_request_submit(request); + + spin_lock_irqsave(&engine->hw_lock, flags); + list_add_tail(&request->mock.link, &engine->hw_queue); + if (list_is_first(&request->mock.link, &engine->hw_queue)) { + if (request->mock.delay) + mod_timer(&engine->hw_delay, + jiffies + request->mock.delay); + else + advance(request); + } + spin_unlock_irqrestore(&engine->hw_lock, flags); +} + +static void mock_reset_prepare(struct intel_engine_cs *engine) +{ +} + +static void mock_reset(struct intel_engine_cs *engine, bool stalled) +{ + GEM_BUG_ON(stalled); +} + +static void mock_reset_finish(struct intel_engine_cs *engine) +{ +} + +static void mock_cancel_requests(struct intel_engine_cs *engine) +{ + struct i915_request *request; + unsigned long flags; + + spin_lock_irqsave(&engine->active.lock, flags); + + /* Mark all submitted requests as skipped. 
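+ * Requests that have not yet signalled get -EIO set on their fence before being marked complete.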
*/ + list_for_each_entry(request, &engine->active.requests, sched.link) { + if (!i915_request_signaled(request)) + dma_fence_set_error(&request->fence, -EIO); + + i915_request_mark_complete(request); + } + + spin_unlock_irqrestore(&engine->active.lock, flags); +} + +struct intel_engine_cs *mock_engine(struct drm_i915_private *i915, + const char *name, + int id) +{ + struct mock_engine *engine; + + GEM_BUG_ON(id >= I915_NUM_ENGINES); + + engine = kzalloc(sizeof(*engine) + PAGE_SIZE, GFP_KERNEL); + if (!engine) + return NULL; + + /* minimal engine setup for requests */ + engine->base.i915 = i915; + snprintf(engine->base.name, sizeof(engine->base.name), "%s", name); + engine->base.id = id; + engine->base.mask = BIT(id); + engine->base.status_page.addr = (void *)(engine + 1); + + engine->base.cops = &mock_context_ops; + engine->base.request_alloc = mock_request_alloc; + engine->base.emit_flush = mock_emit_flush; + engine->base.emit_fini_breadcrumb = mock_emit_breadcrumb; + engine->base.submit_request = mock_submit_request; + + engine->base.reset.prepare = mock_reset_prepare; + engine->base.reset.reset = mock_reset; + engine->base.reset.finish = mock_reset_finish; + engine->base.cancel_requests = mock_cancel_requests; + + /* fake hw queue */ + spin_lock_init(&engine->hw_lock); + timer_setup(&engine->hw_delay, hw_delay_complete, 0); + INIT_LIST_HEAD(&engine->hw_queue); + + return &engine->base; +} + +int mock_engine_init(struct intel_engine_cs *engine) +{ + struct drm_i915_private *i915 = engine->i915; + int err; + + intel_engine_init_active(engine, ENGINE_MOCK); + intel_engine_init_breadcrumbs(engine); + intel_engine_init_execlists(engine); + intel_engine_init__pm(engine); + + engine->kernel_context = + i915_gem_context_get_engine(i915->kernel_context, engine->id); + if (IS_ERR(engine->kernel_context)) + goto err_breadcrumbs; + + err = intel_context_pin(engine->kernel_context); + intel_context_put(engine->kernel_context); + if (err) + goto err_breadcrumbs; + + return 0; + +err_breadcrumbs: + intel_engine_fini_breadcrumbs(engine); + return -ENOMEM; +} + +void mock_engine_flush(struct intel_engine_cs *engine) +{ + struct mock_engine *mock = + container_of(engine, typeof(*mock), base); + struct i915_request *request, *rn; + + del_timer_sync(&mock->hw_delay); + + spin_lock_irq(&mock->hw_lock); + list_for_each_entry_safe(request, rn, &mock->hw_queue, mock.link) + advance(request); + spin_unlock_irq(&mock->hw_lock); +} + +void mock_engine_reset(struct intel_engine_cs *engine) +{ +} + +void mock_engine_free(struct intel_engine_cs *engine) +{ + struct mock_engine *mock = + container_of(engine, typeof(*mock), base); + + GEM_BUG_ON(timer_pending(&mock->hw_delay)); + + intel_context_unpin(engine->kernel_context); + + intel_engine_fini_breadcrumbs(engine); + + kfree(engine); +} diff --git a/drivers/gpu/drm/i915/gt/mock_engine.h b/drivers/gpu/drm/i915/gt/mock_engine.h new file mode 100644 index 000000000000..3f9b698c49d2 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/mock_engine.h @@ -0,0 +1,51 @@ +/* + * Copyright © 2016 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above 
copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + */ + +#ifndef __MOCK_ENGINE_H__ +#define __MOCK_ENGINE_H__ + +#include <linux/list.h> +#include <linux/spinlock.h> +#include <linux/timer.h> + +#include "gt/intel_engine.h" + +struct mock_engine { + struct intel_engine_cs base; + + spinlock_t hw_lock; + struct list_head hw_queue; + struct timer_list hw_delay; +}; + +struct intel_engine_cs *mock_engine(struct drm_i915_private *i915, + const char *name, + int id); +int mock_engine_init(struct intel_engine_cs *engine); + +void mock_engine_flush(struct intel_engine_cs *engine); +void mock_engine_reset(struct intel_engine_cs *engine); +void mock_engine_free(struct intel_engine_cs *engine); + +#endif /* !__MOCK_ENGINE_H__ */ diff --git a/drivers/gpu/drm/i915/gt/selftest_engine_cs.c b/drivers/gpu/drm/i915/gt/selftest_engine_cs.c new file mode 100644 index 000000000000..cfaa6b296835 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/selftest_engine_cs.c @@ -0,0 +1,58 @@ +/* + * SPDX-License-Identifier: GPL-2.0 + * + * Copyright © 2018 Intel Corporation + */ + +#include "../i915_selftest.h" + +static int intel_mmio_bases_check(void *arg) +{ + int i, j; + + for (i = 0; i < ARRAY_SIZE(intel_engines); i++) { + const struct engine_info *info = &intel_engines[i]; + char name[INTEL_ENGINE_CS_MAX_NAME]; + u8 prev = U8_MAX; + + __sprint_engine_name(name, info); + + for (j = 0; j < MAX_MMIO_BASES; j++) { + u8 gen = info->mmio_bases[j].gen; + u32 base = info->mmio_bases[j].base; + + if (gen >= prev) { + pr_err("%s: %s: mmio base for gen %x " + "is before the one for gen %x\n", + __func__, name, prev, gen); + return -EINVAL; + } + + if (gen == 0) + break; + + if (!base) { + pr_err("%s: %s: invalid mmio base (%x) " + "for gen %x at entry %u\n", + __func__, name, base, gen, j); + return -EINVAL; + } + + prev = gen; + } + + pr_info("%s: min gen supported for %s = %d\n", + __func__, name, prev); + } + + return 0; +} + +int intel_engine_cs_mock_selftests(void) +{ + static const struct i915_subtest tests[] = { + SUBTEST(intel_mmio_bases_check), + }; + + return i915_subtests(tests, NULL); +} diff --git a/drivers/gpu/drm/i915/gt/selftest_hangcheck.c b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c new file mode 100644 index 000000000000..1ee4c923044f --- /dev/null +++ b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c @@ -0,0 +1,1763 @@ +/* + * Copyright © 2016 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be 
included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + */ + +#include <linux/kthread.h> + +#include "gem/i915_gem_context.h" +#include "intel_engine_pm.h" + +#include "i915_selftest.h" +#include "selftests/i915_random.h" +#include "selftests/igt_flush_test.h" +#include "selftests/igt_reset.h" +#include "selftests/igt_wedge_me.h" +#include "selftests/igt_atomic.h" + +#include "selftests/mock_drm.h" + +#include "gem/selftests/mock_context.h" +#include "gem/selftests/igt_gem_utils.h" + +#define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */ + +struct hang { + struct drm_i915_private *i915; + struct drm_i915_gem_object *hws; + struct drm_i915_gem_object *obj; + struct i915_gem_context *ctx; + u32 *seqno; + u32 *batch; +}; + +static int hang_init(struct hang *h, struct drm_i915_private *i915) +{ + void *vaddr; + int err; + + memset(h, 0, sizeof(*h)); + h->i915 = i915; + + h->ctx = kernel_context(i915); + if (IS_ERR(h->ctx)) + return PTR_ERR(h->ctx); + + GEM_BUG_ON(i915_gem_context_is_bannable(h->ctx)); + + h->hws = i915_gem_object_create_internal(i915, PAGE_SIZE); + if (IS_ERR(h->hws)) { + err = PTR_ERR(h->hws); + goto err_ctx; + } + + h->obj = i915_gem_object_create_internal(i915, PAGE_SIZE); + if (IS_ERR(h->obj)) { + err = PTR_ERR(h->obj); + goto err_hws; + } + + i915_gem_object_set_cache_coherency(h->hws, I915_CACHE_LLC); + vaddr = i915_gem_object_pin_map(h->hws, I915_MAP_WB); + if (IS_ERR(vaddr)) { + err = PTR_ERR(vaddr); + goto err_obj; + } + h->seqno = memset(vaddr, 0xff, PAGE_SIZE); + + vaddr = i915_gem_object_pin_map(h->obj, + i915_coherent_map_type(i915)); + if (IS_ERR(vaddr)) { + err = PTR_ERR(vaddr); + goto err_unpin_hws; + } + h->batch = vaddr; + + return 0; + +err_unpin_hws: + i915_gem_object_unpin_map(h->hws); +err_obj: + i915_gem_object_put(h->obj); +err_hws: + i915_gem_object_put(h->hws); +err_ctx: + kernel_context_close(h->ctx); + return err; +} + +static u64 hws_address(const struct i915_vma *hws, + const struct i915_request *rq) +{ + return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context); +} + +static int move_to_active(struct i915_vma *vma, + struct i915_request *rq, + unsigned int flags) +{ + int err; + + i915_vma_lock(vma); + err = i915_vma_move_to_active(vma, rq, flags); + i915_vma_unlock(vma); + + return err; +} + +static struct i915_request * +hang_create_request(struct hang *h, struct intel_engine_cs *engine) +{ + struct drm_i915_private *i915 = h->i915; + struct i915_address_space *vm = h->ctx->vm ?: &i915->ggtt.vm; + struct i915_request *rq = NULL; + struct i915_vma *hws, *vma; + unsigned int flags; + u32 *batch; + int err; + + if (i915_gem_object_is_active(h->obj)) { + struct drm_i915_gem_object *obj; + void *vaddr; + + obj = i915_gem_object_create_internal(h->i915, PAGE_SIZE); + if (IS_ERR(obj)) + return ERR_CAST(obj); + + vaddr = i915_gem_object_pin_map(obj, + i915_coherent_map_type(h->i915)); + if (IS_ERR(vaddr)) { + i915_gem_object_put(obj); + return ERR_CAST(vaddr); + } + + i915_gem_object_unpin_map(h->obj); + 
i915_gem_object_put(h->obj); + + h->obj = obj; + h->batch = vaddr; + } + + vma = i915_vma_instance(h->obj, vm, NULL); + if (IS_ERR(vma)) + return ERR_CAST(vma); + + hws = i915_vma_instance(h->hws, vm, NULL); + if (IS_ERR(hws)) + return ERR_CAST(hws); + + err = i915_vma_pin(vma, 0, 0, PIN_USER); + if (err) + return ERR_PTR(err); + + err = i915_vma_pin(hws, 0, 0, PIN_USER); + if (err) + goto unpin_vma; + + rq = igt_request_alloc(h->ctx, engine); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto unpin_hws; + } + + err = move_to_active(vma, rq, 0); + if (err) + goto cancel_rq; + + err = move_to_active(hws, rq, 0); + if (err) + goto cancel_rq; + + batch = h->batch; + if (INTEL_GEN(i915) >= 8) { + *batch++ = MI_STORE_DWORD_IMM_GEN4; + *batch++ = lower_32_bits(hws_address(hws, rq)); + *batch++ = upper_32_bits(hws_address(hws, rq)); + *batch++ = rq->fence.seqno; + *batch++ = MI_ARB_CHECK; + + memset(batch, 0, 1024); + batch += 1024 / sizeof(*batch); + + *batch++ = MI_ARB_CHECK; + *batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1; + *batch++ = lower_32_bits(vma->node.start); + *batch++ = upper_32_bits(vma->node.start); + } else if (INTEL_GEN(i915) >= 6) { + *batch++ = MI_STORE_DWORD_IMM_GEN4; + *batch++ = 0; + *batch++ = lower_32_bits(hws_address(hws, rq)); + *batch++ = rq->fence.seqno; + *batch++ = MI_ARB_CHECK; + + memset(batch, 0, 1024); + batch += 1024 / sizeof(*batch); + + *batch++ = MI_ARB_CHECK; + *batch++ = MI_BATCH_BUFFER_START | 1 << 8; + *batch++ = lower_32_bits(vma->node.start); + } else if (INTEL_GEN(i915) >= 4) { + *batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; + *batch++ = 0; + *batch++ = lower_32_bits(hws_address(hws, rq)); + *batch++ = rq->fence.seqno; + *batch++ = MI_ARB_CHECK; + + memset(batch, 0, 1024); + batch += 1024 / sizeof(*batch); + + *batch++ = MI_ARB_CHECK; + *batch++ = MI_BATCH_BUFFER_START | 2 << 6; + *batch++ = lower_32_bits(vma->node.start); + } else { + *batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL; + *batch++ = lower_32_bits(hws_address(hws, rq)); + *batch++ = rq->fence.seqno; + *batch++ = MI_ARB_CHECK; + + memset(batch, 0, 1024); + batch += 1024 / sizeof(*batch); + + *batch++ = MI_ARB_CHECK; + *batch++ = MI_BATCH_BUFFER_START | 2 << 6; + *batch++ = lower_32_bits(vma->node.start); + } + *batch++ = MI_BATCH_BUFFER_END; /* not reached */ + i915_gem_chipset_flush(h->i915); + + if (rq->engine->emit_init_breadcrumb) { + err = rq->engine->emit_init_breadcrumb(rq); + if (err) + goto cancel_rq; + } + + flags = 0; + if (INTEL_GEN(vm->i915) <= 5) + flags |= I915_DISPATCH_SECURE; + + err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags); + +cancel_rq: + if (err) { + i915_request_skip(rq, err); + i915_request_add(rq); + } +unpin_hws: + i915_vma_unpin(hws); +unpin_vma: + i915_vma_unpin(vma); + return err ? 
ERR_PTR(err) : rq; +} + +static u32 hws_seqno(const struct hang *h, const struct i915_request *rq) +{ + return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]); +} + +static void hang_fini(struct hang *h) +{ + *h->batch = MI_BATCH_BUFFER_END; + i915_gem_chipset_flush(h->i915); + + i915_gem_object_unpin_map(h->obj); + i915_gem_object_put(h->obj); + + i915_gem_object_unpin_map(h->hws); + i915_gem_object_put(h->hws); + + kernel_context_close(h->ctx); + + igt_flush_test(h->i915, I915_WAIT_LOCKED); +} + +static bool wait_until_running(struct hang *h, struct i915_request *rq) +{ + return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq), + rq->fence.seqno), + 10) && + wait_for(i915_seqno_passed(hws_seqno(h, rq), + rq->fence.seqno), + 1000)); +} + +static int igt_hang_sanitycheck(void *arg) +{ + struct drm_i915_private *i915 = arg; + struct i915_request *rq; + struct intel_engine_cs *engine; + enum intel_engine_id id; + struct hang h; + int err; + + /* Basic check that we can execute our hanging batch */ + + mutex_lock(&i915->drm.struct_mutex); + err = hang_init(&h, i915); + if (err) + goto unlock; + + for_each_engine(engine, i915, id) { + struct igt_wedge_me w; + long timeout; + + if (!intel_engine_can_store_dword(engine)) + continue; + + rq = hang_create_request(&h, engine); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + pr_err("Failed to create request for %s, err=%d\n", + engine->name, err); + goto fini; + } + + i915_request_get(rq); + + *h.batch = MI_BATCH_BUFFER_END; + i915_gem_chipset_flush(i915); + + i915_request_add(rq); + + timeout = 0; + igt_wedge_on_timeout(&w, i915, HZ / 10 /* 100ms timeout*/) + timeout = i915_request_wait(rq, 0, + MAX_SCHEDULE_TIMEOUT); + if (i915_reset_failed(i915)) + timeout = -EIO; + + i915_request_put(rq); + + if (timeout < 0) { + err = timeout; + pr_err("Wait for request failed on %s, err=%d\n", + engine->name, err); + goto fini; + } + } + +fini: + hang_fini(&h); +unlock: + mutex_unlock(&i915->drm.struct_mutex); + return err; +} + +static bool wait_for_idle(struct intel_engine_cs *engine) +{ + return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0; +} + +static int igt_reset_nop(void *arg) +{ + struct drm_i915_private *i915 = arg; + struct intel_engine_cs *engine; + struct i915_gem_context *ctx; + unsigned int reset_count, count; + enum intel_engine_id id; + intel_wakeref_t wakeref; + struct drm_file *file; + IGT_TIMEOUT(end_time); + int err = 0; + + /* Check that we can reset during non-user portions of requests */ + + file = mock_file(i915); + if (IS_ERR(file)) + return PTR_ERR(file); + + mutex_lock(&i915->drm.struct_mutex); + ctx = live_context(i915, file); + mutex_unlock(&i915->drm.struct_mutex); + if (IS_ERR(ctx)) { + err = PTR_ERR(ctx); + goto out; + } + + i915_gem_context_clear_bannable(ctx); + wakeref = intel_runtime_pm_get(&i915->runtime_pm); + reset_count = i915_reset_count(&i915->gpu_error); + count = 0; + do { + mutex_lock(&i915->drm.struct_mutex); + for_each_engine(engine, i915, id) { + int i; + + for (i = 0; i < 16; i++) { + struct i915_request *rq; + + rq = igt_request_alloc(ctx, engine); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + break; + } + + i915_request_add(rq); + } + } + mutex_unlock(&i915->drm.struct_mutex); + + igt_global_reset_lock(i915); + i915_reset(i915, ALL_ENGINES, NULL); + igt_global_reset_unlock(i915); + if (i915_reset_failed(i915)) { + err = -EIO; + break; + } + + if (i915_reset_count(&i915->gpu_error) != + reset_count + ++count) { + pr_err("Full GPU reset not recorded!\n"); + err = -EINVAL; + break; + } 
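+ /* Flush the requests queued above so the next iteration starts from idle */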
+ + err = igt_flush_test(i915, 0); + if (err) + break; + } while (time_before(jiffies, end_time)); + pr_info("%s: %d resets\n", __func__, count); + + mutex_lock(&i915->drm.struct_mutex); + err = igt_flush_test(i915, I915_WAIT_LOCKED); + mutex_unlock(&i915->drm.struct_mutex); + + intel_runtime_pm_put(&i915->runtime_pm, wakeref); + +out: + mock_file_free(i915, file); + if (i915_reset_failed(i915)) + err = -EIO; + return err; +} + +static int igt_reset_nop_engine(void *arg) +{ + struct drm_i915_private *i915 = arg; + struct intel_engine_cs *engine; + struct i915_gem_context *ctx; + enum intel_engine_id id; + intel_wakeref_t wakeref; + struct drm_file *file; + int err = 0; + + /* Check that we can engine-reset during non-user portions */ + + if (!intel_has_reset_engine(i915)) + return 0; + + file = mock_file(i915); + if (IS_ERR(file)) + return PTR_ERR(file); + + mutex_lock(&i915->drm.struct_mutex); + ctx = live_context(i915, file); + mutex_unlock(&i915->drm.struct_mutex); + if (IS_ERR(ctx)) { + err = PTR_ERR(ctx); + goto out; + } + + i915_gem_context_clear_bannable(ctx); + wakeref = intel_runtime_pm_get(&i915->runtime_pm); + for_each_engine(engine, i915, id) { + unsigned int reset_count, reset_engine_count; + unsigned int count; + IGT_TIMEOUT(end_time); + + reset_count = i915_reset_count(&i915->gpu_error); + reset_engine_count = i915_reset_engine_count(&i915->gpu_error, + engine); + count = 0; + + set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags); + do { + int i; + + if (!wait_for_idle(engine)) { + pr_err("%s failed to idle before reset\n", + engine->name); + err = -EIO; + break; + } + + mutex_lock(&i915->drm.struct_mutex); + for (i = 0; i < 16; i++) { + struct i915_request *rq; + + rq = igt_request_alloc(ctx, engine); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + break; + } + + i915_request_add(rq); + } + mutex_unlock(&i915->drm.struct_mutex); + + err = i915_reset_engine(engine, NULL); + if (err) { + pr_err("i915_reset_engine failed\n"); + break; + } + + if (i915_reset_count(&i915->gpu_error) != reset_count) { + pr_err("Full GPU reset recorded! 
(engine reset expected)\n"); + err = -EINVAL; + break; + } + + if (i915_reset_engine_count(&i915->gpu_error, engine) != + reset_engine_count + ++count) { + pr_err("%s engine reset not recorded!\n", + engine->name); + err = -EINVAL; + break; + } + } while (time_before(jiffies, end_time)); + clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags); + pr_info("%s(%s): %d resets\n", __func__, engine->name, count); + + if (err) + break; + + err = igt_flush_test(i915, 0); + if (err) + break; + } + + mutex_lock(&i915->drm.struct_mutex); + err = igt_flush_test(i915, I915_WAIT_LOCKED); + mutex_unlock(&i915->drm.struct_mutex); + + intel_runtime_pm_put(&i915->runtime_pm, wakeref); +out: + mock_file_free(i915, file); + if (i915_reset_failed(i915)) + err = -EIO; + return err; +} + +static int __igt_reset_engine(struct drm_i915_private *i915, bool active) +{ + struct intel_engine_cs *engine; + enum intel_engine_id id; + struct hang h; + int err = 0; + + /* Check that we can issue an engine reset on an idle engine (no-op) */ + + if (!intel_has_reset_engine(i915)) + return 0; + + if (active) { + mutex_lock(&i915->drm.struct_mutex); + err = hang_init(&h, i915); + mutex_unlock(&i915->drm.struct_mutex); + if (err) + return err; + } + + for_each_engine(engine, i915, id) { + unsigned int reset_count, reset_engine_count; + IGT_TIMEOUT(end_time); + + if (active && !intel_engine_can_store_dword(engine)) + continue; + + if (!wait_for_idle(engine)) { + pr_err("%s failed to idle before reset\n", + engine->name); + err = -EIO; + break; + } + + reset_count = i915_reset_count(&i915->gpu_error); + reset_engine_count = i915_reset_engine_count(&i915->gpu_error, + engine); + + intel_engine_pm_get(engine); + set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags); + do { + if (active) { + struct i915_request *rq; + + mutex_lock(&i915->drm.struct_mutex); + rq = hang_create_request(&h, engine); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + mutex_unlock(&i915->drm.struct_mutex); + break; + } + + i915_request_get(rq); + i915_request_add(rq); + mutex_unlock(&i915->drm.struct_mutex); + + if (!wait_until_running(&h, rq)) { + struct drm_printer p = drm_info_printer(i915->drm.dev); + + pr_err("%s: Failed to start request %llx, at %x\n", + __func__, rq->fence.seqno, hws_seqno(&h, rq)); + intel_engine_dump(engine, &p, + "%s\n", engine->name); + + i915_request_put(rq); + err = -EIO; + break; + } + + i915_request_put(rq); + } + + err = i915_reset_engine(engine, NULL); + if (err) { + pr_err("i915_reset_engine failed\n"); + break; + } + + if (i915_reset_count(&i915->gpu_error) != reset_count) { + pr_err("Full GPU reset recorded! 
(engine reset expected)\n"); + err = -EINVAL; + break; + } + + if (i915_reset_engine_count(&i915->gpu_error, engine) != + ++reset_engine_count) { + pr_err("%s engine reset not recorded!\n", + engine->name); + err = -EINVAL; + break; + } + } while (time_before(jiffies, end_time)); + clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags); + intel_engine_pm_put(engine); + + if (err) + break; + + err = igt_flush_test(i915, 0); + if (err) + break; + } + + if (i915_reset_failed(i915)) + err = -EIO; + + if (active) { + mutex_lock(&i915->drm.struct_mutex); + hang_fini(&h); + mutex_unlock(&i915->drm.struct_mutex); + } + + return err; +} + +static int igt_reset_idle_engine(void *arg) +{ + return __igt_reset_engine(arg, false); +} + +static int igt_reset_active_engine(void *arg) +{ + return __igt_reset_engine(arg, true); +} + +struct active_engine { + struct task_struct *task; + struct intel_engine_cs *engine; + unsigned long resets; + unsigned int flags; +}; + +#define TEST_ACTIVE BIT(0) +#define TEST_OTHERS BIT(1) +#define TEST_SELF BIT(2) +#define TEST_PRIORITY BIT(3) + +static int active_request_put(struct i915_request *rq) +{ + int err = 0; + + if (!rq) + return 0; + + if (i915_request_wait(rq, 0, 5 * HZ) < 0) { + GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n", + rq->engine->name, + rq->fence.context, + rq->fence.seqno); + GEM_TRACE_DUMP(); + + i915_gem_set_wedged(rq->i915); + err = -EIO; + } + + i915_request_put(rq); + + return err; +} + +static int active_engine(void *data) +{ + I915_RND_STATE(prng); + struct active_engine *arg = data; + struct intel_engine_cs *engine = arg->engine; + struct i915_request *rq[8] = {}; + struct i915_gem_context *ctx[ARRAY_SIZE(rq)]; + struct drm_file *file; + unsigned long count = 0; + int err = 0; + + file = mock_file(engine->i915); + if (IS_ERR(file)) + return PTR_ERR(file); + + for (count = 0; count < ARRAY_SIZE(ctx); count++) { + mutex_lock(&engine->i915->drm.struct_mutex); + ctx[count] = live_context(engine->i915, file); + mutex_unlock(&engine->i915->drm.struct_mutex); + if (IS_ERR(ctx[count])) { + err = PTR_ERR(ctx[count]); + while (--count) + i915_gem_context_put(ctx[count]); + goto err_file; + } + } + + while (!kthread_should_stop()) { + unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1); + struct i915_request *old = rq[idx]; + struct i915_request *new; + + mutex_lock(&engine->i915->drm.struct_mutex); + new = igt_request_alloc(ctx[idx], engine); + if (IS_ERR(new)) { + mutex_unlock(&engine->i915->drm.struct_mutex); + err = PTR_ERR(new); + break; + } + + if (arg->flags & TEST_PRIORITY) + ctx[idx]->sched.priority = + i915_prandom_u32_max_state(512, &prng); + + rq[idx] = i915_request_get(new); + i915_request_add(new); + mutex_unlock(&engine->i915->drm.struct_mutex); + + err = active_request_put(old); + if (err) + break; + + cond_resched(); + } + + for (count = 0; count < ARRAY_SIZE(rq); count++) { + int err__ = active_request_put(rq[count]); + + /* Keep the first error */ + if (!err) + err = err__; + } + +err_file: + mock_file_free(engine->i915, file); + return err; +} + +static int __igt_reset_engines(struct drm_i915_private *i915, + const char *test_name, + unsigned int flags) +{ + struct intel_engine_cs *engine, *other; + enum intel_engine_id id, tmp; + struct hang h; + int err = 0; + + /* Check that issuing a reset on one engine does not interfere + * with any other engine. 
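+ * Depending on the test flags, background kthreads keep the other engines busy while the target engine is reset in a loop; the per-engine reset counts are checked afterwards so that any reset of an innocent engine is flagged.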
+ */ + + if (!intel_has_reset_engine(i915)) + return 0; + + if (flags & TEST_ACTIVE) { + mutex_lock(&i915->drm.struct_mutex); + err = hang_init(&h, i915); + mutex_unlock(&i915->drm.struct_mutex); + if (err) + return err; + + if (flags & TEST_PRIORITY) + h.ctx->sched.priority = 1024; + } + + for_each_engine(engine, i915, id) { + struct active_engine threads[I915_NUM_ENGINES] = {}; + unsigned long global = i915_reset_count(&i915->gpu_error); + unsigned long count = 0, reported; + IGT_TIMEOUT(end_time); + + if (flags & TEST_ACTIVE && + !intel_engine_can_store_dword(engine)) + continue; + + if (!wait_for_idle(engine)) { + pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n", + engine->name, test_name); + err = -EIO; + break; + } + + memset(threads, 0, sizeof(threads)); + for_each_engine(other, i915, tmp) { + struct task_struct *tsk; + + threads[tmp].resets = + i915_reset_engine_count(&i915->gpu_error, + other); + + if (!(flags & TEST_OTHERS)) + continue; + + if (other == engine && !(flags & TEST_SELF)) + continue; + + threads[tmp].engine = other; + threads[tmp].flags = flags; + + tsk = kthread_run(active_engine, &threads[tmp], + "igt/%s", other->name); + if (IS_ERR(tsk)) { + err = PTR_ERR(tsk); + goto unwind; + } + + threads[tmp].task = tsk; + get_task_struct(tsk); + } + + intel_engine_pm_get(engine); + set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags); + do { + struct i915_request *rq = NULL; + + if (flags & TEST_ACTIVE) { + mutex_lock(&i915->drm.struct_mutex); + rq = hang_create_request(&h, engine); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + mutex_unlock(&i915->drm.struct_mutex); + break; + } + + i915_request_get(rq); + i915_request_add(rq); + mutex_unlock(&i915->drm.struct_mutex); + + if (!wait_until_running(&h, rq)) { + struct drm_printer p = drm_info_printer(i915->drm.dev); + + pr_err("%s: Failed to start request %llx, at %x\n", + __func__, rq->fence.seqno, hws_seqno(&h, rq)); + intel_engine_dump(engine, &p, + "%s\n", engine->name); + + i915_request_put(rq); + err = -EIO; + break; + } + } + + err = i915_reset_engine(engine, NULL); + if (err) { + pr_err("i915_reset_engine(%s:%s): failed, err=%d\n", + engine->name, test_name, err); + break; + } + + count++; + + if (rq) { + if (i915_request_wait(rq, 0, HZ / 5) < 0) { + struct drm_printer p = + drm_info_printer(i915->drm.dev); + + pr_err("i915_reset_engine(%s:%s):" + " failed to complete request after reset\n", + engine->name, test_name); + intel_engine_dump(engine, &p, + "%s\n", engine->name); + i915_request_put(rq); + + GEM_TRACE_DUMP(); + i915_gem_set_wedged(i915); + err = -EIO; + break; + } + + i915_request_put(rq); + } + + if (!(flags & TEST_SELF) && !wait_for_idle(engine)) { + struct drm_printer p = + drm_info_printer(i915->drm.dev); + + pr_err("i915_reset_engine(%s:%s):" + " failed to idle after reset\n", + engine->name, test_name); + intel_engine_dump(engine, &p, + "%s\n", engine->name); + + err = -EIO; + break; + } + } while (time_before(jiffies, end_time)); + clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags); + intel_engine_pm_put(engine); + pr_info("i915_reset_engine(%s:%s): %lu resets\n", + engine->name, test_name, count); + + reported = i915_reset_engine_count(&i915->gpu_error, engine); + reported -= threads[engine->id].resets; + if (reported != count) { + pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n", + engine->name, test_name, count, reported); + if (!err) + err = -EINVAL; + } + +unwind: + for_each_engine(other, i915, tmp) { + int ret; + + if (!threads[tmp].task) + continue; + 
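+ /* Reap the background submitter; kthread_stop() returns the first error the thread saw */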
+ ret = kthread_stop(threads[tmp].task); + if (ret) { + pr_err("kthread for other engine %s failed, err=%d\n", + other->name, ret); + if (!err) + err = ret; + } + put_task_struct(threads[tmp].task); + + if (other != engine && + threads[tmp].resets != + i915_reset_engine_count(&i915->gpu_error, other)) { + pr_err("Innocent engine %s was reset (count=%ld)\n", + other->name, + i915_reset_engine_count(&i915->gpu_error, + other) - + threads[tmp].resets); + if (!err) + err = -EINVAL; + } + } + + if (global != i915_reset_count(&i915->gpu_error)) { + pr_err("Global reset (count=%ld)!\n", + i915_reset_count(&i915->gpu_error) - global); + if (!err) + err = -EINVAL; + } + + if (err) + break; + + mutex_lock(&i915->drm.struct_mutex); + err = igt_flush_test(i915, I915_WAIT_LOCKED); + mutex_unlock(&i915->drm.struct_mutex); + if (err) + break; + } + + if (i915_reset_failed(i915)) + err = -EIO; + + if (flags & TEST_ACTIVE) { + mutex_lock(&i915->drm.struct_mutex); + hang_fini(&h); + mutex_unlock(&i915->drm.struct_mutex); + } + + return err; +} + +static int igt_reset_engines(void *arg) +{ + static const struct { + const char *name; + unsigned int flags; + } phases[] = { + { "idle", 0 }, + { "active", TEST_ACTIVE }, + { "others-idle", TEST_OTHERS }, + { "others-active", TEST_OTHERS | TEST_ACTIVE }, + { + "others-priority", + TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY + }, + { + "self-priority", + TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY | TEST_SELF, + }, + { } + }; + struct drm_i915_private *i915 = arg; + typeof(*phases) *p; + int err; + + for (p = phases; p->name; p++) { + if (p->flags & TEST_PRIORITY) { + if (!(i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY)) + continue; + } + + err = __igt_reset_engines(arg, p->name, p->flags); + if (err) + return err; + } + + return 0; +} + +static u32 fake_hangcheck(struct drm_i915_private *i915, + intel_engine_mask_t mask) +{ + u32 count = i915_reset_count(&i915->gpu_error); + + i915_reset(i915, mask, NULL); + + return count; +} + +static int igt_reset_wait(void *arg) +{ + struct drm_i915_private *i915 = arg; + struct i915_request *rq; + unsigned int reset_count; + struct hang h; + long timeout; + int err; + + if (!intel_engine_can_store_dword(i915->engine[RCS0])) + return 0; + + /* Check that we detect a stuck waiter and issue a reset */ + + igt_global_reset_lock(i915); + + mutex_lock(&i915->drm.struct_mutex); + err = hang_init(&h, i915); + if (err) + goto unlock; + + rq = hang_create_request(&h, i915->engine[RCS0]); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto fini; + } + + i915_request_get(rq); + i915_request_add(rq); + + if (!wait_until_running(&h, rq)) { + struct drm_printer p = drm_info_printer(i915->drm.dev); + + pr_err("%s: Failed to start request %llx, at %x\n", + __func__, rq->fence.seqno, hws_seqno(&h, rq)); + intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name); + + i915_gem_set_wedged(i915); + + err = -EIO; + goto out_rq; + } + + reset_count = fake_hangcheck(i915, ALL_ENGINES); + + timeout = i915_request_wait(rq, 0, 10); + if (timeout < 0) { + pr_err("i915_request_wait failed on a stuck request: err=%ld\n", + timeout); + err = timeout; + goto out_rq; + } + + if (i915_reset_count(&i915->gpu_error) == reset_count) { + pr_err("No GPU reset recorded!\n"); + err = -EINVAL; + goto out_rq; + } + +out_rq: + i915_request_put(rq); +fini: + hang_fini(&h); +unlock: + mutex_unlock(&i915->drm.struct_mutex); + igt_global_reset_unlock(i915); + + if (i915_reset_failed(i915)) + return -EIO; + + return err; +} + +struct evict_vma { + struct completion 
completion; + struct i915_vma *vma; +}; + +static int evict_vma(void *data) +{ + struct evict_vma *arg = data; + struct i915_address_space *vm = arg->vma->vm; + struct drm_i915_private *i915 = vm->i915; + struct drm_mm_node evict = arg->vma->node; + int err; + + complete(&arg->completion); + + mutex_lock(&i915->drm.struct_mutex); + err = i915_gem_evict_for_node(vm, &evict, 0); + mutex_unlock(&i915->drm.struct_mutex); + + return err; +} + +static int evict_fence(void *data) +{ + struct evict_vma *arg = data; + struct drm_i915_private *i915 = arg->vma->vm->i915; + int err; + + complete(&arg->completion); + + mutex_lock(&i915->drm.struct_mutex); + + /* Mark the fence register as dirty to force the mmio update. */ + err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512); + if (err) { + pr_err("Invalid Y-tiling settings; err:%d\n", err); + goto out_unlock; + } + + err = i915_vma_pin_fence(arg->vma); + if (err) { + pr_err("Unable to pin Y-tiled fence; err:%d\n", err); + goto out_unlock; + } + + i915_vma_unpin_fence(arg->vma); + +out_unlock: + mutex_unlock(&i915->drm.struct_mutex); + + return err; +} + +static int __igt_reset_evict_vma(struct drm_i915_private *i915, + struct i915_address_space *vm, + int (*fn)(void *), + unsigned int flags) +{ + struct drm_i915_gem_object *obj; + struct task_struct *tsk = NULL; + struct i915_request *rq; + struct evict_vma arg; + struct hang h; + int err; + + if (!intel_engine_can_store_dword(i915->engine[RCS0])) + return 0; + + /* Check that we can recover an unbind stuck on a hanging request */ + + mutex_lock(&i915->drm.struct_mutex); + err = hang_init(&h, i915); + if (err) + goto unlock; + + obj = i915_gem_object_create_internal(i915, SZ_1M); + if (IS_ERR(obj)) { + err = PTR_ERR(obj); + goto fini; + } + + if (flags & EXEC_OBJECT_NEEDS_FENCE) { + err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512); + if (err) { + pr_err("Invalid X-tiling settings; err:%d\n", err); + goto out_obj; + } + } + + arg.vma = i915_vma_instance(obj, vm, NULL); + if (IS_ERR(arg.vma)) { + err = PTR_ERR(arg.vma); + goto out_obj; + } + + rq = hang_create_request(&h, i915->engine[RCS0]); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto out_obj; + } + + err = i915_vma_pin(arg.vma, 0, 0, + i915_vma_is_ggtt(arg.vma) ? 
+ PIN_GLOBAL | PIN_MAPPABLE : + PIN_USER); + if (err) { + i915_request_add(rq); + goto out_obj; + } + + if (flags & EXEC_OBJECT_NEEDS_FENCE) { + err = i915_vma_pin_fence(arg.vma); + if (err) { + pr_err("Unable to pin X-tiled fence; err:%d\n", err); + i915_vma_unpin(arg.vma); + i915_request_add(rq); + goto out_obj; + } + } + + i915_vma_lock(arg.vma); + err = i915_vma_move_to_active(arg.vma, rq, flags); + i915_vma_unlock(arg.vma); + + if (flags & EXEC_OBJECT_NEEDS_FENCE) + i915_vma_unpin_fence(arg.vma); + i915_vma_unpin(arg.vma); + + i915_request_get(rq); + i915_request_add(rq); + if (err) + goto out_rq; + + mutex_unlock(&i915->drm.struct_mutex); + + if (!wait_until_running(&h, rq)) { + struct drm_printer p = drm_info_printer(i915->drm.dev); + + pr_err("%s: Failed to start request %llx, at %x\n", + __func__, rq->fence.seqno, hws_seqno(&h, rq)); + intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name); + + i915_gem_set_wedged(i915); + goto out_reset; + } + + init_completion(&arg.completion); + + tsk = kthread_run(fn, &arg, "igt/evict_vma"); + if (IS_ERR(tsk)) { + err = PTR_ERR(tsk); + tsk = NULL; + goto out_reset; + } + get_task_struct(tsk); + + wait_for_completion(&arg.completion); + + if (wait_for(!list_empty(&rq->fence.cb_list), 10)) { + struct drm_printer p = drm_info_printer(i915->drm.dev); + + pr_err("igt/evict_vma kthread did not wait\n"); + intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name); + + i915_gem_set_wedged(i915); + goto out_reset; + } + +out_reset: + igt_global_reset_lock(i915); + fake_hangcheck(rq->i915, rq->engine->mask); + igt_global_reset_unlock(i915); + + if (tsk) { + struct igt_wedge_me w; + + /* The reset, even indirectly, should take less than 10ms. */ + igt_wedge_on_timeout(&w, i915, HZ / 10 /* 100ms timeout*/) + err = kthread_stop(tsk); + + put_task_struct(tsk); + } + + mutex_lock(&i915->drm.struct_mutex); +out_rq: + i915_request_put(rq); +out_obj: + i915_gem_object_put(obj); +fini: + hang_fini(&h); +unlock: + mutex_unlock(&i915->drm.struct_mutex); + + if (i915_reset_failed(i915)) + return -EIO; + + return err; +} + +static int igt_reset_evict_ggtt(void *arg) +{ + struct drm_i915_private *i915 = arg; + + return __igt_reset_evict_vma(i915, &i915->ggtt.vm, + evict_vma, EXEC_OBJECT_WRITE); +} + +static int igt_reset_evict_ppgtt(void *arg) +{ + struct drm_i915_private *i915 = arg; + struct i915_gem_context *ctx; + struct drm_file *file; + int err; + + file = mock_file(i915); + if (IS_ERR(file)) + return PTR_ERR(file); + + mutex_lock(&i915->drm.struct_mutex); + ctx = live_context(i915, file); + mutex_unlock(&i915->drm.struct_mutex); + if (IS_ERR(ctx)) { + err = PTR_ERR(ctx); + goto out; + } + + err = 0; + if (ctx->vm) /* aliasing == global gtt locking, covered above */ + err = __igt_reset_evict_vma(i915, ctx->vm, + evict_vma, EXEC_OBJECT_WRITE); + +out: + mock_file_free(i915, file); + return err; +} + +static int igt_reset_evict_fence(void *arg) +{ + struct drm_i915_private *i915 = arg; + + return __igt_reset_evict_vma(i915, &i915->ggtt.vm, + evict_fence, EXEC_OBJECT_NEEDS_FENCE); +} + +static int wait_for_others(struct drm_i915_private *i915, + struct intel_engine_cs *exclude) +{ + struct intel_engine_cs *engine; + enum intel_engine_id id; + + for_each_engine(engine, i915, id) { + if (engine == exclude) + continue; + + if (!wait_for_idle(engine)) + return -EIO; + } + + return 0; +} + +static int igt_reset_queue(void *arg) +{ + struct drm_i915_private *i915 = arg; + struct intel_engine_cs *engine; + enum intel_engine_id id; + struct hang h; + int err; + + 
/* Check that we replay pending requests following a hang */ + + igt_global_reset_lock(i915); + + mutex_lock(&i915->drm.struct_mutex); + err = hang_init(&h, i915); + if (err) + goto unlock; + + for_each_engine(engine, i915, id) { + struct i915_request *prev; + IGT_TIMEOUT(end_time); + unsigned int count; + + if (!intel_engine_can_store_dword(engine)) + continue; + + prev = hang_create_request(&h, engine); + if (IS_ERR(prev)) { + err = PTR_ERR(prev); + goto fini; + } + + i915_request_get(prev); + i915_request_add(prev); + + count = 0; + do { + struct i915_request *rq; + unsigned int reset_count; + + rq = hang_create_request(&h, engine); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto fini; + } + + i915_request_get(rq); + i915_request_add(rq); + + /* + * XXX We don't handle resetting the kernel context + * very well. If we trigger a device reset twice in + * quick succession while the kernel context is + * executing, we may end up skipping the breadcrumb. + * This is really only a problem for the selftest as + * normally there is a large interlude between resets + * (hangcheck), or we focus on resetting just one + * engine and so avoid repeatedly resetting innocents. + */ + err = wait_for_others(i915, engine); + if (err) { + pr_err("%s(%s): Failed to idle other inactive engines after device reset\n", + __func__, engine->name); + i915_request_put(rq); + i915_request_put(prev); + + GEM_TRACE_DUMP(); + i915_gem_set_wedged(i915); + goto fini; + } + + if (!wait_until_running(&h, prev)) { + struct drm_printer p = drm_info_printer(i915->drm.dev); + + pr_err("%s(%s): Failed to start request %llx, at %x\n", + __func__, engine->name, + prev->fence.seqno, hws_seqno(&h, prev)); + intel_engine_dump(engine, &p, + "%s\n", engine->name); + + i915_request_put(rq); + i915_request_put(prev); + + i915_gem_set_wedged(i915); + + err = -EIO; + goto fini; + } + + reset_count = fake_hangcheck(i915, BIT(id)); + + if (prev->fence.error != -EIO) { + pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n", + prev->fence.error); + i915_request_put(rq); + i915_request_put(prev); + err = -EINVAL; + goto fini; + } + + if (rq->fence.error) { + pr_err("Fence error status not zero [%d] after unrelated reset\n", + rq->fence.error); + i915_request_put(rq); + i915_request_put(prev); + err = -EINVAL; + goto fini; + } + + if (i915_reset_count(&i915->gpu_error) == reset_count) { + pr_err("No GPU reset recorded!\n"); + i915_request_put(rq); + i915_request_put(prev); + err = -EINVAL; + goto fini; + } + + i915_request_put(prev); + prev = rq; + count++; + } while (time_before(jiffies, end_time)); + pr_info("%s: Completed %d resets\n", engine->name, count); + + *h.batch = MI_BATCH_BUFFER_END; + i915_gem_chipset_flush(i915); + + i915_request_put(prev); + + err = igt_flush_test(i915, I915_WAIT_LOCKED); + if (err) + break; + } + +fini: + hang_fini(&h); +unlock: + mutex_unlock(&i915->drm.struct_mutex); + igt_global_reset_unlock(i915); + + if (i915_reset_failed(i915)) + return -EIO; + + return err; +} + +static int igt_handle_error(void *arg) +{ + struct drm_i915_private *i915 = arg; + struct intel_engine_cs *engine = i915->engine[RCS0]; + struct hang h; + struct i915_request *rq; + struct i915_gpu_state *error; + int err; + + /* Check that we can issue a global GPU and engine reset */ + + if (!intel_has_reset_engine(i915)) + return 0; + + if (!engine || !intel_engine_can_store_dword(engine)) + return 0; + + mutex_lock(&i915->drm.struct_mutex); + + err = hang_init(&h, i915); + if (err) + goto err_unlock; + + rq = 
hang_create_request(&h, engine); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto err_fini; + } + + i915_request_get(rq); + i915_request_add(rq); + + if (!wait_until_running(&h, rq)) { + struct drm_printer p = drm_info_printer(i915->drm.dev); + + pr_err("%s: Failed to start request %llx, at %x\n", + __func__, rq->fence.seqno, hws_seqno(&h, rq)); + intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name); + + i915_gem_set_wedged(i915); + + err = -EIO; + goto err_request; + } + + mutex_unlock(&i915->drm.struct_mutex); + + /* Temporarily disable error capture */ + error = xchg(&i915->gpu_error.first_error, (void *)-1); + + i915_handle_error(i915, engine->mask, 0, NULL); + + xchg(&i915->gpu_error.first_error, error); + + mutex_lock(&i915->drm.struct_mutex); + + if (rq->fence.error != -EIO) { + pr_err("Guilty request not identified!\n"); + err = -EINVAL; + goto err_request; + } + +err_request: + i915_request_put(rq); +err_fini: + hang_fini(&h); +err_unlock: + mutex_unlock(&i915->drm.struct_mutex); + return err; +} + +static int __igt_atomic_reset_engine(struct intel_engine_cs *engine, + const struct igt_atomic_section *p, + const char *mode) +{ + struct tasklet_struct * const t = &engine->execlists.tasklet; + int err; + + GEM_TRACE("i915_reset_engine(%s:%s) under %s\n", + engine->name, mode, p->name); + + tasklet_disable_nosync(t); + p->critical_section_begin(); + + err = i915_reset_engine(engine, NULL); + + p->critical_section_end(); + tasklet_enable(t); + + if (err) + pr_err("i915_reset_engine(%s:%s) failed under %s\n", + engine->name, mode, p->name); + + return err; +} + +static int igt_atomic_reset_engine(struct intel_engine_cs *engine, + const struct igt_atomic_section *p) +{ + struct drm_i915_private *i915 = engine->i915; + struct i915_request *rq; + struct hang h; + int err; + + err = __igt_atomic_reset_engine(engine, p, "idle"); + if (err) + return err; + + err = hang_init(&h, i915); + if (err) + return err; + + rq = hang_create_request(&h, engine); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto out; + } + + i915_request_get(rq); + i915_request_add(rq); + + if (wait_until_running(&h, rq)) { + err = __igt_atomic_reset_engine(engine, p, "active"); + } else { + pr_err("%s(%s): Failed to start request %llx, at %x\n", + __func__, engine->name, + rq->fence.seqno, hws_seqno(&h, rq)); + i915_gem_set_wedged(i915); + err = -EIO; + } + + if (err == 0) { + struct igt_wedge_me w; + + igt_wedge_on_timeout(&w, i915, HZ / 20 /* 50ms timeout*/) + i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT); + if (i915_reset_failed(i915)) + err = -EIO; + } + + i915_request_put(rq); +out: + hang_fini(&h); + return err; +} + +static int igt_reset_engines_atomic(void *arg) +{ + struct drm_i915_private *i915 = arg; + const typeof(*igt_atomic_phases) *p; + int err = 0; + + /* Check that the engines resets are usable from atomic context */ + + if (!intel_has_reset_engine(i915)) + return 0; + + if (USES_GUC_SUBMISSION(i915)) + return 0; + + igt_global_reset_lock(i915); + mutex_lock(&i915->drm.struct_mutex); + + /* Flush any requests before we get started and check basics */ + if (!igt_force_reset(i915)) + goto unlock; + + for (p = igt_atomic_phases; p->name; p++) { + struct intel_engine_cs *engine; + enum intel_engine_id id; + + for_each_engine(engine, i915, id) { + err = igt_atomic_reset_engine(engine, p); + if (err) + goto out; + } + } + +out: + /* As we poke around the guts, do a full reset before continuing. 
*/ + igt_force_reset(i915); + +unlock: + mutex_unlock(&i915->drm.struct_mutex); + igt_global_reset_unlock(i915); + + return err; +} + +int intel_hangcheck_live_selftests(struct drm_i915_private *i915) +{ + static const struct i915_subtest tests[] = { + SUBTEST(igt_hang_sanitycheck), + SUBTEST(igt_reset_nop), + SUBTEST(igt_reset_nop_engine), + SUBTEST(igt_reset_idle_engine), + SUBTEST(igt_reset_active_engine), + SUBTEST(igt_reset_engines), + SUBTEST(igt_reset_engines_atomic), + SUBTEST(igt_reset_queue), + SUBTEST(igt_reset_wait), + SUBTEST(igt_reset_evict_ggtt), + SUBTEST(igt_reset_evict_ppgtt), + SUBTEST(igt_reset_evict_fence), + SUBTEST(igt_handle_error), + }; + intel_wakeref_t wakeref; + bool saved_hangcheck; + int err; + + if (!intel_has_gpu_reset(i915)) + return 0; + + if (i915_terminally_wedged(i915)) + return -EIO; /* we're long past hope of a successful reset */ + + wakeref = intel_runtime_pm_get(&i915->runtime_pm); + saved_hangcheck = fetch_and_zero(&i915_modparams.enable_hangcheck); + drain_delayed_work(&i915->gpu_error.hangcheck_work); /* flush param */ + + err = i915_subtests(tests, i915); + + mutex_lock(&i915->drm.struct_mutex); + igt_flush_test(i915, I915_WAIT_LOCKED); + mutex_unlock(&i915->drm.struct_mutex); + + i915_modparams.enable_hangcheck = saved_hangcheck; + intel_runtime_pm_put(&i915->runtime_pm, wakeref); + + return err; +} diff --git a/drivers/gpu/drm/i915/gt/selftest_lrc.c b/drivers/gpu/drm/i915/gt/selftest_lrc.c new file mode 100644 index 000000000000..401e8b539297 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/selftest_lrc.c @@ -0,0 +1,1835 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2018 Intel Corporation + */ + +#include <linux/prime_numbers.h> + +#include "gem/i915_gem_pm.h" +#include "gt/intel_reset.h" + +#include "i915_selftest.h" +#include "selftests/i915_random.h" +#include "selftests/igt_flush_test.h" +#include "selftests/igt_live_test.h" +#include "selftests/igt_spinner.h" +#include "selftests/lib_sw_fence.h" + +#include "gem/selftests/igt_gem_utils.h" +#include "gem/selftests/mock_context.h" + +static int live_sanitycheck(void *arg) +{ + struct drm_i915_private *i915 = arg; + struct intel_engine_cs *engine; + struct i915_gem_context *ctx; + enum intel_engine_id id; + struct igt_spinner spin; + intel_wakeref_t wakeref; + int err = -ENOMEM; + + if (!HAS_LOGICAL_RING_CONTEXTS(i915)) + return 0; + + mutex_lock(&i915->drm.struct_mutex); + wakeref = intel_runtime_pm_get(&i915->runtime_pm); + + if (igt_spinner_init(&spin, i915)) + goto err_unlock; + + ctx = kernel_context(i915); + if (!ctx) + goto err_spin; + + for_each_engine(engine, i915, id) { + struct i915_request *rq; + + rq = igt_spinner_create_request(&spin, ctx, engine, MI_NOOP); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto err_ctx; + } + + i915_request_add(rq); + if (!igt_wait_for_spinner(&spin, rq)) { + GEM_TRACE("spinner failed to start\n"); + GEM_TRACE_DUMP(); + i915_gem_set_wedged(i915); + err = -EIO; + goto err_ctx; + } + + igt_spinner_end(&spin); + if (igt_flush_test(i915, I915_WAIT_LOCKED)) { + err = -EIO; + goto err_ctx; + } + } + + err = 0; +err_ctx: + kernel_context_close(ctx); +err_spin: + igt_spinner_fini(&spin); +err_unlock: + igt_flush_test(i915, I915_WAIT_LOCKED); + intel_runtime_pm_put(&i915->runtime_pm, wakeref); + mutex_unlock(&i915->drm.struct_mutex); + return err; +} + +static int live_busywait_preempt(void *arg) +{ + struct drm_i915_private *i915 = arg; + struct i915_gem_context *ctx_hi, *ctx_lo; + struct intel_engine_cs *engine; + struct drm_i915_gem_object *obj; + 
struct i915_vma *vma;
+	enum intel_engine_id id;
+	intel_wakeref_t wakeref;
+	int err = -ENOMEM;
+	u32 *map;
+
+	/*
+	 * Verify that even without HAS_LOGICAL_RING_PREEMPTION, we can
+	 * preempt the busywaits used to synchronise between rings.
+	 */
+
+	mutex_lock(&i915->drm.struct_mutex);
+	wakeref = intel_runtime_pm_get(&i915->runtime_pm);
+
+	ctx_hi = kernel_context(i915);
+	if (!ctx_hi)
+		goto err_unlock;
+	ctx_hi->sched.priority =
+		I915_USER_PRIORITY(I915_CONTEXT_MAX_USER_PRIORITY);
+
+	ctx_lo = kernel_context(i915);
+	if (!ctx_lo)
+		goto err_ctx_hi;
+	ctx_lo->sched.priority =
+		I915_USER_PRIORITY(I915_CONTEXT_MIN_USER_PRIORITY);
+
+	obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
+	if (IS_ERR(obj)) {
+		err = PTR_ERR(obj);
+		goto err_ctx_lo;
+	}
+
+	map = i915_gem_object_pin_map(obj, I915_MAP_WC);
+	if (IS_ERR(map)) {
+		err = PTR_ERR(map);
+		goto err_obj;
+	}
+
+	vma = i915_vma_instance(obj, &i915->ggtt.vm, NULL);
+	if (IS_ERR(vma)) {
+		err = PTR_ERR(vma);
+		goto err_map;
+	}
+
+	err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL);
+	if (err)
+		goto err_map;
+
+	for_each_engine(engine, i915, id) {
+		struct i915_request *lo, *hi;
+		struct igt_live_test t;
+		u32 *cs;
+
+		if (!intel_engine_can_store_dword(engine))
+			continue;
+
+		if (igt_live_test_begin(&t, i915, __func__, engine->name)) {
+			err = -EIO;
+			goto err_vma;
+		}
+
+		/*
+		 * We create two requests. The low priority request
+		 * busywaits on a semaphore (inside the ringbuffer where
+		 * it should be preemptible) and the high priority request
+		 * uses a MI_STORE_DWORD_IMM to update the semaphore value
+		 * allowing the first request to complete. If preemption
+		 * fails, we hang instead.
+		 */
+
+		lo = igt_request_alloc(ctx_lo, engine);
+		if (IS_ERR(lo)) {
+			err = PTR_ERR(lo);
+			goto err_vma;
+		}
+
+		cs = intel_ring_begin(lo, 8);
+		if (IS_ERR(cs)) {
+			err = PTR_ERR(cs);
+			i915_request_add(lo);
+			goto err_vma;
+		}
+
+		*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
+		*cs++ = i915_ggtt_offset(vma);
+		*cs++ = 0;
+		*cs++ = 1;
+
+		/* XXX Do we need a flush + invalidate here?
*/ + + *cs++ = MI_SEMAPHORE_WAIT | + MI_SEMAPHORE_GLOBAL_GTT | + MI_SEMAPHORE_POLL | + MI_SEMAPHORE_SAD_EQ_SDD; + *cs++ = 0; + *cs++ = i915_ggtt_offset(vma); + *cs++ = 0; + + intel_ring_advance(lo, cs); + i915_request_add(lo); + + if (wait_for(READ_ONCE(*map), 10)) { + err = -ETIMEDOUT; + goto err_vma; + } + + /* Low priority request should be busywaiting now */ + if (i915_request_wait(lo, 0, 1) != -ETIME) { + pr_err("%s: Busywaiting request did not!\n", + engine->name); + err = -EIO; + goto err_vma; + } + + hi = igt_request_alloc(ctx_hi, engine); + if (IS_ERR(hi)) { + err = PTR_ERR(hi); + goto err_vma; + } + + cs = intel_ring_begin(hi, 4); + if (IS_ERR(cs)) { + err = PTR_ERR(cs); + i915_request_add(hi); + goto err_vma; + } + + *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT; + *cs++ = i915_ggtt_offset(vma); + *cs++ = 0; + *cs++ = 0; + + intel_ring_advance(hi, cs); + i915_request_add(hi); + + if (i915_request_wait(lo, 0, HZ / 5) < 0) { + struct drm_printer p = drm_info_printer(i915->drm.dev); + + pr_err("%s: Failed to preempt semaphore busywait!\n", + engine->name); + + intel_engine_dump(engine, &p, "%s\n", engine->name); + GEM_TRACE_DUMP(); + + i915_gem_set_wedged(i915); + err = -EIO; + goto err_vma; + } + GEM_BUG_ON(READ_ONCE(*map)); + + if (igt_live_test_end(&t)) { + err = -EIO; + goto err_vma; + } + } + + err = 0; +err_vma: + i915_vma_unpin(vma); +err_map: + i915_gem_object_unpin_map(obj); +err_obj: + i915_gem_object_put(obj); +err_ctx_lo: + kernel_context_close(ctx_lo); +err_ctx_hi: + kernel_context_close(ctx_hi); +err_unlock: + if (igt_flush_test(i915, I915_WAIT_LOCKED)) + err = -EIO; + intel_runtime_pm_put(&i915->runtime_pm, wakeref); + mutex_unlock(&i915->drm.struct_mutex); + return err; +} + +static int live_preempt(void *arg) +{ + struct drm_i915_private *i915 = arg; + struct i915_gem_context *ctx_hi, *ctx_lo; + struct igt_spinner spin_hi, spin_lo; + struct intel_engine_cs *engine; + enum intel_engine_id id; + intel_wakeref_t wakeref; + int err = -ENOMEM; + + if (!HAS_LOGICAL_RING_PREEMPTION(i915)) + return 0; + + if (!(i915->caps.scheduler & I915_SCHEDULER_CAP_PREEMPTION)) + pr_err("Logical preemption supported, but not exposed\n"); + + mutex_lock(&i915->drm.struct_mutex); + wakeref = intel_runtime_pm_get(&i915->runtime_pm); + + if (igt_spinner_init(&spin_hi, i915)) + goto err_unlock; + + if (igt_spinner_init(&spin_lo, i915)) + goto err_spin_hi; + + ctx_hi = kernel_context(i915); + if (!ctx_hi) + goto err_spin_lo; + ctx_hi->sched.priority = + I915_USER_PRIORITY(I915_CONTEXT_MAX_USER_PRIORITY); + + ctx_lo = kernel_context(i915); + if (!ctx_lo) + goto err_ctx_hi; + ctx_lo->sched.priority = + I915_USER_PRIORITY(I915_CONTEXT_MIN_USER_PRIORITY); + + for_each_engine(engine, i915, id) { + struct igt_live_test t; + struct i915_request *rq; + + if (!intel_engine_has_preemption(engine)) + continue; + + if (igt_live_test_begin(&t, i915, __func__, engine->name)) { + err = -EIO; + goto err_ctx_lo; + } + + rq = igt_spinner_create_request(&spin_lo, ctx_lo, engine, + MI_ARB_CHECK); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto err_ctx_lo; + } + + i915_request_add(rq); + if (!igt_wait_for_spinner(&spin_lo, rq)) { + GEM_TRACE("lo spinner failed to start\n"); + GEM_TRACE_DUMP(); + i915_gem_set_wedged(i915); + err = -EIO; + goto err_ctx_lo; + } + + rq = igt_spinner_create_request(&spin_hi, ctx_hi, engine, + MI_ARB_CHECK); + if (IS_ERR(rq)) { + igt_spinner_end(&spin_lo); + err = PTR_ERR(rq); + goto err_ctx_lo; + } + + i915_request_add(rq); + if (!igt_wait_for_spinner(&spin_hi, rq)) { + GEM_TRACE("hi 
spinner failed to start\n"); + GEM_TRACE_DUMP(); + i915_gem_set_wedged(i915); + err = -EIO; + goto err_ctx_lo; + } + + igt_spinner_end(&spin_hi); + igt_spinner_end(&spin_lo); + + if (igt_live_test_end(&t)) { + err = -EIO; + goto err_ctx_lo; + } + } + + err = 0; +err_ctx_lo: + kernel_context_close(ctx_lo); +err_ctx_hi: + kernel_context_close(ctx_hi); +err_spin_lo: + igt_spinner_fini(&spin_lo); +err_spin_hi: + igt_spinner_fini(&spin_hi); +err_unlock: + igt_flush_test(i915, I915_WAIT_LOCKED); + intel_runtime_pm_put(&i915->runtime_pm, wakeref); + mutex_unlock(&i915->drm.struct_mutex); + return err; +} + +static int live_late_preempt(void *arg) +{ + struct drm_i915_private *i915 = arg; + struct i915_gem_context *ctx_hi, *ctx_lo; + struct igt_spinner spin_hi, spin_lo; + struct intel_engine_cs *engine; + struct i915_sched_attr attr = {}; + enum intel_engine_id id; + intel_wakeref_t wakeref; + int err = -ENOMEM; + + if (!HAS_LOGICAL_RING_PREEMPTION(i915)) + return 0; + + mutex_lock(&i915->drm.struct_mutex); + wakeref = intel_runtime_pm_get(&i915->runtime_pm); + + if (igt_spinner_init(&spin_hi, i915)) + goto err_unlock; + + if (igt_spinner_init(&spin_lo, i915)) + goto err_spin_hi; + + ctx_hi = kernel_context(i915); + if (!ctx_hi) + goto err_spin_lo; + + ctx_lo = kernel_context(i915); + if (!ctx_lo) + goto err_ctx_hi; + + for_each_engine(engine, i915, id) { + struct igt_live_test t; + struct i915_request *rq; + + if (!intel_engine_has_preemption(engine)) + continue; + + if (igt_live_test_begin(&t, i915, __func__, engine->name)) { + err = -EIO; + goto err_ctx_lo; + } + + rq = igt_spinner_create_request(&spin_lo, ctx_lo, engine, + MI_ARB_CHECK); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto err_ctx_lo; + } + + i915_request_add(rq); + if (!igt_wait_for_spinner(&spin_lo, rq)) { + pr_err("First context failed to start\n"); + goto err_wedged; + } + + rq = igt_spinner_create_request(&spin_hi, ctx_hi, engine, + MI_NOOP); + if (IS_ERR(rq)) { + igt_spinner_end(&spin_lo); + err = PTR_ERR(rq); + goto err_ctx_lo; + } + + i915_request_add(rq); + if (igt_wait_for_spinner(&spin_hi, rq)) { + pr_err("Second context overtook first?\n"); + goto err_wedged; + } + + attr.priority = I915_USER_PRIORITY(I915_PRIORITY_MAX); + engine->schedule(rq, &attr); + + if (!igt_wait_for_spinner(&spin_hi, rq)) { + pr_err("High priority context failed to preempt the low priority context\n"); + GEM_TRACE_DUMP(); + goto err_wedged; + } + + igt_spinner_end(&spin_hi); + igt_spinner_end(&spin_lo); + + if (igt_live_test_end(&t)) { + err = -EIO; + goto err_ctx_lo; + } + } + + err = 0; +err_ctx_lo: + kernel_context_close(ctx_lo); +err_ctx_hi: + kernel_context_close(ctx_hi); +err_spin_lo: + igt_spinner_fini(&spin_lo); +err_spin_hi: + igt_spinner_fini(&spin_hi); +err_unlock: + igt_flush_test(i915, I915_WAIT_LOCKED); + intel_runtime_pm_put(&i915->runtime_pm, wakeref); + mutex_unlock(&i915->drm.struct_mutex); + return err; + +err_wedged: + igt_spinner_end(&spin_hi); + igt_spinner_end(&spin_lo); + i915_gem_set_wedged(i915); + err = -EIO; + goto err_ctx_lo; +} + +struct preempt_client { + struct igt_spinner spin; + struct i915_gem_context *ctx; +}; + +static int preempt_client_init(struct drm_i915_private *i915, + struct preempt_client *c) +{ + c->ctx = kernel_context(i915); + if (!c->ctx) + return -ENOMEM; + + if (igt_spinner_init(&c->spin, i915)) + goto err_ctx; + + return 0; + +err_ctx: + kernel_context_close(c->ctx); + return -ENOMEM; +} + +static void preempt_client_fini(struct preempt_client *c) +{ + igt_spinner_fini(&c->spin); + 
kernel_context_close(c->ctx); +} + +static int live_suppress_self_preempt(void *arg) +{ + struct drm_i915_private *i915 = arg; + struct intel_engine_cs *engine; + struct i915_sched_attr attr = { + .priority = I915_USER_PRIORITY(I915_PRIORITY_MAX) + }; + struct preempt_client a, b; + enum intel_engine_id id; + intel_wakeref_t wakeref; + int err = -ENOMEM; + + /* + * Verify that if a preemption request does not cause a change in + * the current execution order, the preempt-to-idle injection is + * skipped and that we do not accidentally apply it after the CS + * completion event. + */ + + if (!HAS_LOGICAL_RING_PREEMPTION(i915)) + return 0; + + if (USES_GUC_SUBMISSION(i915)) + return 0; /* presume black blox */ + + mutex_lock(&i915->drm.struct_mutex); + wakeref = intel_runtime_pm_get(&i915->runtime_pm); + + if (preempt_client_init(i915, &a)) + goto err_unlock; + if (preempt_client_init(i915, &b)) + goto err_client_a; + + for_each_engine(engine, i915, id) { + struct i915_request *rq_a, *rq_b; + int depth; + + if (!intel_engine_has_preemption(engine)) + continue; + + engine->execlists.preempt_hang.count = 0; + + rq_a = igt_spinner_create_request(&a.spin, + a.ctx, engine, + MI_NOOP); + if (IS_ERR(rq_a)) { + err = PTR_ERR(rq_a); + goto err_client_b; + } + + i915_request_add(rq_a); + if (!igt_wait_for_spinner(&a.spin, rq_a)) { + pr_err("First client failed to start\n"); + goto err_wedged; + } + + for (depth = 0; depth < 8; depth++) { + rq_b = igt_spinner_create_request(&b.spin, + b.ctx, engine, + MI_NOOP); + if (IS_ERR(rq_b)) { + err = PTR_ERR(rq_b); + goto err_client_b; + } + i915_request_add(rq_b); + + GEM_BUG_ON(i915_request_completed(rq_a)); + engine->schedule(rq_a, &attr); + igt_spinner_end(&a.spin); + + if (!igt_wait_for_spinner(&b.spin, rq_b)) { + pr_err("Second client failed to start\n"); + goto err_wedged; + } + + swap(a, b); + rq_a = rq_b; + } + igt_spinner_end(&a.spin); + + if (engine->execlists.preempt_hang.count) { + pr_err("Preemption recorded x%d, depth %d; should have been suppressed!\n", + engine->execlists.preempt_hang.count, + depth); + err = -EINVAL; + goto err_client_b; + } + + if (igt_flush_test(i915, I915_WAIT_LOCKED)) + goto err_wedged; + } + + err = 0; +err_client_b: + preempt_client_fini(&b); +err_client_a: + preempt_client_fini(&a); +err_unlock: + if (igt_flush_test(i915, I915_WAIT_LOCKED)) + err = -EIO; + intel_runtime_pm_put(&i915->runtime_pm, wakeref); + mutex_unlock(&i915->drm.struct_mutex); + return err; + +err_wedged: + igt_spinner_end(&b.spin); + igt_spinner_end(&a.spin); + i915_gem_set_wedged(i915); + err = -EIO; + goto err_client_b; +} + +static int __i915_sw_fence_call +dummy_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state) +{ + return NOTIFY_DONE; +} + +static struct i915_request *dummy_request(struct intel_engine_cs *engine) +{ + struct i915_request *rq; + + rq = kzalloc(sizeof(*rq), GFP_KERNEL); + if (!rq) + return NULL; + + INIT_LIST_HEAD(&rq->active_list); + rq->engine = engine; + + i915_sched_node_init(&rq->sched); + + /* mark this request as permanently incomplete */ + rq->fence.seqno = 1; + BUILD_BUG_ON(sizeof(rq->fence.seqno) != 8); /* upper 32b == 0 */ + rq->hwsp_seqno = (u32 *)&rq->fence.seqno + 1; + GEM_BUG_ON(i915_request_completed(rq)); + + i915_sw_fence_init(&rq->submit, dummy_notify); + set_bit(I915_FENCE_FLAG_ACTIVE, &rq->fence.flags); + + return rq; +} + +static void dummy_request_free(struct i915_request *dummy) +{ + /* We have to fake the CS interrupt to kick the next request */ + i915_sw_fence_commit(&dummy->submit); + + 
i915_request_mark_complete(dummy); + dma_fence_signal(&dummy->fence); + + i915_sched_node_fini(&dummy->sched); + i915_sw_fence_fini(&dummy->submit); + + dma_fence_free(&dummy->fence); +} + +static int live_suppress_wait_preempt(void *arg) +{ + struct drm_i915_private *i915 = arg; + struct preempt_client client[4]; + struct intel_engine_cs *engine; + enum intel_engine_id id; + intel_wakeref_t wakeref; + int err = -ENOMEM; + int i; + + /* + * Waiters are given a little priority nudge, but not enough + * to actually cause any preemption. Double check that we do + * not needlessly generate preempt-to-idle cycles. + */ + + if (!HAS_LOGICAL_RING_PREEMPTION(i915)) + return 0; + + mutex_lock(&i915->drm.struct_mutex); + wakeref = intel_runtime_pm_get(&i915->runtime_pm); + + if (preempt_client_init(i915, &client[0])) /* ELSP[0] */ + goto err_unlock; + if (preempt_client_init(i915, &client[1])) /* ELSP[1] */ + goto err_client_0; + if (preempt_client_init(i915, &client[2])) /* head of queue */ + goto err_client_1; + if (preempt_client_init(i915, &client[3])) /* bystander */ + goto err_client_2; + + for_each_engine(engine, i915, id) { + int depth; + + if (!intel_engine_has_preemption(engine)) + continue; + + if (!engine->emit_init_breadcrumb) + continue; + + for (depth = 0; depth < ARRAY_SIZE(client); depth++) { + struct i915_request *rq[ARRAY_SIZE(client)]; + struct i915_request *dummy; + + engine->execlists.preempt_hang.count = 0; + + dummy = dummy_request(engine); + if (!dummy) + goto err_client_3; + + for (i = 0; i < ARRAY_SIZE(client); i++) { + rq[i] = igt_spinner_create_request(&client[i].spin, + client[i].ctx, engine, + MI_NOOP); + if (IS_ERR(rq[i])) { + err = PTR_ERR(rq[i]); + goto err_wedged; + } + + /* Disable NEWCLIENT promotion */ + __i915_active_request_set(&rq[i]->timeline->last_request, + dummy); + i915_request_add(rq[i]); + } + + dummy_request_free(dummy); + + GEM_BUG_ON(i915_request_completed(rq[0])); + if (!igt_wait_for_spinner(&client[0].spin, rq[0])) { + pr_err("%s: First client failed to start\n", + engine->name); + goto err_wedged; + } + GEM_BUG_ON(!i915_request_started(rq[0])); + + if (i915_request_wait(rq[depth], + I915_WAIT_PRIORITY, + 1) != -ETIME) { + pr_err("%s: Waiter depth:%d completed!\n", + engine->name, depth); + goto err_wedged; + } + + for (i = 0; i < ARRAY_SIZE(client); i++) + igt_spinner_end(&client[i].spin); + + if (igt_flush_test(i915, I915_WAIT_LOCKED)) + goto err_wedged; + + if (engine->execlists.preempt_hang.count) { + pr_err("%s: Preemption recorded x%d, depth %d; should have been suppressed!\n", + engine->name, + engine->execlists.preempt_hang.count, + depth); + err = -EINVAL; + goto err_client_3; + } + } + } + + err = 0; +err_client_3: + preempt_client_fini(&client[3]); +err_client_2: + preempt_client_fini(&client[2]); +err_client_1: + preempt_client_fini(&client[1]); +err_client_0: + preempt_client_fini(&client[0]); +err_unlock: + if (igt_flush_test(i915, I915_WAIT_LOCKED)) + err = -EIO; + intel_runtime_pm_put(&i915->runtime_pm, wakeref); + mutex_unlock(&i915->drm.struct_mutex); + return err; + +err_wedged: + for (i = 0; i < ARRAY_SIZE(client); i++) + igt_spinner_end(&client[i].spin); + i915_gem_set_wedged(i915); + err = -EIO; + goto err_client_3; +} + +static int live_chain_preempt(void *arg) +{ + struct drm_i915_private *i915 = arg; + struct intel_engine_cs *engine; + struct preempt_client hi, lo; + enum intel_engine_id id; + intel_wakeref_t wakeref; + int err = -ENOMEM; + + /* + * Build a chain AB...BA between two contexts (A, B) and request + * 
preemption of the last request. It should then complete before + * the previously submitted spinner in B. + */ + + if (!HAS_LOGICAL_RING_PREEMPTION(i915)) + return 0; + + mutex_lock(&i915->drm.struct_mutex); + wakeref = intel_runtime_pm_get(&i915->runtime_pm); + + if (preempt_client_init(i915, &hi)) + goto err_unlock; + + if (preempt_client_init(i915, &lo)) + goto err_client_hi; + + for_each_engine(engine, i915, id) { + struct i915_sched_attr attr = { + .priority = I915_USER_PRIORITY(I915_PRIORITY_MAX), + }; + struct igt_live_test t; + struct i915_request *rq; + int ring_size, count, i; + + if (!intel_engine_has_preemption(engine)) + continue; + + rq = igt_spinner_create_request(&lo.spin, + lo.ctx, engine, + MI_ARB_CHECK); + if (IS_ERR(rq)) + goto err_wedged; + i915_request_add(rq); + + ring_size = rq->wa_tail - rq->head; + if (ring_size < 0) + ring_size += rq->ring->size; + ring_size = rq->ring->size / ring_size; + pr_debug("%s(%s): Using maximum of %d requests\n", + __func__, engine->name, ring_size); + + igt_spinner_end(&lo.spin); + if (i915_request_wait(rq, 0, HZ / 2) < 0) { + pr_err("Timed out waiting to flush %s\n", engine->name); + goto err_wedged; + } + + if (igt_live_test_begin(&t, i915, __func__, engine->name)) { + err = -EIO; + goto err_wedged; + } + + for_each_prime_number_from(count, 1, ring_size) { + rq = igt_spinner_create_request(&hi.spin, + hi.ctx, engine, + MI_ARB_CHECK); + if (IS_ERR(rq)) + goto err_wedged; + i915_request_add(rq); + if (!igt_wait_for_spinner(&hi.spin, rq)) + goto err_wedged; + + rq = igt_spinner_create_request(&lo.spin, + lo.ctx, engine, + MI_ARB_CHECK); + if (IS_ERR(rq)) + goto err_wedged; + i915_request_add(rq); + + for (i = 0; i < count; i++) { + rq = igt_request_alloc(lo.ctx, engine); + if (IS_ERR(rq)) + goto err_wedged; + i915_request_add(rq); + } + + rq = igt_request_alloc(hi.ctx, engine); + if (IS_ERR(rq)) + goto err_wedged; + i915_request_add(rq); + engine->schedule(rq, &attr); + + igt_spinner_end(&hi.spin); + if (i915_request_wait(rq, 0, HZ / 5) < 0) { + struct drm_printer p = + drm_info_printer(i915->drm.dev); + + pr_err("Failed to preempt over chain of %d\n", + count); + intel_engine_dump(engine, &p, + "%s\n", engine->name); + goto err_wedged; + } + igt_spinner_end(&lo.spin); + + rq = igt_request_alloc(lo.ctx, engine); + if (IS_ERR(rq)) + goto err_wedged; + i915_request_add(rq); + if (i915_request_wait(rq, 0, HZ / 5) < 0) { + struct drm_printer p = + drm_info_printer(i915->drm.dev); + + pr_err("Failed to flush low priority chain of %d requests\n", + count); + intel_engine_dump(engine, &p, + "%s\n", engine->name); + goto err_wedged; + } + } + + if (igt_live_test_end(&t)) { + err = -EIO; + goto err_wedged; + } + } + + err = 0; +err_client_lo: + preempt_client_fini(&lo); +err_client_hi: + preempt_client_fini(&hi); +err_unlock: + if (igt_flush_test(i915, I915_WAIT_LOCKED)) + err = -EIO; + intel_runtime_pm_put(&i915->runtime_pm, wakeref); + mutex_unlock(&i915->drm.struct_mutex); + return err; + +err_wedged: + igt_spinner_end(&hi.spin); + igt_spinner_end(&lo.spin); + i915_gem_set_wedged(i915); + err = -EIO; + goto err_client_lo; +} + +static int live_preempt_hang(void *arg) +{ + struct drm_i915_private *i915 = arg; + struct i915_gem_context *ctx_hi, *ctx_lo; + struct igt_spinner spin_hi, spin_lo; + struct intel_engine_cs *engine; + enum intel_engine_id id; + intel_wakeref_t wakeref; + int err = -ENOMEM; + + if (!HAS_LOGICAL_RING_PREEMPTION(i915)) + return 0; + + if (!intel_has_reset_engine(i915)) + return 0; + + mutex_lock(&i915->drm.struct_mutex); 
+ wakeref = intel_runtime_pm_get(&i915->runtime_pm); + + if (igt_spinner_init(&spin_hi, i915)) + goto err_unlock; + + if (igt_spinner_init(&spin_lo, i915)) + goto err_spin_hi; + + ctx_hi = kernel_context(i915); + if (!ctx_hi) + goto err_spin_lo; + ctx_hi->sched.priority = + I915_USER_PRIORITY(I915_CONTEXT_MAX_USER_PRIORITY); + + ctx_lo = kernel_context(i915); + if (!ctx_lo) + goto err_ctx_hi; + ctx_lo->sched.priority = + I915_USER_PRIORITY(I915_CONTEXT_MIN_USER_PRIORITY); + + for_each_engine(engine, i915, id) { + struct i915_request *rq; + + if (!intel_engine_has_preemption(engine)) + continue; + + rq = igt_spinner_create_request(&spin_lo, ctx_lo, engine, + MI_ARB_CHECK); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto err_ctx_lo; + } + + i915_request_add(rq); + if (!igt_wait_for_spinner(&spin_lo, rq)) { + GEM_TRACE("lo spinner failed to start\n"); + GEM_TRACE_DUMP(); + i915_gem_set_wedged(i915); + err = -EIO; + goto err_ctx_lo; + } + + rq = igt_spinner_create_request(&spin_hi, ctx_hi, engine, + MI_ARB_CHECK); + if (IS_ERR(rq)) { + igt_spinner_end(&spin_lo); + err = PTR_ERR(rq); + goto err_ctx_lo; + } + + init_completion(&engine->execlists.preempt_hang.completion); + engine->execlists.preempt_hang.inject_hang = true; + + i915_request_add(rq); + + if (!wait_for_completion_timeout(&engine->execlists.preempt_hang.completion, + HZ / 10)) { + pr_err("Preemption did not occur within timeout!"); + GEM_TRACE_DUMP(); + i915_gem_set_wedged(i915); + err = -EIO; + goto err_ctx_lo; + } + + set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags); + i915_reset_engine(engine, NULL); + clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags); + + engine->execlists.preempt_hang.inject_hang = false; + + if (!igt_wait_for_spinner(&spin_hi, rq)) { + GEM_TRACE("hi spinner failed to start\n"); + GEM_TRACE_DUMP(); + i915_gem_set_wedged(i915); + err = -EIO; + goto err_ctx_lo; + } + + igt_spinner_end(&spin_hi); + igt_spinner_end(&spin_lo); + if (igt_flush_test(i915, I915_WAIT_LOCKED)) { + err = -EIO; + goto err_ctx_lo; + } + } + + err = 0; +err_ctx_lo: + kernel_context_close(ctx_lo); +err_ctx_hi: + kernel_context_close(ctx_hi); +err_spin_lo: + igt_spinner_fini(&spin_lo); +err_spin_hi: + igt_spinner_fini(&spin_hi); +err_unlock: + igt_flush_test(i915, I915_WAIT_LOCKED); + intel_runtime_pm_put(&i915->runtime_pm, wakeref); + mutex_unlock(&i915->drm.struct_mutex); + return err; +} + +static int random_range(struct rnd_state *rnd, int min, int max) +{ + return i915_prandom_u32_max_state(max - min, rnd) + min; +} + +static int random_priority(struct rnd_state *rnd) +{ + return random_range(rnd, I915_PRIORITY_MIN, I915_PRIORITY_MAX); +} + +struct preempt_smoke { + struct drm_i915_private *i915; + struct i915_gem_context **contexts; + struct intel_engine_cs *engine; + struct drm_i915_gem_object *batch; + unsigned int ncontext; + struct rnd_state prng; + unsigned long count; +}; + +static struct i915_gem_context *smoke_context(struct preempt_smoke *smoke) +{ + return smoke->contexts[i915_prandom_u32_max_state(smoke->ncontext, + &smoke->prng)]; +} + +static int smoke_submit(struct preempt_smoke *smoke, + struct i915_gem_context *ctx, int prio, + struct drm_i915_gem_object *batch) +{ + struct i915_request *rq; + struct i915_vma *vma = NULL; + int err = 0; + + if (batch) { + vma = i915_vma_instance(batch, ctx->vm, NULL); + if (IS_ERR(vma)) + return PTR_ERR(vma); + + err = i915_vma_pin(vma, 0, 0, PIN_USER); + if (err) + return err; + } + + ctx->sched.priority = prio; + + rq = igt_request_alloc(ctx, smoke->engine); + if 
(IS_ERR(rq)) {
+		err = PTR_ERR(rq);
+		goto unpin;
+	}
+
+	if (vma) {
+		i915_vma_lock(vma);
+		err = rq->engine->emit_bb_start(rq,
+						vma->node.start,
+						PAGE_SIZE, 0);
+		if (!err)
+			err = i915_vma_move_to_active(vma, rq, 0);
+		i915_vma_unlock(vma);
+	}
+
+	i915_request_add(rq);
+
+unpin:
+	if (vma)
+		i915_vma_unpin(vma);
+
+	return err;
+}
+
+static int smoke_crescendo_thread(void *arg)
+{
+	struct preempt_smoke *smoke = arg;
+	IGT_TIMEOUT(end_time);
+	unsigned long count;
+
+	count = 0;
+	do {
+		struct i915_gem_context *ctx = smoke_context(smoke);
+		int err;
+
+		mutex_lock(&smoke->i915->drm.struct_mutex);
+		err = smoke_submit(smoke,
+				   ctx, count % I915_PRIORITY_MAX,
+				   smoke->batch);
+		mutex_unlock(&smoke->i915->drm.struct_mutex);
+		if (err)
+			return err;
+
+		count++;
+	} while (!__igt_timeout(end_time, NULL));
+
+	smoke->count = count;
+	return 0;
+}
+
+static int smoke_crescendo(struct preempt_smoke *smoke, unsigned int flags)
+#define BATCH BIT(0)
+{
+	struct task_struct *tsk[I915_NUM_ENGINES] = {};
+	struct preempt_smoke arg[I915_NUM_ENGINES];
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
+	unsigned long count;
+	int err = 0;
+
+	mutex_unlock(&smoke->i915->drm.struct_mutex);
+
+	for_each_engine(engine, smoke->i915, id) {
+		arg[id] = *smoke;
+		arg[id].engine = engine;
+		if (!(flags & BATCH))
+			arg[id].batch = NULL;
+		arg[id].count = 0;
+
+		tsk[id] = kthread_run(smoke_crescendo_thread, &arg[id],
+				      "igt/smoke:%d", id);
+		if (IS_ERR(tsk[id])) {
+			err = PTR_ERR(tsk[id]);
+			break;
+		}
+		get_task_struct(tsk[id]);
+	}
+
+	count = 0;
+	for_each_engine(engine, smoke->i915, id) {
+		int status;
+
+		if (IS_ERR_OR_NULL(tsk[id]))
+			continue;
+
+		status = kthread_stop(tsk[id]);
+		if (status && !err)
+			err = status;
+
+		count += arg[id].count;
+
+		put_task_struct(tsk[id]);
+	}
+
+	mutex_lock(&smoke->i915->drm.struct_mutex);
+
+	pr_info("Submitted %lu crescendo:%x requests across %d engines and %d contexts\n",
+		count, flags,
+		RUNTIME_INFO(smoke->i915)->num_engines, smoke->ncontext);
+	return err;
+}
+
+static int smoke_random(struct preempt_smoke *smoke, unsigned int flags)
+{
+	enum intel_engine_id id;
+	IGT_TIMEOUT(end_time);
+	unsigned long count;
+
+	count = 0;
+	do {
+		for_each_engine(smoke->engine, smoke->i915, id) {
+			struct i915_gem_context *ctx = smoke_context(smoke);
+			int err;
+
+			err = smoke_submit(smoke,
+					   ctx, random_priority(&smoke->prng),
+					   flags & BATCH ?
smoke->batch : NULL); + if (err) + return err; + + count++; + } + } while (!__igt_timeout(end_time, NULL)); + + pr_info("Submitted %lu random:%x requests across %d engines and %d contexts\n", + count, flags, + RUNTIME_INFO(smoke->i915)->num_engines, smoke->ncontext); + return 0; +} + +static int live_preempt_smoke(void *arg) +{ + struct preempt_smoke smoke = { + .i915 = arg, + .prng = I915_RND_STATE_INITIALIZER(i915_selftest.random_seed), + .ncontext = 1024, + }; + const unsigned int phase[] = { 0, BATCH }; + intel_wakeref_t wakeref; + struct igt_live_test t; + int err = -ENOMEM; + u32 *cs; + int n; + + if (!HAS_LOGICAL_RING_PREEMPTION(smoke.i915)) + return 0; + + smoke.contexts = kmalloc_array(smoke.ncontext, + sizeof(*smoke.contexts), + GFP_KERNEL); + if (!smoke.contexts) + return -ENOMEM; + + mutex_lock(&smoke.i915->drm.struct_mutex); + wakeref = intel_runtime_pm_get(&smoke.i915->runtime_pm); + + smoke.batch = i915_gem_object_create_internal(smoke.i915, PAGE_SIZE); + if (IS_ERR(smoke.batch)) { + err = PTR_ERR(smoke.batch); + goto err_unlock; + } + + cs = i915_gem_object_pin_map(smoke.batch, I915_MAP_WB); + if (IS_ERR(cs)) { + err = PTR_ERR(cs); + goto err_batch; + } + for (n = 0; n < PAGE_SIZE / sizeof(*cs) - 1; n++) + cs[n] = MI_ARB_CHECK; + cs[n] = MI_BATCH_BUFFER_END; + i915_gem_object_flush_map(smoke.batch); + i915_gem_object_unpin_map(smoke.batch); + + if (igt_live_test_begin(&t, smoke.i915, __func__, "all")) { + err = -EIO; + goto err_batch; + } + + for (n = 0; n < smoke.ncontext; n++) { + smoke.contexts[n] = kernel_context(smoke.i915); + if (!smoke.contexts[n]) + goto err_ctx; + } + + for (n = 0; n < ARRAY_SIZE(phase); n++) { + err = smoke_crescendo(&smoke, phase[n]); + if (err) + goto err_ctx; + + err = smoke_random(&smoke, phase[n]); + if (err) + goto err_ctx; + } + +err_ctx: + if (igt_live_test_end(&t)) + err = -EIO; + + for (n = 0; n < smoke.ncontext; n++) { + if (!smoke.contexts[n]) + break; + kernel_context_close(smoke.contexts[n]); + } + +err_batch: + i915_gem_object_put(smoke.batch); +err_unlock: + intel_runtime_pm_put(&smoke.i915->runtime_pm, wakeref); + mutex_unlock(&smoke.i915->drm.struct_mutex); + kfree(smoke.contexts); + + return err; +} + +static int nop_virtual_engine(struct drm_i915_private *i915, + struct intel_engine_cs **siblings, + unsigned int nsibling, + unsigned int nctx, + unsigned int flags) +#define CHAIN BIT(0) +{ + IGT_TIMEOUT(end_time); + struct i915_request *request[16]; + struct i915_gem_context *ctx[16]; + struct intel_context *ve[16]; + unsigned long n, prime, nc; + struct igt_live_test t; + ktime_t times[2] = {}; + int err; + + GEM_BUG_ON(!nctx || nctx > ARRAY_SIZE(ctx)); + + for (n = 0; n < nctx; n++) { + ctx[n] = kernel_context(i915); + if (!ctx[n]) { + err = -ENOMEM; + nctx = n; + goto out; + } + + ve[n] = intel_execlists_create_virtual(ctx[n], + siblings, nsibling); + if (IS_ERR(ve[n])) { + kernel_context_close(ctx[n]); + err = PTR_ERR(ve[n]); + nctx = n; + goto out; + } + + err = intel_context_pin(ve[n]); + if (err) { + intel_context_put(ve[n]); + kernel_context_close(ctx[n]); + nctx = n; + goto out; + } + } + + err = igt_live_test_begin(&t, i915, __func__, ve[0]->engine->name); + if (err) + goto out; + + for_each_prime_number_from(prime, 1, 8192) { + times[1] = ktime_get_raw(); + + if (flags & CHAIN) { + for (nc = 0; nc < nctx; nc++) { + for (n = 0; n < prime; n++) { + request[nc] = + i915_request_create(ve[nc]); + if (IS_ERR(request[nc])) { + err = PTR_ERR(request[nc]); + goto out; + } + + i915_request_add(request[nc]); + } + } + } else { 
+ for (n = 0; n < prime; n++) { + for (nc = 0; nc < nctx; nc++) { + request[nc] = + i915_request_create(ve[nc]); + if (IS_ERR(request[nc])) { + err = PTR_ERR(request[nc]); + goto out; + } + + i915_request_add(request[nc]); + } + } + } + + for (nc = 0; nc < nctx; nc++) { + if (i915_request_wait(request[nc], 0, HZ / 10) < 0) { + pr_err("%s(%s): wait for %llx:%lld timed out\n", + __func__, ve[0]->engine->name, + request[nc]->fence.context, + request[nc]->fence.seqno); + + GEM_TRACE("%s(%s) failed at request %llx:%lld\n", + __func__, ve[0]->engine->name, + request[nc]->fence.context, + request[nc]->fence.seqno); + GEM_TRACE_DUMP(); + i915_gem_set_wedged(i915); + break; + } + } + + times[1] = ktime_sub(ktime_get_raw(), times[1]); + if (prime == 1) + times[0] = times[1]; + + if (__igt_timeout(end_time, NULL)) + break; + } + + err = igt_live_test_end(&t); + if (err) + goto out; + + pr_info("Requestx%d latencies on %s: 1 = %lluns, %lu = %lluns\n", + nctx, ve[0]->engine->name, ktime_to_ns(times[0]), + prime, div64_u64(ktime_to_ns(times[1]), prime)); + +out: + if (igt_flush_test(i915, I915_WAIT_LOCKED)) + err = -EIO; + + for (nc = 0; nc < nctx; nc++) { + intel_context_unpin(ve[nc]); + intel_context_put(ve[nc]); + kernel_context_close(ctx[nc]); + } + return err; +} + +static int live_virtual_engine(void *arg) +{ + struct drm_i915_private *i915 = arg; + struct intel_engine_cs *siblings[MAX_ENGINE_INSTANCE + 1]; + struct intel_engine_cs *engine; + enum intel_engine_id id; + unsigned int class, inst; + int err = -ENODEV; + + if (USES_GUC_SUBMISSION(i915)) + return 0; + + mutex_lock(&i915->drm.struct_mutex); + + for_each_engine(engine, i915, id) { + err = nop_virtual_engine(i915, &engine, 1, 1, 0); + if (err) { + pr_err("Failed to wrap engine %s: err=%d\n", + engine->name, err); + goto out_unlock; + } + } + + for (class = 0; class <= MAX_ENGINE_CLASS; class++) { + int nsibling, n; + + nsibling = 0; + for (inst = 0; inst <= MAX_ENGINE_INSTANCE; inst++) { + if (!i915->engine_class[class][inst]) + continue; + + siblings[nsibling++] = i915->engine_class[class][inst]; + } + if (nsibling < 2) + continue; + + for (n = 1; n <= nsibling + 1; n++) { + err = nop_virtual_engine(i915, siblings, nsibling, + n, 0); + if (err) + goto out_unlock; + } + + err = nop_virtual_engine(i915, siblings, nsibling, n, CHAIN); + if (err) + goto out_unlock; + } + +out_unlock: + mutex_unlock(&i915->drm.struct_mutex); + return err; +} + +static int mask_virtual_engine(struct drm_i915_private *i915, + struct intel_engine_cs **siblings, + unsigned int nsibling) +{ + struct i915_request *request[MAX_ENGINE_INSTANCE + 1]; + struct i915_gem_context *ctx; + struct intel_context *ve; + struct igt_live_test t; + unsigned int n; + int err; + + /* + * Check that by setting the execution mask on a request, we can + * restrict it to our desired engine within the virtual engine. 
+ */ + + ctx = kernel_context(i915); + if (!ctx) + return -ENOMEM; + + ve = intel_execlists_create_virtual(ctx, siblings, nsibling); + if (IS_ERR(ve)) { + err = PTR_ERR(ve); + goto out_close; + } + + err = intel_context_pin(ve); + if (err) + goto out_put; + + err = igt_live_test_begin(&t, i915, __func__, ve->engine->name); + if (err) + goto out_unpin; + + for (n = 0; n < nsibling; n++) { + request[n] = i915_request_create(ve); + if (IS_ERR(request[n])) { + err = PTR_ERR(request[n]); + nsibling = n; + goto out; + } + + /* Reverse order as it's more likely to be unnatural */ + request[n]->execution_mask = siblings[nsibling - n - 1]->mask; + + i915_request_get(request[n]); + i915_request_add(request[n]); + } + + for (n = 0; n < nsibling; n++) { + if (i915_request_wait(request[n], 0, HZ / 10) < 0) { + pr_err("%s(%s): wait for %llx:%lld timed out\n", + __func__, ve->engine->name, + request[n]->fence.context, + request[n]->fence.seqno); + + GEM_TRACE("%s(%s) failed at request %llx:%lld\n", + __func__, ve->engine->name, + request[n]->fence.context, + request[n]->fence.seqno); + GEM_TRACE_DUMP(); + i915_gem_set_wedged(i915); + err = -EIO; + goto out; + } + + if (request[n]->engine != siblings[nsibling - n - 1]) { + pr_err("Executed on wrong sibling '%s', expected '%s'\n", + request[n]->engine->name, + siblings[nsibling - n - 1]->name); + err = -EINVAL; + goto out; + } + } + + err = igt_live_test_end(&t); + if (err) + goto out; + +out: + if (igt_flush_test(i915, I915_WAIT_LOCKED)) + err = -EIO; + + for (n = 0; n < nsibling; n++) + i915_request_put(request[n]); + +out_unpin: + intel_context_unpin(ve); +out_put: + intel_context_put(ve); +out_close: + kernel_context_close(ctx); + return err; +} + +static int live_virtual_mask(void *arg) +{ + struct drm_i915_private *i915 = arg; + struct intel_engine_cs *siblings[MAX_ENGINE_INSTANCE + 1]; + unsigned int class, inst; + int err = 0; + + if (USES_GUC_SUBMISSION(i915)) + return 0; + + mutex_lock(&i915->drm.struct_mutex); + + for (class = 0; class <= MAX_ENGINE_CLASS; class++) { + unsigned int nsibling; + + nsibling = 0; + for (inst = 0; inst <= MAX_ENGINE_INSTANCE; inst++) { + if (!i915->engine_class[class][inst]) + break; + + siblings[nsibling++] = i915->engine_class[class][inst]; + } + if (nsibling < 2) + continue; + + err = mask_virtual_engine(i915, siblings, nsibling); + if (err) + goto out_unlock; + } + +out_unlock: + mutex_unlock(&i915->drm.struct_mutex); + return err; +} + +static int bond_virtual_engine(struct drm_i915_private *i915, + unsigned int class, + struct intel_engine_cs **siblings, + unsigned int nsibling, + unsigned int flags) +#define BOND_SCHEDULE BIT(0) +{ + struct intel_engine_cs *master; + struct i915_gem_context *ctx; + struct i915_request *rq[16]; + enum intel_engine_id id; + unsigned long n; + int err; + + GEM_BUG_ON(nsibling >= ARRAY_SIZE(rq) - 1); + + ctx = kernel_context(i915); + if (!ctx) + return -ENOMEM; + + err = 0; + rq[0] = ERR_PTR(-ENOMEM); + for_each_engine(master, i915, id) { + struct i915_sw_fence fence = {}; + + if (master->class == class) + continue; + + memset_p((void *)rq, ERR_PTR(-EINVAL), ARRAY_SIZE(rq)); + + rq[0] = igt_request_alloc(ctx, master); + if (IS_ERR(rq[0])) { + err = PTR_ERR(rq[0]); + goto out; + } + i915_request_get(rq[0]); + + if (flags & BOND_SCHEDULE) { + onstack_fence_init(&fence); + err = i915_sw_fence_await_sw_fence_gfp(&rq[0]->submit, + &fence, + GFP_KERNEL); + } + i915_request_add(rq[0]); + if (err < 0) + goto out; + + for (n = 0; n < nsibling; n++) { + struct intel_context *ve; + + ve = 
intel_execlists_create_virtual(ctx, + siblings, + nsibling); + if (IS_ERR(ve)) { + err = PTR_ERR(ve); + onstack_fence_fini(&fence); + goto out; + } + + err = intel_virtual_engine_attach_bond(ve->engine, + master, + siblings[n]); + if (err) { + intel_context_put(ve); + onstack_fence_fini(&fence); + goto out; + } + + err = intel_context_pin(ve); + intel_context_put(ve); + if (err) { + onstack_fence_fini(&fence); + goto out; + } + + rq[n + 1] = i915_request_create(ve); + intel_context_unpin(ve); + if (IS_ERR(rq[n + 1])) { + err = PTR_ERR(rq[n + 1]); + onstack_fence_fini(&fence); + goto out; + } + i915_request_get(rq[n + 1]); + + err = i915_request_await_execution(rq[n + 1], + &rq[0]->fence, + ve->engine->bond_execute); + i915_request_add(rq[n + 1]); + if (err < 0) { + onstack_fence_fini(&fence); + goto out; + } + } + onstack_fence_fini(&fence); + + if (i915_request_wait(rq[0], 0, HZ / 10) < 0) { + pr_err("Master request did not execute (on %s)!\n", + rq[0]->engine->name); + err = -EIO; + goto out; + } + + for (n = 0; n < nsibling; n++) { + if (i915_request_wait(rq[n + 1], 0, + MAX_SCHEDULE_TIMEOUT) < 0) { + err = -EIO; + goto out; + } + + if (rq[n + 1]->engine != siblings[n]) { + pr_err("Bonded request did not execute on target engine: expected %s, used %s; master was %s\n", + siblings[n]->name, + rq[n + 1]->engine->name, + rq[0]->engine->name); + err = -EINVAL; + goto out; + } + } + + for (n = 0; !IS_ERR(rq[n]); n++) + i915_request_put(rq[n]); + rq[0] = ERR_PTR(-ENOMEM); + } + +out: + for (n = 0; !IS_ERR(rq[n]); n++) + i915_request_put(rq[n]); + if (igt_flush_test(i915, I915_WAIT_LOCKED)) + err = -EIO; + + kernel_context_close(ctx); + return err; +} + +static int live_virtual_bond(void *arg) +{ + static const struct phase { + const char *name; + unsigned int flags; + } phases[] = { + { "", 0 }, + { "schedule", BOND_SCHEDULE }, + { }, + }; + struct drm_i915_private *i915 = arg; + struct intel_engine_cs *siblings[MAX_ENGINE_INSTANCE + 1]; + unsigned int class, inst; + int err = 0; + + if (USES_GUC_SUBMISSION(i915)) + return 0; + + mutex_lock(&i915->drm.struct_mutex); + + for (class = 0; class <= MAX_ENGINE_CLASS; class++) { + const struct phase *p; + int nsibling; + + nsibling = 0; + for (inst = 0; inst <= MAX_ENGINE_INSTANCE; inst++) { + if (!i915->engine_class[class][inst]) + break; + + GEM_BUG_ON(nsibling == ARRAY_SIZE(siblings)); + siblings[nsibling++] = i915->engine_class[class][inst]; + } + if (nsibling < 2) + continue; + + for (p = phases; p->name; p++) { + err = bond_virtual_engine(i915, + class, siblings, nsibling, + p->flags); + if (err) { + pr_err("%s(%s): failed class=%d, nsibling=%d, err=%d\n", + __func__, p->name, class, nsibling, err); + goto out_unlock; + } + } + } + +out_unlock: + mutex_unlock(&i915->drm.struct_mutex); + return err; +} + +int intel_execlists_live_selftests(struct drm_i915_private *i915) +{ + static const struct i915_subtest tests[] = { + SUBTEST(live_sanitycheck), + SUBTEST(live_busywait_preempt), + SUBTEST(live_preempt), + SUBTEST(live_late_preempt), + SUBTEST(live_suppress_self_preempt), + SUBTEST(live_suppress_wait_preempt), + SUBTEST(live_chain_preempt), + SUBTEST(live_preempt_hang), + SUBTEST(live_preempt_smoke), + SUBTEST(live_virtual_engine), + SUBTEST(live_virtual_mask), + SUBTEST(live_virtual_bond), + }; + + if (!HAS_EXECLISTS(i915)) + return 0; + + if (i915_terminally_wedged(i915)) + return 0; + + return i915_subtests(tests, i915); +} diff --git a/drivers/gpu/drm/i915/gt/selftest_reset.c b/drivers/gpu/drm/i915/gt/selftest_reset.c new file mode 100644 
index 000000000000..89da9e7cc1ba --- /dev/null +++ b/drivers/gpu/drm/i915/gt/selftest_reset.c @@ -0,0 +1,118 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2018 Intel Corporation + */ + +#include "i915_selftest.h" +#include "selftests/igt_reset.h" +#include "selftests/igt_atomic.h" + +static int igt_global_reset(void *arg) +{ + struct drm_i915_private *i915 = arg; + unsigned int reset_count; + int err = 0; + + /* Check that we can issue a global GPU reset */ + + igt_global_reset_lock(i915); + + reset_count = i915_reset_count(&i915->gpu_error); + + i915_reset(i915, ALL_ENGINES, NULL); + + if (i915_reset_count(&i915->gpu_error) == reset_count) { + pr_err("No GPU reset recorded!\n"); + err = -EINVAL; + } + + igt_global_reset_unlock(i915); + + if (i915_reset_failed(i915)) + err = -EIO; + + return err; +} + +static int igt_wedged_reset(void *arg) +{ + struct drm_i915_private *i915 = arg; + intel_wakeref_t wakeref; + + /* Check that we can recover a wedged device with a GPU reset */ + + igt_global_reset_lock(i915); + wakeref = intel_runtime_pm_get(&i915->runtime_pm); + + i915_gem_set_wedged(i915); + + GEM_BUG_ON(!i915_reset_failed(i915)); + i915_reset(i915, ALL_ENGINES, NULL); + + intel_runtime_pm_put(&i915->runtime_pm, wakeref); + igt_global_reset_unlock(i915); + + return i915_reset_failed(i915) ? -EIO : 0; +} + +static int igt_atomic_reset(void *arg) +{ + struct drm_i915_private *i915 = arg; + const typeof(*igt_atomic_phases) *p; + int err = 0; + + /* Check that the resets are usable from atomic context */ + + igt_global_reset_lock(i915); + mutex_lock(&i915->drm.struct_mutex); + + /* Flush any requests before we get started and check basics */ + if (!igt_force_reset(i915)) + goto unlock; + + for (p = igt_atomic_phases; p->name; p++) { + GEM_TRACE("intel_gpu_reset under %s\n", p->name); + + p->critical_section_begin(); + reset_prepare(i915); + err = intel_gpu_reset(i915, ALL_ENGINES); + reset_finish(i915); + p->critical_section_end(); + + if (err) { + pr_err("intel_gpu_reset failed under %s\n", p->name); + break; + } + } + + /* As we poke around the guts, do a full reset before continuing. 
*/ + igt_force_reset(i915); + +unlock: + mutex_unlock(&i915->drm.struct_mutex); + igt_global_reset_unlock(i915); + + return err; +} + +int intel_reset_live_selftests(struct drm_i915_private *i915) +{ + static const struct i915_subtest tests[] = { + SUBTEST(igt_global_reset), /* attempt to recover GPU first */ + SUBTEST(igt_wedged_reset), + SUBTEST(igt_atomic_reset), + }; + intel_wakeref_t wakeref; + int err = 0; + + if (!intel_has_gpu_reset(i915)) + return 0; + + if (i915_terminally_wedged(i915)) + return -EIO; /* we're long past hope of a successful reset */ + + with_intel_runtime_pm(&i915->runtime_pm, wakeref) + err = i915_subtests(tests, i915); + + return err; +} diff --git a/drivers/gpu/drm/i915/gt/selftest_workarounds.c b/drivers/gpu/drm/i915/gt/selftest_workarounds.c new file mode 100644 index 000000000000..9eaf030affd0 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/selftest_workarounds.c @@ -0,0 +1,1220 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright © 2018 Intel Corporation + */ + +#include "gem/i915_gem_pm.h" +#include "i915_selftest.h" +#include "intel_reset.h" + +#include "selftests/igt_flush_test.h" +#include "selftests/igt_reset.h" +#include "selftests/igt_spinner.h" +#include "selftests/igt_wedge_me.h" +#include "selftests/mock_drm.h" + +#include "gem/selftests/igt_gem_utils.h" +#include "gem/selftests/mock_context.h" + +static const struct wo_register { + enum intel_platform platform; + u32 reg; +} wo_registers[] = { + { INTEL_GEMINILAKE, 0x731c } +}; + +#define REF_NAME_MAX (INTEL_ENGINE_CS_MAX_NAME + 8) +struct wa_lists { + struct i915_wa_list gt_wa_list; + struct { + char name[REF_NAME_MAX]; + struct i915_wa_list wa_list; + struct i915_wa_list ctx_wa_list; + } engine[I915_NUM_ENGINES]; +}; + +static void +reference_lists_init(struct drm_i915_private *i915, struct wa_lists *lists) +{ + struct intel_engine_cs *engine; + enum intel_engine_id id; + + memset(lists, 0, sizeof(*lists)); + + wa_init_start(&lists->gt_wa_list, "GT_REF"); + gt_init_workarounds(i915, &lists->gt_wa_list); + wa_init_finish(&lists->gt_wa_list); + + for_each_engine(engine, i915, id) { + struct i915_wa_list *wal = &lists->engine[id].wa_list; + char *name = lists->engine[id].name; + + snprintf(name, REF_NAME_MAX, "%s_REF", engine->name); + + wa_init_start(wal, name); + engine_init_workarounds(engine, wal); + wa_init_finish(wal); + + snprintf(name, REF_NAME_MAX, "%s_CTX_REF", engine->name); + + __intel_engine_init_ctx_wa(engine, + &lists->engine[id].ctx_wa_list, + name); + } +} + +static void +reference_lists_fini(struct drm_i915_private *i915, struct wa_lists *lists) +{ + struct intel_engine_cs *engine; + enum intel_engine_id id; + + for_each_engine(engine, i915, id) + intel_wa_list_free(&lists->engine[id].wa_list); + + intel_wa_list_free(&lists->gt_wa_list); +} + +static struct drm_i915_gem_object * +read_nonprivs(struct i915_gem_context *ctx, struct intel_engine_cs *engine) +{ + const u32 base = engine->mmio_base; + struct drm_i915_gem_object *result; + struct i915_request *rq; + struct i915_vma *vma; + u32 srm, *cs; + int err; + int i; + + result = i915_gem_object_create_internal(engine->i915, PAGE_SIZE); + if (IS_ERR(result)) + return result; + + i915_gem_object_set_cache_coherency(result, I915_CACHE_LLC); + + cs = i915_gem_object_pin_map(result, I915_MAP_WB); + if (IS_ERR(cs)) { + err = PTR_ERR(cs); + goto err_obj; + } + memset(cs, 0xc5, PAGE_SIZE); + i915_gem_object_flush_map(result); + i915_gem_object_unpin_map(result); + + vma = i915_vma_instance(result, &engine->i915->ggtt.vm, NULL); + if 
(IS_ERR(vma)) { + err = PTR_ERR(vma); + goto err_obj; + } + + err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL); + if (err) + goto err_obj; + + rq = igt_request_alloc(ctx, engine); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto err_pin; + } + + i915_vma_lock(vma); + err = i915_vma_move_to_active(vma, rq, EXEC_OBJECT_WRITE); + i915_vma_unlock(vma); + if (err) + goto err_req; + + srm = MI_STORE_REGISTER_MEM | MI_SRM_LRM_GLOBAL_GTT; + if (INTEL_GEN(ctx->i915) >= 8) + srm++; + + cs = intel_ring_begin(rq, 4 * RING_MAX_NONPRIV_SLOTS); + if (IS_ERR(cs)) { + err = PTR_ERR(cs); + goto err_req; + } + + for (i = 0; i < RING_MAX_NONPRIV_SLOTS; i++) { + *cs++ = srm; + *cs++ = i915_mmio_reg_offset(RING_FORCE_TO_NONPRIV(base, i)); + *cs++ = i915_ggtt_offset(vma) + sizeof(u32) * i; + *cs++ = 0; + } + intel_ring_advance(rq, cs); + + i915_request_add(rq); + i915_vma_unpin(vma); + + return result; + +err_req: + i915_request_add(rq); +err_pin: + i915_vma_unpin(vma); +err_obj: + i915_gem_object_put(result); + return ERR_PTR(err); +} + +static u32 +get_whitelist_reg(const struct intel_engine_cs *engine, unsigned int i) +{ + i915_reg_t reg = i < engine->whitelist.count ? + engine->whitelist.list[i].reg : + RING_NOPID(engine->mmio_base); + + return i915_mmio_reg_offset(reg); +} + +static void +print_results(const struct intel_engine_cs *engine, const u32 *results) +{ + unsigned int i; + + for (i = 0; i < RING_MAX_NONPRIV_SLOTS; i++) { + u32 expected = get_whitelist_reg(engine, i); + u32 actual = results[i]; + + pr_info("RING_NONPRIV[%d]: expected 0x%08x, found 0x%08x\n", + i, expected, actual); + } +} + +static int check_whitelist(struct i915_gem_context *ctx, + struct intel_engine_cs *engine) +{ + struct drm_i915_gem_object *results; + struct igt_wedge_me wedge; + u32 *vaddr; + int err; + int i; + + results = read_nonprivs(ctx, engine); + if (IS_ERR(results)) + return PTR_ERR(results); + + err = 0; + i915_gem_object_lock(results); + igt_wedge_on_timeout(&wedge, ctx->i915, HZ / 5) /* a safety net! 
*/ + err = i915_gem_object_set_to_cpu_domain(results, false); + i915_gem_object_unlock(results); + if (i915_terminally_wedged(ctx->i915)) + err = -EIO; + if (err) + goto out_put; + + vaddr = i915_gem_object_pin_map(results, I915_MAP_WB); + if (IS_ERR(vaddr)) { + err = PTR_ERR(vaddr); + goto out_put; + } + + for (i = 0; i < RING_MAX_NONPRIV_SLOTS; i++) { + u32 expected = get_whitelist_reg(engine, i); + u32 actual = vaddr[i]; + + if (expected != actual) { + print_results(engine, vaddr); + pr_err("Invalid RING_NONPRIV[%d], expected 0x%08x, found 0x%08x\n", + i, expected, actual); + + err = -EINVAL; + break; + } + } + + i915_gem_object_unpin_map(results); +out_put: + i915_gem_object_put(results); + return err; +} + +static int do_device_reset(struct intel_engine_cs *engine) +{ + i915_reset(engine->i915, engine->mask, "live_workarounds"); + return 0; +} + +static int do_engine_reset(struct intel_engine_cs *engine) +{ + return i915_reset_engine(engine, "live_workarounds"); +} + +static int +switch_to_scratch_context(struct intel_engine_cs *engine, + struct igt_spinner *spin) +{ + struct i915_gem_context *ctx; + struct i915_request *rq; + intel_wakeref_t wakeref; + int err = 0; + + ctx = kernel_context(engine->i915); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + GEM_BUG_ON(i915_gem_context_is_bannable(ctx)); + + rq = ERR_PTR(-ENODEV); + with_intel_runtime_pm(&engine->i915->runtime_pm, wakeref) + rq = igt_spinner_create_request(spin, ctx, engine, MI_NOOP); + + kernel_context_close(ctx); + + if (IS_ERR(rq)) { + spin = NULL; + err = PTR_ERR(rq); + goto err; + } + + i915_request_add(rq); + + if (spin && !igt_wait_for_spinner(spin, rq)) { + pr_err("Spinner failed to start\n"); + err = -ETIMEDOUT; + } + +err: + if (err && spin) + igt_spinner_end(spin); + + return err; +} + +static int check_whitelist_across_reset(struct intel_engine_cs *engine, + int (*reset)(struct intel_engine_cs *), + const char *name) +{ + struct drm_i915_private *i915 = engine->i915; + struct i915_gem_context *ctx; + struct igt_spinner spin; + intel_wakeref_t wakeref; + int err; + + pr_info("Checking %d whitelisted registers (RING_NONPRIV) [%s]\n", + engine->whitelist.count, name); + + err = igt_spinner_init(&spin, i915); + if (err) + return err; + + ctx = kernel_context(i915); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + err = check_whitelist(ctx, engine); + if (err) { + pr_err("Invalid whitelist *before* %s reset!\n", name); + goto out; + } + + err = switch_to_scratch_context(engine, &spin); + if (err) + goto out; + + with_intel_runtime_pm(&i915->runtime_pm, wakeref) + err = reset(engine); + + igt_spinner_end(&spin); + igt_spinner_fini(&spin); + + if (err) { + pr_err("%s reset failed\n", name); + goto out; + } + + err = check_whitelist(ctx, engine); + if (err) { + pr_err("Whitelist not preserved in context across %s reset!\n", + name); + goto out; + } + + kernel_context_close(ctx); + + ctx = kernel_context(i915); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + err = check_whitelist(ctx, engine); + if (err) { + pr_err("Invalid whitelist *after* %s reset in fresh context!\n", + name); + goto out; + } + +out: + kernel_context_close(ctx); + return err; +} + +static struct i915_vma *create_batch(struct i915_gem_context *ctx) +{ + struct drm_i915_gem_object *obj; + struct i915_vma *vma; + int err; + + obj = i915_gem_object_create_internal(ctx->i915, 16 * PAGE_SIZE); + if (IS_ERR(obj)) + return ERR_CAST(obj); + + vma = i915_vma_instance(obj, ctx->vm, NULL); + if (IS_ERR(vma)) { + err = PTR_ERR(vma); + goto err_obj; + } + + err = 
i915_vma_pin(vma, 0, 0, PIN_USER); + if (err) + goto err_obj; + + return vma; + +err_obj: + i915_gem_object_put(obj); + return ERR_PTR(err); +} + +static u32 reg_write(u32 old, u32 new, u32 rsvd) +{ + if (rsvd == 0x0000ffff) { + old &= ~(new >> 16); + old |= new & (new >> 16); + } else { + old &= ~rsvd; + old |= new & rsvd; + } + + return old; +} + +static bool wo_register(struct intel_engine_cs *engine, u32 reg) +{ + enum intel_platform platform = INTEL_INFO(engine->i915)->platform; + int i; + + for (i = 0; i < ARRAY_SIZE(wo_registers); i++) { + if (wo_registers[i].platform == platform && + wo_registers[i].reg == reg) + return true; + } + + return false; +} + +static bool ro_register(u32 reg) +{ + if (reg & RING_FORCE_TO_NONPRIV_RD) + return true; + + return false; +} + +static int whitelist_writable_count(struct intel_engine_cs *engine) +{ + int count = engine->whitelist.count; + int i; + + for (i = 0; i < engine->whitelist.count; i++) { + u32 reg = i915_mmio_reg_offset(engine->whitelist.list[i].reg); + + if (ro_register(reg)) + count--; + } + + return count; +} + +static int check_dirty_whitelist(struct i915_gem_context *ctx, + struct intel_engine_cs *engine) +{ + const u32 values[] = { + 0x00000000, + 0x01010101, + 0x10100101, + 0x03030303, + 0x30300303, + 0x05050505, + 0x50500505, + 0x0f0f0f0f, + 0xf00ff00f, + 0x10101010, + 0xf0f01010, + 0x30303030, + 0xa0a03030, + 0x50505050, + 0xc0c05050, + 0xf0f0f0f0, + 0x11111111, + 0x33333333, + 0x55555555, + 0x0000ffff, + 0x00ff00ff, + 0xff0000ff, + 0xffff00ff, + 0xffffffff, + }; + struct i915_vma *scratch; + struct i915_vma *batch; + int err = 0, i, v; + u32 *cs, *results; + + scratch = create_scratch(ctx->vm, 2 * ARRAY_SIZE(values) + 1); + if (IS_ERR(scratch)) + return PTR_ERR(scratch); + + batch = create_batch(ctx); + if (IS_ERR(batch)) { + err = PTR_ERR(batch); + goto out_scratch; + } + + for (i = 0; i < engine->whitelist.count; i++) { + u32 reg = i915_mmio_reg_offset(engine->whitelist.list[i].reg); + u64 addr = scratch->node.start; + struct i915_request *rq; + u32 srm, lrm, rsvd; + u32 expect; + int idx; + + if (wo_register(engine, reg)) + continue; + + if (ro_register(reg)) + continue; + + srm = MI_STORE_REGISTER_MEM; + lrm = MI_LOAD_REGISTER_MEM; + if (INTEL_GEN(ctx->i915) >= 8) + lrm++, srm++; + + pr_debug("%s: Writing garbage to %x\n", + engine->name, reg); + + cs = i915_gem_object_pin_map(batch->obj, I915_MAP_WC); + if (IS_ERR(cs)) { + err = PTR_ERR(cs); + goto out_batch; + } + + /* SRM original */ + *cs++ = srm; + *cs++ = reg; + *cs++ = lower_32_bits(addr); + *cs++ = upper_32_bits(addr); + + idx = 1; + for (v = 0; v < ARRAY_SIZE(values); v++) { + /* LRI garbage */ + *cs++ = MI_LOAD_REGISTER_IMM(1); + *cs++ = reg; + *cs++ = values[v]; + + /* SRM result */ + *cs++ = srm; + *cs++ = reg; + *cs++ = lower_32_bits(addr + sizeof(u32) * idx); + *cs++ = upper_32_bits(addr + sizeof(u32) * idx); + idx++; + } + for (v = 0; v < ARRAY_SIZE(values); v++) { + /* LRI garbage */ + *cs++ = MI_LOAD_REGISTER_IMM(1); + *cs++ = reg; + *cs++ = ~values[v]; + + /* SRM result */ + *cs++ = srm; + *cs++ = reg; + *cs++ = lower_32_bits(addr + sizeof(u32) * idx); + *cs++ = upper_32_bits(addr + sizeof(u32) * idx); + idx++; + } + GEM_BUG_ON(idx * sizeof(u32) > scratch->size); + + /* LRM original -- don't leave garbage in the context! 
*/
+ *cs++ = lrm;
+ *cs++ = reg;
+ *cs++ = lower_32_bits(addr);
+ *cs++ = upper_32_bits(addr);
+
+ *cs++ = MI_BATCH_BUFFER_END;
+
+ i915_gem_object_flush_map(batch->obj);
+ i915_gem_object_unpin_map(batch->obj);
+ i915_gem_chipset_flush(ctx->i915);
+
+ rq = igt_request_alloc(ctx, engine);
+ if (IS_ERR(rq)) {
+ err = PTR_ERR(rq);
+ goto out_batch;
+ }
+
+ if (engine->emit_init_breadcrumb) { /* Be nice if we hang */
+ err = engine->emit_init_breadcrumb(rq);
+ if (err)
+ goto err_request;
+ }
+
+ err = engine->emit_bb_start(rq,
+ batch->node.start, PAGE_SIZE,
+ 0);
+ if (err)
+ goto err_request;
+
+err_request:
+ i915_request_add(rq);
+ if (err)
+ goto out_batch;
+
+ if (i915_request_wait(rq, 0, HZ / 5) < 0) {
+ pr_err("%s: Futzing %x timed out; cancelling test\n",
+ engine->name, reg);
+ i915_gem_set_wedged(ctx->i915);
+ err = -EIO;
+ goto out_batch;
+ }
+
+ results = i915_gem_object_pin_map(scratch->obj, I915_MAP_WB);
+ if (IS_ERR(results)) {
+ err = PTR_ERR(results);
+ goto out_batch;
+ }
+
+ GEM_BUG_ON(values[ARRAY_SIZE(values) - 1] != 0xffffffff);
+ rsvd = results[ARRAY_SIZE(values)]; /* detect write masking */
+ if (!rsvd) {
+ pr_err("%s: Unable to write to whitelisted register %x\n",
+ engine->name, reg);
+ err = -EINVAL;
+ goto out_unpin;
+ }
+
+ expect = results[0];
+ idx = 1;
+ for (v = 0; v < ARRAY_SIZE(values); v++) {
+ expect = reg_write(expect, values[v], rsvd);
+ if (results[idx] != expect)
+ err++;
+ idx++;
+ }
+ for (v = 0; v < ARRAY_SIZE(values); v++) {
+ expect = reg_write(expect, ~values[v], rsvd);
+ if (results[idx] != expect)
+ err++;
+ idx++;
+ }
+ if (err) {
+ pr_err("%s: %d mismatches between values written to whitelisted register [%x], and values read back!\n",
+ engine->name, err, reg);
+
+ pr_info("%s: Whitelisted register: %x, original value %08x, rsvd %08x\n",
+ engine->name, reg, results[0], rsvd);
+
+ expect = results[0];
+ idx = 1;
+ for (v = 0; v < ARRAY_SIZE(values); v++) {
+ u32 w = values[v];
+
+ expect = reg_write(expect, w, rsvd);
+ pr_info("Wrote %08x, read %08x, expect %08x\n",
+ w, results[idx], expect);
+ idx++;
+ }
+ for (v = 0; v < ARRAY_SIZE(values); v++) {
+ u32 w = ~values[v];
+
+ expect = reg_write(expect, w, rsvd);
+ pr_info("Wrote %08x, read %08x, expect %08x\n",
+ w, results[idx], expect);
+ idx++;
+ }
+
+ err = -EINVAL;
+ }
+out_unpin:
+ i915_gem_object_unpin_map(scratch->obj);
+ if (err)
+ break;
+ }
+
+ if (igt_flush_test(ctx->i915, I915_WAIT_LOCKED))
+ err = -EIO;
+out_batch:
+ i915_vma_unpin_and_release(&batch, 0);
+out_scratch:
+ i915_vma_unpin_and_release(&scratch, 0);
+ return err;
+}
+
+static int live_dirty_whitelist(void *arg)
+{
+ struct drm_i915_private *i915 = arg;
+ struct intel_engine_cs *engine;
+ struct i915_gem_context *ctx;
+ enum intel_engine_id id;
+ intel_wakeref_t wakeref;
+ struct drm_file *file;
+ int err = 0;
+
+ /* Can the user write to the whitelisted registers?
*/ + + if (INTEL_GEN(i915) < 7) /* minimum requirement for LRI, SRM, LRM */ + return 0; + + wakeref = intel_runtime_pm_get(&i915->runtime_pm); + + mutex_unlock(&i915->drm.struct_mutex); + file = mock_file(i915); + mutex_lock(&i915->drm.struct_mutex); + if (IS_ERR(file)) { + err = PTR_ERR(file); + goto out_rpm; + } + + ctx = live_context(i915, file); + if (IS_ERR(ctx)) { + err = PTR_ERR(ctx); + goto out_file; + } + + for_each_engine(engine, i915, id) { + if (engine->whitelist.count == 0) + continue; + + err = check_dirty_whitelist(ctx, engine); + if (err) + goto out_file; + } + +out_file: + mutex_unlock(&i915->drm.struct_mutex); + mock_file_free(i915, file); + mutex_lock(&i915->drm.struct_mutex); +out_rpm: + intel_runtime_pm_put(&i915->runtime_pm, wakeref); + return err; +} + +static int live_reset_whitelist(void *arg) +{ + struct drm_i915_private *i915 = arg; + struct intel_engine_cs *engine = i915->engine[RCS0]; + int err = 0; + + /* If we reset the gpu, we should not lose the RING_NONPRIV */ + + if (!engine || engine->whitelist.count == 0) + return 0; + + igt_global_reset_lock(i915); + + if (intel_has_reset_engine(i915)) { + err = check_whitelist_across_reset(engine, + do_engine_reset, + "engine"); + if (err) + goto out; + } + + if (intel_has_gpu_reset(i915)) { + err = check_whitelist_across_reset(engine, + do_device_reset, + "device"); + if (err) + goto out; + } + +out: + igt_global_reset_unlock(i915); + return err; +} + +static int read_whitelisted_registers(struct i915_gem_context *ctx, + struct intel_engine_cs *engine, + struct i915_vma *results) +{ + struct i915_request *rq; + int i, err = 0; + u32 srm, *cs; + + rq = igt_request_alloc(ctx, engine); + if (IS_ERR(rq)) + return PTR_ERR(rq); + + srm = MI_STORE_REGISTER_MEM; + if (INTEL_GEN(ctx->i915) >= 8) + srm++; + + cs = intel_ring_begin(rq, 4 * engine->whitelist.count); + if (IS_ERR(cs)) { + err = PTR_ERR(cs); + goto err_req; + } + + for (i = 0; i < engine->whitelist.count; i++) { + u64 offset = results->node.start + sizeof(u32) * i; + u32 reg = i915_mmio_reg_offset(engine->whitelist.list[i].reg); + + /* Clear RD only and WR only flags */ + reg &= ~(RING_FORCE_TO_NONPRIV_RD | RING_FORCE_TO_NONPRIV_WR); + + *cs++ = srm; + *cs++ = reg; + *cs++ = lower_32_bits(offset); + *cs++ = upper_32_bits(offset); + } + intel_ring_advance(rq, cs); + +err_req: + i915_request_add(rq); + + if (i915_request_wait(rq, 0, HZ / 5) < 0) + err = -EIO; + + return err; +} + +static int scrub_whitelisted_registers(struct i915_gem_context *ctx, + struct intel_engine_cs *engine) +{ + struct i915_request *rq; + struct i915_vma *batch; + int i, err = 0; + u32 *cs; + + batch = create_batch(ctx); + if (IS_ERR(batch)) + return PTR_ERR(batch); + + cs = i915_gem_object_pin_map(batch->obj, I915_MAP_WC); + if (IS_ERR(cs)) { + err = PTR_ERR(cs); + goto err_batch; + } + + *cs++ = MI_LOAD_REGISTER_IMM(whitelist_writable_count(engine)); + for (i = 0; i < engine->whitelist.count; i++) { + u32 reg = i915_mmio_reg_offset(engine->whitelist.list[i].reg); + + if (ro_register(reg)) + continue; + + *cs++ = reg; + *cs++ = 0xffffffff; + } + *cs++ = MI_BATCH_BUFFER_END; + + i915_gem_object_flush_map(batch->obj); + i915_gem_chipset_flush(ctx->i915); + + rq = igt_request_alloc(ctx, engine); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto err_unpin; + } + + if (engine->emit_init_breadcrumb) { /* Be nice if we hang */ + err = engine->emit_init_breadcrumb(rq); + if (err) + goto err_request; + } + + /* Perform the writes from an unprivileged "user" batch */ + err = engine->emit_bb_start(rq, 
batch->node.start, 0, 0); + +err_request: + i915_request_add(rq); + if (i915_request_wait(rq, 0, HZ / 5) < 0) + err = -EIO; + +err_unpin: + i915_gem_object_unpin_map(batch->obj); +err_batch: + i915_vma_unpin_and_release(&batch, 0); + return err; +} + +struct regmask { + i915_reg_t reg; + unsigned long gen_mask; +}; + +static bool find_reg(struct drm_i915_private *i915, + i915_reg_t reg, + const struct regmask *tbl, + unsigned long count) +{ + u32 offset = i915_mmio_reg_offset(reg); + + while (count--) { + if (INTEL_INFO(i915)->gen_mask & tbl->gen_mask && + i915_mmio_reg_offset(tbl->reg) == offset) + return true; + tbl++; + } + + return false; +} + +static bool pardon_reg(struct drm_i915_private *i915, i915_reg_t reg) +{ + /* Alas, we must pardon some whitelists. Mistakes already made */ + static const struct regmask pardon[] = { + { GEN9_CTX_PREEMPT_REG, INTEL_GEN_MASK(9, 9) }, + { GEN8_L3SQCREG4, INTEL_GEN_MASK(9, 9) }, + }; + + return find_reg(i915, reg, pardon, ARRAY_SIZE(pardon)); +} + +static bool result_eq(struct intel_engine_cs *engine, + u32 a, u32 b, i915_reg_t reg) +{ + if (a != b && !pardon_reg(engine->i915, reg)) { + pr_err("Whitelisted register 0x%4x not context saved: A=%08x, B=%08x\n", + i915_mmio_reg_offset(reg), a, b); + return false; + } + + return true; +} + +static bool writeonly_reg(struct drm_i915_private *i915, i915_reg_t reg) +{ + /* Some registers do not seem to behave and our writes unreadable */ + static const struct regmask wo[] = { + { GEN9_SLICE_COMMON_ECO_CHICKEN1, INTEL_GEN_MASK(9, 9) }, + }; + + return find_reg(i915, reg, wo, ARRAY_SIZE(wo)); +} + +static bool result_neq(struct intel_engine_cs *engine, + u32 a, u32 b, i915_reg_t reg) +{ + if (a == b && !writeonly_reg(engine->i915, reg)) { + pr_err("Whitelist register 0x%4x:%08x was unwritable\n", + i915_mmio_reg_offset(reg), a); + return false; + } + + return true; +} + +static int +check_whitelisted_registers(struct intel_engine_cs *engine, + struct i915_vma *A, + struct i915_vma *B, + bool (*fn)(struct intel_engine_cs *engine, + u32 a, u32 b, + i915_reg_t reg)) +{ + u32 *a, *b; + int i, err; + + a = i915_gem_object_pin_map(A->obj, I915_MAP_WB); + if (IS_ERR(a)) + return PTR_ERR(a); + + b = i915_gem_object_pin_map(B->obj, I915_MAP_WB); + if (IS_ERR(b)) { + err = PTR_ERR(b); + goto err_a; + } + + err = 0; + for (i = 0; i < engine->whitelist.count; i++) { + if (!fn(engine, a[i], b[i], engine->whitelist.list[i].reg)) + err = -EINVAL; + } + + i915_gem_object_unpin_map(B->obj); +err_a: + i915_gem_object_unpin_map(A->obj); + return err; +} + +static int live_isolated_whitelist(void *arg) +{ + struct drm_i915_private *i915 = arg; + struct { + struct i915_gem_context *ctx; + struct i915_vma *scratch[2]; + } client[2] = {}; + struct intel_engine_cs *engine; + enum intel_engine_id id; + int i, err = 0; + + /* + * Check that a write into a whitelist register works, but + * invisible to a second context. 
+ */
+
+ if (!intel_engines_has_context_isolation(i915))
+ return 0;
+
+ if (!i915->kernel_context->vm)
+ return 0;
+
+ for (i = 0; i < ARRAY_SIZE(client); i++) {
+ struct i915_gem_context *c;
+
+ c = kernel_context(i915);
+ if (IS_ERR(c)) {
+ err = PTR_ERR(c);
+ goto err;
+ }
+
+ client[i].scratch[0] = create_scratch(c->vm, 1024);
+ if (IS_ERR(client[i].scratch[0])) {
+ err = PTR_ERR(client[i].scratch[0]);
+ kernel_context_close(c);
+ goto err;
+ }
+
+ client[i].scratch[1] = create_scratch(c->vm, 1024);
+ if (IS_ERR(client[i].scratch[1])) {
+ err = PTR_ERR(client[i].scratch[1]);
+ i915_vma_unpin_and_release(&client[i].scratch[0], 0);
+ kernel_context_close(c);
+ goto err;
+ }
+
+ client[i].ctx = c;
+ }
+
+ for_each_engine(engine, i915, id) {
+ if (!whitelist_writable_count(engine))
+ continue;
+
+ /* Read default values */
+ err = read_whitelisted_registers(client[0].ctx, engine,
+ client[0].scratch[0]);
+ if (err)
+ goto err;
+
+ /* Try to overwrite registers (should only affect ctx0) */
+ err = scrub_whitelisted_registers(client[0].ctx, engine);
+ if (err)
+ goto err;
+
+ /* Read values from ctx1; we expect these to be defaults */
+ err = read_whitelisted_registers(client[1].ctx, engine,
+ client[1].scratch[0]);
+ if (err)
+ goto err;
+
+ /* Verify that both reads return the same default values */
+ err = check_whitelisted_registers(engine,
+ client[0].scratch[0],
+ client[1].scratch[0],
+ result_eq);
+ if (err)
+ goto err;
+
+ /* Read back the updated values in ctx0 */
+ err = read_whitelisted_registers(client[0].ctx, engine,
+ client[0].scratch[1]);
+ if (err)
+ goto err;
+
+ /* User should be granted privilege to overwrite regs */
+ err = check_whitelisted_registers(engine,
+ client[0].scratch[0],
+ client[0].scratch[1],
+ result_neq);
+ if (err)
+ goto err;
+ }
+
+err:
+ for (i = 0; i < ARRAY_SIZE(client); i++) {
+ if (!client[i].ctx)
+ break;
+
+ i915_vma_unpin_and_release(&client[i].scratch[1], 0);
+ i915_vma_unpin_and_release(&client[i].scratch[0], 0);
+ kernel_context_close(client[i].ctx);
+ }
+
+ if (igt_flush_test(i915, I915_WAIT_LOCKED))
+ err = -EIO;
+
+ return err;
+}
+
+static bool
+verify_wa_lists(struct i915_gem_context *ctx, struct wa_lists *lists,
+ const char *str)
+{
+ struct drm_i915_private *i915 = ctx->i915;
+ struct i915_gem_engines_iter it;
+ struct intel_context *ce;
+ bool ok = true;
+
+ ok &= wa_list_verify(&i915->uncore, &lists->gt_wa_list, str);
+
+ for_each_gem_engine(ce, i915_gem_context_lock_engines(ctx), it) {
+ enum intel_engine_id id = ce->engine->id;
+
+ ok &= engine_wa_list_verify(ce,
+ &lists->engine[id].wa_list,
+ str) == 0;
+
+ ok &= engine_wa_list_verify(ce,
+ &lists->engine[id].ctx_wa_list,
+ str) == 0;
+ }
+ i915_gem_context_unlock_engines(ctx);
+
+ return ok;
+}
+
+static int
+live_gpu_reset_workarounds(void *arg)
+{
+ struct drm_i915_private *i915 = arg;
+ struct i915_gem_context *ctx;
+ intel_wakeref_t wakeref;
+ struct wa_lists lists;
+ bool ok;
+
+ if (!intel_has_gpu_reset(i915))
+ return 0;
+
+ ctx = kernel_context(i915);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ pr_info("Verifying after GPU reset...\n");
+
+ igt_global_reset_lock(i915);
+ wakeref = intel_runtime_pm_get(&i915->runtime_pm);
+
+ reference_lists_init(i915, &lists);
+
+ ok = verify_wa_lists(ctx, &lists, "before reset");
+ if (!ok)
+ goto out;
+
+ i915_reset(i915, ALL_ENGINES, "live_workarounds");
+
+ ok = verify_wa_lists(ctx, &lists, "after reset");
+
+out:
+ kernel_context_close(ctx);
+ reference_lists_fini(i915, &lists);
+ intel_runtime_pm_put(&i915->runtime_pm,
wakeref); + igt_global_reset_unlock(i915); + + return ok ? 0 : -ESRCH; +} + +static int +live_engine_reset_workarounds(void *arg) +{ + struct drm_i915_private *i915 = arg; + struct intel_engine_cs *engine; + struct i915_gem_context *ctx; + struct igt_spinner spin; + enum intel_engine_id id; + struct i915_request *rq; + intel_wakeref_t wakeref; + struct wa_lists lists; + int ret = 0; + + if (!intel_has_reset_engine(i915)) + return 0; + + ctx = kernel_context(i915); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + igt_global_reset_lock(i915); + wakeref = intel_runtime_pm_get(&i915->runtime_pm); + + reference_lists_init(i915, &lists); + + for_each_engine(engine, i915, id) { + bool ok; + + pr_info("Verifying after %s reset...\n", engine->name); + + ok = verify_wa_lists(ctx, &lists, "before reset"); + if (!ok) { + ret = -ESRCH; + goto err; + } + + i915_reset_engine(engine, "live_workarounds"); + + ok = verify_wa_lists(ctx, &lists, "after idle reset"); + if (!ok) { + ret = -ESRCH; + goto err; + } + + ret = igt_spinner_init(&spin, i915); + if (ret) + goto err; + + rq = igt_spinner_create_request(&spin, ctx, engine, MI_NOOP); + if (IS_ERR(rq)) { + ret = PTR_ERR(rq); + igt_spinner_fini(&spin); + goto err; + } + + i915_request_add(rq); + + if (!igt_wait_for_spinner(&spin, rq)) { + pr_err("Spinner failed to start\n"); + igt_spinner_fini(&spin); + ret = -ETIMEDOUT; + goto err; + } + + i915_reset_engine(engine, "live_workarounds"); + + igt_spinner_end(&spin); + igt_spinner_fini(&spin); + + ok = verify_wa_lists(ctx, &lists, "after busy reset"); + if (!ok) { + ret = -ESRCH; + goto err; + } + } + +err: + reference_lists_fini(i915, &lists); + intel_runtime_pm_put(&i915->runtime_pm, wakeref); + igt_global_reset_unlock(i915); + kernel_context_close(ctx); + + igt_flush_test(i915, I915_WAIT_LOCKED); + + return ret; +} + +int intel_workarounds_live_selftests(struct drm_i915_private *i915) +{ + static const struct i915_subtest tests[] = { + SUBTEST(live_dirty_whitelist), + SUBTEST(live_reset_whitelist), + SUBTEST(live_isolated_whitelist), + SUBTEST(live_gpu_reset_workarounds), + SUBTEST(live_engine_reset_workarounds), + }; + int err; + + if (i915_terminally_wedged(i915)) + return 0; + + mutex_lock(&i915->drm.struct_mutex); + err = i915_subtests(tests, i915); + mutex_unlock(&i915->drm.struct_mutex); + + return err; +} |
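
Note on the write-masking model used by check_dirty_whitelist() above: the test probes each whitelisted register with known values and reads back a "rsvd" pattern; a readback of 0x0000ffff after writing 0xffffffff is taken to mean a masked-write register whose upper 16 bits act as a per-bit write enable for the lower 16 bits. The following is a minimal standalone sketch of that model, illustrative only and not part of the patch: reg_write() mirrors the selftest helper of the same name, while the starting value, the probe result and main() are invented for the example.

/* Illustrative sketch only -- not part of the patch above. */
#include <stdint.h>
#include <stdio.h>

/*
 * Mirror of the selftest's reg_write(): predict the register contents after
 * writing 'val', given the writable-bit pattern 'rsvd' recovered from the
 * 0xffffffff probe. rsvd == 0x0000ffff denotes a masked-write register.
 */
static uint32_t reg_write(uint32_t old, uint32_t val, uint32_t rsvd)
{
	if (rsvd == 0x0000ffff) {
		/* High half of the write selects which low bits to update */
		old &= ~(val >> 16);
		old |= val & (val >> 16);
	} else {
		/* Ordinary register: only bits set in rsvd stick */
		old &= ~rsvd;
		old |= val & rsvd;
	}
	return old;
}

int main(void)
{
	/* Hypothetical values: what the first SRM and the probe returned. */
	uint32_t expect = 0x00001234;		/* original register value   */
	const uint32_t rsvd = 0x0000ffff;	/* readback after 0xffffffff */
	const uint32_t values[] = { 0x00000000, 0x01010101, 0xffffffff };
	unsigned int v;

	for (v = 0; v < sizeof(values) / sizeof(values[0]); v++) {
		expect = reg_write(expect, values[v], rsvd);
		printf("write %08x -> expect readback %08x\n",
		       (unsigned int)values[v], (unsigned int)expect);
	}
	return 0;
}

The selftest applies the same prediction to values[v] and then to ~values[v], so every bit position of the register is both set and cleared at least once before the readbacks are compared.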