Diffstat (limited to 'drivers/gpu/drm/i915/gt/intel_lrc.c')
-rw-r--r-- | drivers/gpu/drm/i915/gt/intel_lrc.c | 1079
1 file changed, 780 insertions, 299 deletions
diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c index 2dfaddb8811e..87e6c5bdd2dc 100644 --- a/drivers/gpu/drm/i915/gt/intel_lrc.c +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c @@ -147,6 +147,7 @@ #include "intel_reset.h" #include "intel_ring.h" #include "intel_workarounds.h" +#include "shmem_utils.h" #define RING_EXECLIST_QFULL (1 << 0x2) #define RING_EXECLIST1_VALID (1 << 0x3) @@ -216,7 +217,7 @@ struct virtual_engine { /* And finally, which physical engines this virtual engine maps onto. */ unsigned int num_siblings; - struct intel_engine_cs *siblings[0]; + struct intel_engine_cs *siblings[]; }; static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine) @@ -238,6 +239,123 @@ __execlists_update_reg_state(const struct intel_context *ce, const struct intel_engine_cs *engine, u32 head); +static int lrc_ring_mi_mode(const struct intel_engine_cs *engine) +{ + if (INTEL_GEN(engine->i915) >= 12) + return 0x60; + else if (INTEL_GEN(engine->i915) >= 9) + return 0x54; + else if (engine->class == RENDER_CLASS) + return 0x58; + else + return -1; +} + +static int lrc_ring_gpr0(const struct intel_engine_cs *engine) +{ + if (INTEL_GEN(engine->i915) >= 12) + return 0x74; + else if (INTEL_GEN(engine->i915) >= 9) + return 0x68; + else if (engine->class == RENDER_CLASS) + return 0xd8; + else + return -1; +} + +static int lrc_ring_wa_bb_per_ctx(const struct intel_engine_cs *engine) +{ + if (INTEL_GEN(engine->i915) >= 12) + return 0x12; + else if (INTEL_GEN(engine->i915) >= 9 || engine->class == RENDER_CLASS) + return 0x18; + else + return -1; +} + +static int lrc_ring_indirect_ptr(const struct intel_engine_cs *engine) +{ + int x; + + x = lrc_ring_wa_bb_per_ctx(engine); + if (x < 0) + return x; + + return x + 2; +} + +static int lrc_ring_indirect_offset(const struct intel_engine_cs *engine) +{ + int x; + + x = lrc_ring_indirect_ptr(engine); + if (x < 0) + return x; + + return x + 2; +} + +static int lrc_ring_cmd_buf_cctl(const struct intel_engine_cs *engine) +{ + if (engine->class != RENDER_CLASS) + return -1; + + if (INTEL_GEN(engine->i915) >= 12) + return 0xb6; + else if (INTEL_GEN(engine->i915) >= 11) + return 0xaa; + else + return -1; +} + +static u32 +lrc_ring_indirect_offset_default(const struct intel_engine_cs *engine) +{ + switch (INTEL_GEN(engine->i915)) { + default: + MISSING_CASE(INTEL_GEN(engine->i915)); + fallthrough; + case 12: + return GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; + case 11: + return GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; + case 10: + return GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; + case 9: + return GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; + case 8: + return GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; + } +} + +static void +lrc_ring_setup_indirect_ctx(u32 *regs, + const struct intel_engine_cs *engine, + u32 ctx_bb_ggtt_addr, + u32 size) +{ + GEM_BUG_ON(!size); + GEM_BUG_ON(!IS_ALIGNED(size, CACHELINE_BYTES)); + GEM_BUG_ON(lrc_ring_indirect_ptr(engine) == -1); + regs[lrc_ring_indirect_ptr(engine) + 1] = + ctx_bb_ggtt_addr | (size / CACHELINE_BYTES); + + GEM_BUG_ON(lrc_ring_indirect_offset(engine) == -1); + regs[lrc_ring_indirect_offset(engine) + 1] = + lrc_ring_indirect_offset_default(engine) << 6; +} + +static u32 intel_context_get_runtime(const struct intel_context *ce) +{ + /* + * We can use either ppHWSP[16] which is recorded before the context + * switch (and so excludes the cost of context switches) or use the + * value from the context image itself, which is saved/restored earlier + * and so includes the cost of 
the save. + */ + return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]); +} + static void mark_eio(struct i915_request *rq) { if (i915_request_completed(rq)) @@ -311,18 +429,7 @@ static int effective_prio(const struct i915_request *rq) if (i915_request_has_nopreempt(rq)) prio = I915_PRIORITY_UNPREEMPTABLE; - /* - * On unwinding the active request, we give it a priority bump - * if it has completed waiting on any semaphore. If we know that - * the request has already started, we can prevent an unwanted - * preempt-to-idle cycle by taking that into account now. - */ - if (__i915_request_has_started(rq)) - prio |= I915_PRIORITY_NOSEMAPHORE; - - /* Restrict mere WAIT boosts from triggering preemption */ - BUILD_BUG_ON(__NO_PREEMPTION & ~I915_PRIORITY_MASK); /* only internal */ - return prio | __NO_PREEMPTION; + return prio; } static int queue_prio(const struct intel_engine_execlists *execlists) @@ -489,7 +596,7 @@ static void set_offsets(u32 *regs, #define REG16(x) \ (((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \ (((x) >> 2) & 0x7f) -#define END(x) 0, (x) +#define END(total_state_size) 0, (total_state_size) { const u32 base = engine->mmio_base; @@ -512,7 +619,7 @@ static void set_offsets(u32 *regs, if (flags & POSTED) *regs |= MI_LRI_FORCE_POSTED; if (INTEL_GEN(engine->i915) >= 11) - *regs |= MI_LRI_CS_MMIO; + *regs |= MI_LRI_LRM_CS_MMIO; regs++; GEM_BUG_ON(!count); @@ -897,8 +1004,63 @@ static const u8 gen12_rcs_offsets[] = { NOP(6), LRI(1, 0), REG(0x0c8), + NOP(3 + 9 + 1), + + LRI(51, POSTED), + REG16(0x588), + REG16(0x588), + REG16(0x588), + REG16(0x588), + REG16(0x588), + REG16(0x588), + REG(0x028), + REG(0x09c), + REG(0x0c0), + REG(0x178), + REG(0x17c), + REG16(0x358), + REG(0x170), + REG(0x150), + REG(0x154), + REG(0x158), + REG16(0x41c), + REG16(0x600), + REG16(0x604), + REG16(0x608), + REG16(0x60c), + REG16(0x610), + REG16(0x614), + REG16(0x618), + REG16(0x61c), + REG16(0x620), + REG16(0x624), + REG16(0x628), + REG16(0x62c), + REG16(0x630), + REG16(0x634), + REG16(0x638), + REG16(0x63c), + REG16(0x640), + REG16(0x644), + REG16(0x648), + REG16(0x64c), + REG16(0x650), + REG16(0x654), + REG16(0x658), + REG16(0x65c), + REG16(0x660), + REG16(0x664), + REG16(0x668), + REG16(0x66c), + REG16(0x670), + REG16(0x674), + REG16(0x678), + REG16(0x67c), + REG(0x068), + REG(0x084), + NOP(1), - END(80) + END(192) }; #undef END @@ -1026,17 +1188,14 @@ static void intel_engine_context_in(struct intel_engine_cs *engine) { unsigned long flags; - if (READ_ONCE(engine->stats.enabled) == 0) + if (atomic_add_unless(&engine->stats.active, 1, 0)) return; write_seqlock_irqsave(&engine->stats.lock, flags); - - if (engine->stats.enabled > 0) { - if (engine->stats.active++ == 0) - engine->stats.start = ktime_get(); - GEM_BUG_ON(engine->stats.active == 0); + if (!atomic_add_unless(&engine->stats.active, 1, 0)) { + engine->stats.start = ktime_get(); + atomic_inc(&engine->stats.active); } - write_sequnlock_irqrestore(&engine->stats.lock, flags); } @@ -1044,51 +1203,20 @@ static void intel_engine_context_out(struct intel_engine_cs *engine) { unsigned long flags; - if (READ_ONCE(engine->stats.enabled) == 0) + GEM_BUG_ON(!atomic_read(&engine->stats.active)); + + if (atomic_add_unless(&engine->stats.active, -1, 1)) return; write_seqlock_irqsave(&engine->stats.lock, flags); - - if (engine->stats.enabled > 0) { - ktime_t last; - - if (engine->stats.active && --engine->stats.active == 0) { - /* - * Decrement the active context count and in case GPU - * is now idle add up to the running total. 
- */ - last = ktime_sub(ktime_get(), engine->stats.start); - - engine->stats.total = ktime_add(engine->stats.total, - last); - } else if (engine->stats.active == 0) { - /* - * After turning on engine stats, context out might be - * the first event in which case we account from the - * time stats gathering was turned on. - */ - last = ktime_sub(ktime_get(), engine->stats.enabled_at); - - engine->stats.total = ktime_add(engine->stats.total, - last); - } + if (atomic_dec_and_test(&engine->stats.active)) { + engine->stats.total = + ktime_add(engine->stats.total, + ktime_sub(ktime_get(), engine->stats.start)); } - write_sequnlock_irqrestore(&engine->stats.lock, flags); } -static int lrc_ring_mi_mode(const struct intel_engine_cs *engine) -{ - if (INTEL_GEN(engine->i915) >= 12) - return 0x60; - else if (INTEL_GEN(engine->i915) >= 9) - return 0x54; - else if (engine->class == RENDER_CLASS) - return 0x58; - else - return -1; -} - static void execlists_check_context(const struct intel_context *ce, const struct intel_engine_cs *engine) @@ -1132,14 +1260,12 @@ execlists_check_context(const struct intel_context *ce, static void restore_default_state(struct intel_context *ce, struct intel_engine_cs *engine) { - u32 *regs = ce->lrc_reg_state; + u32 *regs; - if (engine->pinned_default_state) - memcpy(regs, /* skip restoring the vanilla PPHWSP */ - engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE, - engine->context_size - PAGE_SIZE); + regs = memset(ce->lrc_reg_state, 0, engine->context_size - PAGE_SIZE); + execlists_init_reg_state(regs, ce, engine, ce->ring, true); - execlists_init_reg_state(regs, ce, engine, ce->ring, false); + ce->runtime.last = intel_context_get_runtime(ce); } static void reset_active(struct i915_request *rq, @@ -1181,17 +1307,6 @@ static void reset_active(struct i915_request *rq, ce->lrc.desc |= CTX_DESC_FORCE_RESTORE; } -static u32 intel_context_get_runtime(const struct intel_context *ce) -{ - /* - * We can use either ppHWSP[16] which is recorded before the context - * switch (and so excludes the cost of context switches) or use the - * value from the context image itself, which is saved/restored earlier - * and so includes the cost of the save. - */ - return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]); -} - static void st_update_runtime_underflow(struct intel_context *ce, s32 dt) { #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) @@ -1243,7 +1358,7 @@ __execlists_schedule_in(struct i915_request *rq) ce->lrc.ccid = ce->tag; } else { /* We don't need a strict matching tag, just different values */ - unsigned int tag = ffs(engine->context_tag); + unsigned int tag = ffs(READ_ONCE(engine->context_tag)); GEM_BUG_ON(tag == 0 || tag >= BITS_PER_LONG); clear_bit(tag - 1, &engine->context_tag); @@ -1417,6 +1532,24 @@ static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc } } +static __maybe_unused char * +dump_port(char *buf, int buflen, const char *prefix, struct i915_request *rq) +{ + if (!rq) + return ""; + + snprintf(buf, buflen, "%sccid:%x %llx:%lld%s prio %d", + prefix, + rq->context->lrc.ccid, + rq->fence.context, rq->fence.seqno, + i915_request_completed(rq) ? "!" : + i915_request_started(rq) ? 
"*" : + "", + rq_prio(rq)); + + return buf; +} + static __maybe_unused void trace_ports(const struct intel_engine_execlists *execlists, const char *msg, @@ -1424,18 +1557,14 @@ trace_ports(const struct intel_engine_execlists *execlists, { const struct intel_engine_cs *engine = container_of(execlists, typeof(*engine), execlists); + char __maybe_unused p0[40], p1[40]; if (!ports[0]) return; - ENGINE_TRACE(engine, "%s { %llx:%lld%s, %llx:%lld }\n", msg, - ports[0]->fence.context, - ports[0]->fence.seqno, - i915_request_completed(ports[0]) ? "!" : - i915_request_started(ports[0]) ? "*" : - "", - ports[1] ? ports[1]->fence.context : 0, - ports[1] ? ports[1]->fence.seqno : 0); + ENGINE_TRACE(engine, "%s { %s%s }\n", msg, + dump_port(p0, sizeof(p0), "", ports[0]), + dump_port(p1, sizeof(p1), ", ", ports[1])); } static inline bool @@ -1448,9 +1577,12 @@ static __maybe_unused bool assert_pending_valid(const struct intel_engine_execlists *execlists, const char *msg) { + struct intel_engine_cs *engine = + container_of(execlists, typeof(*engine), execlists); struct i915_request * const *port, *rq; struct intel_context *ce = NULL; bool sentinel = false; + u32 ccid = -1; trace_ports(execlists, msg, execlists->pending); @@ -1459,13 +1591,14 @@ assert_pending_valid(const struct intel_engine_execlists *execlists, return true; if (!execlists->pending[0]) { - GEM_TRACE_ERR("Nothing pending for promotion!\n"); + GEM_TRACE_ERR("%s: Nothing pending for promotion!\n", + engine->name); return false; } if (execlists->pending[execlists_num_ports(execlists)]) { - GEM_TRACE_ERR("Excess pending[%d] for promotion!\n", - execlists_num_ports(execlists)); + GEM_TRACE_ERR("%s: Excess pending[%d] for promotion!\n", + engine->name, execlists_num_ports(execlists)); return false; } @@ -1477,20 +1610,31 @@ assert_pending_valid(const struct intel_engine_execlists *execlists, GEM_BUG_ON(!i915_request_is_active(rq)); if (ce == rq->context) { - GEM_TRACE_ERR("Dup context:%llx in pending[%zd]\n", + GEM_TRACE_ERR("%s: Dup context:%llx in pending[%zd]\n", + engine->name, ce->timeline->fence_context, port - execlists->pending); return false; } ce = rq->context; + if (ccid == ce->lrc.ccid) { + GEM_TRACE_ERR("%s: Dup ccid:%x context:%llx in pending[%zd]\n", + engine->name, + ccid, ce->timeline->fence_context, + port - execlists->pending); + return false; + } + ccid = ce->lrc.ccid; + /* * Sentinels are supposed to be lonely so they flush the * current exection off the HW. Check that they are the * only request in the pending submission. 
*/ if (sentinel) { - GEM_TRACE_ERR("context:%llx after sentinel in pending[%zd]\n", + GEM_TRACE_ERR("%s: context:%llx after sentinel in pending[%zd]\n", + engine->name, ce->timeline->fence_context, port - execlists->pending); return false; @@ -1498,7 +1642,8 @@ assert_pending_valid(const struct intel_engine_execlists *execlists, sentinel = i915_request_has_sentinel(rq); if (sentinel && port != execlists->pending) { - GEM_TRACE_ERR("sentinel context:%llx not in prime position[%zd]\n", + GEM_TRACE_ERR("%s: sentinel context:%llx not in prime position[%zd]\n", + engine->name, ce->timeline->fence_context, port - execlists->pending); return false; @@ -1513,7 +1658,8 @@ assert_pending_valid(const struct intel_engine_execlists *execlists, if (i915_active_is_idle(&ce->active) && !intel_context_is_barrier(ce)) { - GEM_TRACE_ERR("Inactive context:%llx in pending[%zd]\n", + GEM_TRACE_ERR("%s: Inactive context:%llx in pending[%zd]\n", + engine->name, ce->timeline->fence_context, port - execlists->pending); ok = false; @@ -1521,7 +1667,8 @@ assert_pending_valid(const struct intel_engine_execlists *execlists, } if (!i915_vma_is_pinned(ce->state)) { - GEM_TRACE_ERR("Unpinned context:%llx in pending[%zd]\n", + GEM_TRACE_ERR("%s: Unpinned context:%llx in pending[%zd]\n", + engine->name, ce->timeline->fence_context, port - execlists->pending); ok = false; @@ -1529,7 +1676,8 @@ assert_pending_valid(const struct intel_engine_execlists *execlists, } if (!i915_vma_is_pinned(ce->ring->vma)) { - GEM_TRACE_ERR("Unpinned ring:%llx in pending[%zd]\n", + GEM_TRACE_ERR("%s: Unpinned ring:%llx in pending[%zd]\n", + engine->name, ce->timeline->fence_context, port - execlists->pending); ok = false; @@ -1664,30 +1812,16 @@ static bool virtual_matches(const struct virtual_engine *ve, return true; } -static void virtual_xfer_breadcrumbs(struct virtual_engine *ve, - struct i915_request *rq) +static void virtual_xfer_breadcrumbs(struct virtual_engine *ve) { - struct intel_engine_cs *old = ve->siblings[0]; - - /* All unattached (rq->engine == old) must already be completed */ - - spin_lock(&old->breadcrumbs.irq_lock); - if (!list_empty(&ve->context.signal_link)) { - list_del_init(&ve->context.signal_link); - - /* - * We cannot acquire the new engine->breadcrumbs.irq_lock - * (as we are holding a breadcrumbs.irq_lock already), - * so attach this request to the signaler on submission. - * The queued irq_work will occur when we finally drop - * the engine->active.lock after dequeue. - */ - set_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &rq->fence.flags); - - /* Also transfer the pending irq_work for the old breadcrumb. */ - intel_engine_signal_breadcrumbs(rq->engine); - } - spin_unlock(&old->breadcrumbs.irq_lock); + /* + * All the outstanding signals on ve->siblings[0] must have + * been completed, just pending the interrupt handler. As those + * signals still refer to the old sibling (via rq->engine), we must + * transfer those to the old irq_worker to keep our locking + * consistent. 
+ */ + intel_engine_transfer_stale_breadcrumbs(ve->siblings[0], &ve->context); } #define for_each_waiter(p__, rq__) \ @@ -1729,7 +1863,8 @@ static void defer_request(struct i915_request *rq, struct list_head * const pl) continue; /* No waiter should start before its signaler */ - GEM_BUG_ON(i915_request_started(w) && + GEM_BUG_ON(i915_request_has_initial_breadcrumb(w) && + i915_request_started(w) && !i915_request_completed(rq)); GEM_BUG_ON(i915_request_is_active(w)); @@ -1831,16 +1966,25 @@ static unsigned long active_timeslice(const struct intel_engine_cs *engine) static void set_timeslice(struct intel_engine_cs *engine) { + unsigned long duration; + if (!intel_engine_has_timeslices(engine)) return; - set_timer_ms(&engine->execlists.timer, active_timeslice(engine)); + duration = active_timeslice(engine); + ENGINE_TRACE(engine, "bump timeslicing, interval:%lu", duration); + + set_timer_ms(&engine->execlists.timer, duration); } static void start_timeslice(struct intel_engine_cs *engine) { struct intel_engine_execlists *execlists = &engine->execlists; - int prio = queue_prio(execlists); + const int prio = queue_prio(execlists); + unsigned long duration; + + if (!intel_engine_has_timeslices(engine)) + return; WRITE_ONCE(execlists->switch_priority_hint, prio); if (prio == INT_MIN) @@ -1849,7 +1993,12 @@ static void start_timeslice(struct intel_engine_cs *engine) if (timer_pending(&execlists->timer)) return; - set_timer_ms(&execlists->timer, timeslice(engine)); + duration = timeslice(engine); + ENGINE_TRACE(engine, + "start timeslicing, prio:%d, interval:%lu", + prio, duration); + + set_timer_ms(&execlists->timer, duration); } static void record_preemption(struct intel_engine_execlists *execlists) @@ -1946,11 +2095,26 @@ static void execlists_dequeue(struct intel_engine_cs *engine) * of trouble. */ active = READ_ONCE(execlists->active); - while ((last = *active) && i915_request_completed(last)) - active++; - if (last) { + /* + * In theory we can skip over completed contexts that have not + * yet been processed by events (as those events are in flight): + * + * while ((last = *active) && i915_request_completed(last)) + * active++; + * + * However, the GPU cannot handle this as it will ultimately + * find itself trying to jump back into a context it has just + * completed and barf. 
+ */ + + if ((last = *active)) { if (need_preempt(engine, last, rb)) { + if (i915_request_completed(last)) { + tasklet_hi_schedule(&execlists->tasklet); + return; + } + ENGINE_TRACE(engine, "preempting last=%llx:%lld, prio=%d, hint=%d\n", last->fence.context, @@ -1978,6 +2142,11 @@ static void execlists_dequeue(struct intel_engine_cs *engine) last = NULL; } else if (need_timeslice(engine, last) && timeslice_expired(execlists, last)) { + if (i915_request_completed(last)) { + tasklet_hi_schedule(&execlists->tasklet); + return; + } + ENGINE_TRACE(engine, "expired last=%llx:%lld, prio=%d, hint=%d, yield?=%s\n", last->fence.context, @@ -2087,7 +2256,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) engine); if (!list_empty(&ve->context.signals)) - virtual_xfer_breadcrumbs(ve, rq); + virtual_xfer_breadcrumbs(ve); /* * Move the bound engine to the top of the list @@ -2246,8 +2415,8 @@ done: clear_ports(port + 1, last_port - port); WRITE_ONCE(execlists->yield, -1); - execlists_submit_ports(engine); set_preempt_timeout(engine, *active); + execlists_submit_ports(engine); } else { skip_submit: ring_set_paused(engine, 0); @@ -2417,8 +2586,6 @@ static void process_csb(struct intel_engine_cs *engine) if (promote) { struct i915_request * const *old = execlists->active; - GEM_BUG_ON(!assert_pending_valid(execlists, "promote")); - ring_set_paused(engine, 0); /* Point active to the new ELSP; prevent overwriting */ @@ -2431,6 +2598,7 @@ static void process_csb(struct intel_engine_cs *engine) execlists_schedule_out(*old++); /* switch pending to inflight */ + GEM_BUG_ON(!assert_pending_valid(execlists, "promote")); memcpy(execlists->inflight, execlists->pending, execlists_num_ports(execlists) * @@ -2449,17 +2617,21 @@ static void process_csb(struct intel_engine_cs *engine) * We rely on the hardware being strongly * ordered, that the breadcrumb write is * coherent (visible from the CPU) before the - * user interrupt and CSB is processed. + * user interrupt is processed. One might assume + * that the breadcrumb write being before the + * user interrupt and the CS event for the context + * switch would therefore be before the CS event + * itself... */ if (GEM_SHOW_DEBUG() && - !i915_request_completed(*execlists->active) && - !reset_in_progress(execlists)) { - struct i915_request *rq __maybe_unused = - *execlists->active; + !i915_request_completed(*execlists->active)) { + struct i915_request *rq = *execlists->active; const u32 *regs __maybe_unused = rq->context->lrc_reg_state; ENGINE_TRACE(engine, + "context completed before request!\n"); + ENGINE_TRACE(engine, "ring:{start:0x%08x, head:%04x, tail:%04x, ctl:%08x, mode:%08x}\n", ENGINE_READ(engine, RING_START), ENGINE_READ(engine, RING_HEAD) & HEAD_ADDR, @@ -2478,8 +2650,6 @@ static void process_csb(struct intel_engine_cs *engine) regs[CTX_RING_START], regs[CTX_RING_HEAD], regs[CTX_RING_TAIL]); - - GEM_BUG_ON("context completed before request"); } execlists_schedule_out(*execlists->active++); @@ -2769,6 +2939,45 @@ err_cap: return NULL; } +static struct i915_request * +active_context(struct intel_engine_cs *engine, u32 ccid) +{ + const struct intel_engine_execlists * const el = &engine->execlists; + struct i915_request * const *port, *rq; + + /* + * Use the most recent result from process_csb(), but just in case + * we trigger an error (via interrupt) before the first CS event has + * been written, peek at the next submission. 
+ */ + + for (port = el->active; (rq = *port); port++) { + if (rq->context->lrc.ccid == ccid) { + ENGINE_TRACE(engine, + "ccid found at active:%zd\n", + port - el->active); + return rq; + } + } + + for (port = el->pending; (rq = *port); port++) { + if (rq->context->lrc.ccid == ccid) { + ENGINE_TRACE(engine, + "ccid found at pending:%zd\n", + port - el->pending); + return rq; + } + } + + ENGINE_TRACE(engine, "ccid:%x not found\n", ccid); + return NULL; +} + +static u32 active_ccid(struct intel_engine_cs *engine) +{ + return ENGINE_READ_FW(engine, RING_EXECLIST_STATUS_HI); +} + static bool execlists_capture(struct intel_engine_cs *engine) { struct execlists_capture *cap; @@ -2786,7 +2995,7 @@ static bool execlists_capture(struct intel_engine_cs *engine) return true; spin_lock_irq(&engine->active.lock); - cap->rq = execlists_active(&engine->execlists); + cap->rq = active_context(engine, active_ccid(engine)); if (cap->rq) { cap->rq = active_request(cap->rq->context->timeline, cap->rq); cap->rq = i915_request_get_rcu(cap->rq); @@ -2934,10 +3143,14 @@ static void __submit_queue_imm(struct intel_engine_cs *engine) if (reset_in_progress(execlists)) return; /* defer until we restart the engine following reset */ - if (execlists->tasklet.func == execlists_submission_tasklet) - __execlists_submission_tasklet(engine); - else - tasklet_hi_schedule(&execlists->tasklet); + /* Hopefully we clear execlists->pending[] to let us through */ + if (READ_ONCE(execlists->pending[0]) && + tasklet_trylock(&execlists->tasklet)) { + process_csb(engine); + tasklet_unlock(&execlists->tasklet); + } + + __execlists_submission_tasklet(engine); } static void submit_queue(struct intel_engine_cs *engine, @@ -3023,19 +3236,139 @@ check_redzone(const void *vaddr, const struct intel_engine_cs *engine) vaddr += engine->context_size; if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE)) - dev_err_once(engine->i915->drm.dev, + drm_err_once(&engine->i915->drm, "%s context redzone overwritten!\n", engine->name); } static void execlists_context_unpin(struct intel_context *ce) { - check_redzone((void *)ce->lrc_reg_state - LRC_STATE_PN * PAGE_SIZE, + check_redzone((void *)ce->lrc_reg_state - LRC_STATE_OFFSET, ce->engine); i915_gem_object_unpin_map(ce->state->obj); } +static u32 * +gen12_emit_timestamp_wa(const struct intel_context *ce, u32 *cs) +{ + *cs++ = MI_LOAD_REGISTER_MEM_GEN8 | + MI_SRM_LRM_GLOBAL_GTT | + MI_LRI_LRM_CS_MMIO; + *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); + *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET + + CTX_TIMESTAMP * sizeof(u32); + *cs++ = 0; + + *cs++ = MI_LOAD_REGISTER_REG | + MI_LRR_SOURCE_CS_MMIO | + MI_LRI_LRM_CS_MMIO; + *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); + *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0)); + + *cs++ = MI_LOAD_REGISTER_REG | + MI_LRR_SOURCE_CS_MMIO | + MI_LRI_LRM_CS_MMIO; + *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); + *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(0)); + + return cs; +} + +static u32 * +gen12_emit_restore_scratch(const struct intel_context *ce, u32 *cs) +{ + GEM_BUG_ON(lrc_ring_gpr0(ce->engine) == -1); + + *cs++ = MI_LOAD_REGISTER_MEM_GEN8 | + MI_SRM_LRM_GLOBAL_GTT | + MI_LRI_LRM_CS_MMIO; + *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); + *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET + + (lrc_ring_gpr0(ce->engine) + 1) * sizeof(u32); + *cs++ = 0; + + return cs; +} + +static u32 * +gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs) +{ + GEM_BUG_ON(lrc_ring_cmd_buf_cctl(ce->engine) == 
-1); + + *cs++ = MI_LOAD_REGISTER_MEM_GEN8 | + MI_SRM_LRM_GLOBAL_GTT | + MI_LRI_LRM_CS_MMIO; + *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); + *cs++ = i915_ggtt_offset(ce->state) + LRC_STATE_OFFSET + + (lrc_ring_cmd_buf_cctl(ce->engine) + 1) * sizeof(u32); + *cs++ = 0; + + *cs++ = MI_LOAD_REGISTER_REG | + MI_LRR_SOURCE_CS_MMIO | + MI_LRI_LRM_CS_MMIO; + *cs++ = i915_mmio_reg_offset(GEN8_RING_CS_GPR(0, 0)); + *cs++ = i915_mmio_reg_offset(RING_CMD_BUF_CCTL(0)); + + return cs; +} + +static u32 * +gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs) +{ + cs = gen12_emit_timestamp_wa(ce, cs); + cs = gen12_emit_cmd_buf_wa(ce, cs); + cs = gen12_emit_restore_scratch(ce, cs); + + return cs; +} + +static u32 * +gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs) +{ + cs = gen12_emit_timestamp_wa(ce, cs); + cs = gen12_emit_restore_scratch(ce, cs); + + return cs; +} + +static inline u32 context_wa_bb_offset(const struct intel_context *ce) +{ + return PAGE_SIZE * ce->wa_bb_page; +} + +static u32 *context_indirect_bb(const struct intel_context *ce) +{ + void *ptr; + + GEM_BUG_ON(!ce->wa_bb_page); + + ptr = ce->lrc_reg_state; + ptr -= LRC_STATE_OFFSET; /* back to start of context image */ + ptr += context_wa_bb_offset(ce); + + return ptr; +} + +static void +setup_indirect_ctx_bb(const struct intel_context *ce, + const struct intel_engine_cs *engine, + u32 *(*emit)(const struct intel_context *, u32 *)) +{ + u32 * const start = context_indirect_bb(ce); + u32 *cs; + + cs = emit(ce, start); + GEM_BUG_ON(cs - start > I915_GTT_PAGE_SIZE / sizeof(*cs)); + while ((unsigned long)cs % CACHELINE_BYTES) + *cs++ = MI_NOOP; + + lrc_ring_setup_indirect_ctx(ce->lrc_reg_state, engine, + i915_ggtt_offset(ce->state) + + context_wa_bb_offset(ce), + (cs - start) * sizeof(*cs)); +} + static void __execlists_update_reg_state(const struct intel_context *ce, const struct intel_engine_cs *engine, @@ -3059,6 +3392,18 @@ __execlists_update_reg_state(const struct intel_context *ce, i915_oa_init_reg_state(ce, engine); } + + if (ce->wa_bb_page) { + u32 *(*fn)(const struct intel_context *ce, u32 *cs); + + fn = gen12_emit_indirect_ctx_xcs; + if (ce->engine->class == RENDER_CLASS) + fn = gen12_emit_indirect_ctx_rcs; + + /* Mutually exclusive wrt to global indirect bb */ + GEM_BUG_ON(engine->wa_ctx.indirect_ctx.size); + setup_indirect_ctx_bb(ce, engine, fn); + } } static int @@ -3077,7 +3422,7 @@ __execlists_context_pin(struct intel_context *ce, return PTR_ERR(vaddr); ce->lrc.lrca = lrc_descriptor(ce, engine) | CTX_DESC_FORCE_RESTORE; - ce->lrc_reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE; + ce->lrc_reg_state = vaddr + LRC_STATE_OFFSET; __execlists_update_reg_state(ce, engine, ce->ring->tail); return 0; @@ -3125,6 +3470,7 @@ static int gen8_emit_init_breadcrumb(struct i915_request *rq) { u32 *cs; + GEM_BUG_ON(i915_request_has_initial_breadcrumb(rq)); if (!i915_request_timeline(rq)->has_initial_breadcrumb) return 0; @@ -3151,6 +3497,56 @@ static int gen8_emit_init_breadcrumb(struct i915_request *rq) /* Record the updated position of the request's payload */ rq->infix = intel_ring_offset(rq, cs); + __set_bit(I915_FENCE_FLAG_INITIAL_BREADCRUMB, &rq->fence.flags); + + return 0; +} + +static int emit_pdps(struct i915_request *rq) +{ + const struct intel_engine_cs * const engine = rq->engine; + struct i915_ppgtt * const ppgtt = i915_vm_to_ppgtt(rq->context->vm); + int err, i; + u32 *cs; + + GEM_BUG_ON(intel_vgpu_active(rq->i915)); + + /* + * Beware ye of the dragons, this sequence is magic! 
+ * + * Small changes to this sequence can cause anything from + * GPU hangs to forcewake errors and machine lockups! + */ + + /* Flush any residual operations from the context load */ + err = engine->emit_flush(rq, EMIT_FLUSH); + if (err) + return err; + + /* Magic required to prevent forcewake errors! */ + err = engine->emit_flush(rq, EMIT_INVALIDATE); + if (err) + return err; + + cs = intel_ring_begin(rq, 4 * GEN8_3LVL_PDPES + 2); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + /* Ensure the LRI have landed before we invalidate & continue */ + *cs++ = MI_LOAD_REGISTER_IMM(2 * GEN8_3LVL_PDPES) | MI_LRI_FORCE_POSTED; + for (i = GEN8_3LVL_PDPES; i--; ) { + const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i); + u32 base = engine->mmio_base; + + *cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, i)); + *cs++ = upper_32_bits(pd_daddr); + *cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(base, i)); + *cs++ = lower_32_bits(pd_daddr); + } + *cs++ = MI_NOOP; + + intel_ring_advance(rq, cs); + return 0; } @@ -3175,6 +3571,12 @@ static int execlists_request_alloc(struct i915_request *request) * to cancel/unwind this request now. */ + if (!i915_vm_is_4lvl(request->context->vm)) { + ret = emit_pdps(request); + if (ret) + return ret; + } + /* Unconditionally invalidate GPU caches and TLBs. */ ret = request->engine->emit_flush(request, EMIT_INVALIDATE); if (ret) @@ -3475,7 +3877,8 @@ static int intel_init_workaround_bb(struct intel_engine_cs *engine) ret = lrc_setup_wa_ctx(engine); if (ret) { - DRM_DEBUG_DRIVER("Failed to setup context WA page: %d\n", ret); + drm_dbg(&engine->i915->drm, + "Failed to setup context WA page: %d\n", ret); return ret; } @@ -3508,6 +3911,72 @@ static int intel_init_workaround_bb(struct intel_engine_cs *engine) return ret; } +static void reset_csb_pointers(struct intel_engine_cs *engine) +{ + struct intel_engine_execlists * const execlists = &engine->execlists; + const unsigned int reset_value = execlists->csb_size - 1; + + ring_set_paused(engine, 0); + + /* + * Sometimes Icelake forgets to reset its pointers on a GPU reset. + * Bludgeon them with a mmio update to be sure. + */ + ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR, + 0xffff << 16 | reset_value << 8 | reset_value); + ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR); + + /* + * After a reset, the HW starts writing into CSB entry [0]. We + * therefore have to set our HEAD pointer back one entry so that + * the *first* entry we check is entry 0. To complicate this further, + * as we don't wait for the first interrupt after reset, we have to + * fake the HW write to point back to the last entry so that our + * inline comparison of our cached head position against the last HW + * write works even before the first interrupt. + */ + execlists->csb_head = reset_value; + WRITE_ONCE(*execlists->csb_write, reset_value); + wmb(); /* Make sure this is visible to HW (paranoia?) */ + + invalidate_csb_entries(&execlists->csb_status[0], + &execlists->csb_status[reset_value]); + + /* Once more for luck and our trusty paranoia */ + ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR, + 0xffff << 16 | reset_value << 8 | reset_value); + ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR); + + GEM_BUG_ON(READ_ONCE(*execlists->csb_write) != reset_value); +} + +static void execlists_sanitize(struct intel_engine_cs *engine) +{ + /* + * Poison residual state on resume, in case the suspend didn't! 
+ * + * We have to assume that across suspend/resume (or other loss + * of control) that the contents of our pinned buffers has been + * lost, replaced by garbage. Since this doesn't always happen, + * let's poison such state so that we more quickly spot when + * we falsely assume it has been preserved. + */ + if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) + memset(engine->status_page.addr, POISON_INUSE, PAGE_SIZE); + + reset_csb_pointers(engine); + + /* + * The kernel_context HWSP is stored in the status_page. As above, + * that may be lost on resume/initialisation, and so we need to + * reset the value in the HWSP. + */ + intel_timeline_reset_seqno(engine->kernel_context->timeline); + + /* And scrub the dirty cachelines for the HWSP */ + clflush_cache_range(engine->status_page.addr, PAGE_SIZE); +} + static void enable_error_interrupt(struct intel_engine_cs *engine) { u32 status; @@ -3518,7 +3987,7 @@ static void enable_error_interrupt(struct intel_engine_cs *engine) status = ENGINE_READ(engine, RING_ESR); if (unlikely(status)) { - dev_err(engine->i915->drm.dev, + drm_err(&engine->i915->drm, "engine '%s' resumed still in error: %08x\n", engine->name, status); __intel_gt_reset(engine->gt, engine->mask); @@ -3582,7 +4051,8 @@ static bool unexpected_starting_state(struct intel_engine_cs *engine) bool unexpected = false; if (ENGINE_READ_FW(engine, RING_MI_MODE) & STOP_RING) { - DRM_DEBUG_DRIVER("STOP_RING still set in RING_MI_MODE\n"); + drm_dbg(&engine->i915->drm, + "STOP_RING still set in RING_MI_MODE\n"); unexpected = true; } @@ -3642,39 +4112,10 @@ static void execlists_reset_prepare(struct intel_engine_cs *engine) * * FIXME: Wa for more modern gens needs to be validated */ + ring_set_paused(engine, 1); intel_engine_stop_cs(engine); -} - -static void reset_csb_pointers(struct intel_engine_cs *engine) -{ - struct intel_engine_execlists * const execlists = &engine->execlists; - const unsigned int reset_value = execlists->csb_size - 1; - - ring_set_paused(engine, 0); - - /* - * After a reset, the HW starts writing into CSB entry [0]. We - * therefore have to set our HEAD pointer back one entry so that - * the *first* entry we check is entry 0. To complicate this further, - * as we don't wait for the first interrupt after reset, we have to - * fake the HW write to point back to the last entry so that our - * inline comparison of our cached head position against the last HW - * write works even before the first interrupt. - */ - execlists->csb_head = reset_value; - WRITE_ONCE(*execlists->csb_write, reset_value); - wmb(); /* Make sure this is visible to HW (paranoia?) */ - /* - * Sometimes Icelake forgets to reset its pointers on a GPU reset. - * Bludgeon them with a mmio update to be sure. - */ - ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR, - reset_value << 8 | reset_value); - ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR); - - invalidate_csb_entries(&execlists->csb_status[0], - &execlists->csb_status[reset_value]); + engine->execlists.reset_ccid = active_ccid(engine); } static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine) @@ -3717,7 +4158,7 @@ static void __execlists_reset(struct intel_engine_cs *engine, bool stalled) * its request, it was still running at the time of the * reset and will have been clobbered. 
*/ - rq = execlists_active(execlists); + rq = active_context(engine, engine->execlists.reset_ccid); if (!rq) goto unwind; @@ -3767,8 +4208,6 @@ static void __execlists_reset(struct intel_engine_cs *engine, bool stalled) * image back to the expected values to skip over the guilty request. */ __i915_request_reset(rq, stalled); - if (!stalled) - goto out_replay; /* * We want a simple context + ring to execute the breadcrumb update. @@ -3778,9 +4217,6 @@ static void __execlists_reset(struct intel_engine_cs *engine, bool stalled) * future request will be after userspace has had the opportunity * to recreate its own state. */ - GEM_BUG_ON(!intel_context_is_pinned(ce)); - restore_default_state(ce, engine); - out_replay: ENGINE_TRACE(engine, "replay {head:%04x, tail:%04x}\n", head, ce->ring->tail); @@ -4146,6 +4582,42 @@ static u32 preparser_disable(bool state) return MI_ARB_CHECK | 1 << 8 | state; } +static i915_reg_t aux_inv_reg(const struct intel_engine_cs *engine) +{ + static const i915_reg_t vd[] = { + GEN12_VD0_AUX_NV, + GEN12_VD1_AUX_NV, + GEN12_VD2_AUX_NV, + GEN12_VD3_AUX_NV, + }; + + static const i915_reg_t ve[] = { + GEN12_VE0_AUX_NV, + GEN12_VE1_AUX_NV, + }; + + if (engine->class == VIDEO_DECODE_CLASS) + return vd[engine->instance]; + + if (engine->class == VIDEO_ENHANCEMENT_CLASS) + return ve[engine->instance]; + + GEM_BUG_ON("unknown aux_inv_reg\n"); + + return INVALID_MMIO_REG; +} + +static u32 * +gen12_emit_aux_table_inv(const i915_reg_t inv_reg, u32 *cs) +{ + *cs++ = MI_LOAD_REGISTER_IMM(1); + *cs++ = i915_mmio_reg_offset(inv_reg); + *cs++ = AUX_INV; + *cs++ = MI_NOOP; + + return cs; +} + static int gen12_emit_flush_render(struct i915_request *request, u32 mode) { @@ -4154,13 +4626,13 @@ static int gen12_emit_flush_render(struct i915_request *request, u32 *cs; flags |= PIPE_CONTROL_TILE_CACHE_FLUSH; + flags |= PIPE_CONTROL_FLUSH_L3; flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; /* Wa_1409600907:tgl */ flags |= PIPE_CONTROL_DEPTH_STALL; flags |= PIPE_CONTROL_DC_FLUSH_ENABLE; flags |= PIPE_CONTROL_FLUSH_ENABLE; - flags |= PIPE_CONTROL_HDC_PIPELINE_FLUSH; flags |= PIPE_CONTROL_STORE_DATA_INDEX; flags |= PIPE_CONTROL_QW_WRITE; @@ -4171,7 +4643,9 @@ static int gen12_emit_flush_render(struct i915_request *request, if (IS_ERR(cs)) return PTR_ERR(cs); - cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); + cs = gen12_emit_pipe_control(cs, + PIPE_CONTROL0_HDC_PIPELINE_FLUSH, + flags, LRC_PPHWSP_SCRATCH_ADDR); intel_ring_advance(request, cs); } @@ -4186,14 +4660,13 @@ static int gen12_emit_flush_render(struct i915_request *request, flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; - flags |= PIPE_CONTROL_L3_RO_CACHE_INVALIDATE; flags |= PIPE_CONTROL_STORE_DATA_INDEX; flags |= PIPE_CONTROL_QW_WRITE; flags |= PIPE_CONTROL_CS_STALL; - cs = intel_ring_begin(request, 8); + cs = intel_ring_begin(request, 8 + 4); if (IS_ERR(cs)) return PTR_ERR(cs); @@ -4206,6 +4679,9 @@ static int gen12_emit_flush_render(struct i915_request *request, cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR); + /* hsdes: 1809175790 */ + cs = gen12_emit_aux_table_inv(GEN12_GFX_CCS_AUX_NV, cs); + *cs++ = preparser_disable(false); intel_ring_advance(request, cs); } @@ -4213,6 +4689,56 @@ static int gen12_emit_flush_render(struct i915_request *request, return 0; } +static int gen12_emit_flush(struct i915_request *request, u32 mode) +{ + intel_engine_mask_t aux_inv = 0; + u32 
cmd, *cs; + + if (mode & EMIT_INVALIDATE) + aux_inv = request->engine->mask & ~BIT(BCS0); + + cs = intel_ring_begin(request, + 4 + (aux_inv ? 2 * hweight8(aux_inv) + 2 : 0)); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + cmd = MI_FLUSH_DW + 1; + + /* We always require a command barrier so that subsequent + * commands, such as breadcrumb interrupts, are strictly ordered + * wrt the contents of the write cache being flushed to memory + * (and thus being coherent from the CPU). + */ + cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW; + + if (mode & EMIT_INVALIDATE) { + cmd |= MI_INVALIDATE_TLB; + if (request->engine->class == VIDEO_DECODE_CLASS) + cmd |= MI_INVALIDATE_BSD; + } + + *cs++ = cmd; + *cs++ = LRC_PPHWSP_SCRATCH_ADDR; + *cs++ = 0; /* upper addr */ + *cs++ = 0; /* value */ + + if (aux_inv) { /* hsdes: 1809175790 */ + struct intel_engine_cs *engine; + unsigned int tmp; + + *cs++ = MI_LOAD_REGISTER_IMM(hweight8(aux_inv)); + for_each_engine_masked(engine, request->engine->gt, + aux_inv, tmp) { + *cs++ = i915_mmio_reg_offset(aux_inv_reg(engine)); + *cs++ = AUX_INV; + } + *cs++ = MI_NOOP; + } + intel_ring_advance(request, cs); + + return 0; +} + /* * Reserve space for 2 NOOPs at the end of each request to be * used as a workaround for not being allowed to do lite @@ -4242,8 +4768,7 @@ static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs) } static __always_inline u32* -gen8_emit_fini_breadcrumb_footer(struct i915_request *request, - u32 *cs) +gen8_emit_fini_breadcrumb_tail(struct i915_request *request, u32 *cs) { *cs++ = MI_USER_INTERRUPT; @@ -4257,14 +4782,16 @@ gen8_emit_fini_breadcrumb_footer(struct i915_request *request, return gen8_emit_wa_tail(request, cs); } -static u32 *gen8_emit_fini_breadcrumb(struct i915_request *request, u32 *cs) +static u32 *emit_xcs_breadcrumb(struct i915_request *request, u32 *cs) { - cs = gen8_emit_ggtt_write(cs, - request->fence.seqno, - i915_request_active_timeline(request)->hwsp_offset, - 0); + u32 addr = i915_request_active_timeline(request)->hwsp_offset; + + return gen8_emit_ggtt_write(cs, request->fence.seqno, addr, 0); +} - return gen8_emit_fini_breadcrumb_footer(request, cs); +static u32 *gen8_emit_fini_breadcrumb(struct i915_request *rq, u32 *cs) +{ + return gen8_emit_fini_breadcrumb_tail(rq, emit_xcs_breadcrumb(rq, cs)); } static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) @@ -4282,7 +4809,7 @@ static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) PIPE_CONTROL_FLUSH_ENABLE | PIPE_CONTROL_CS_STALL); - return gen8_emit_fini_breadcrumb_footer(request, cs); + return gen8_emit_fini_breadcrumb_tail(request, cs); } static u32 * @@ -4298,7 +4825,7 @@ gen11_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) PIPE_CONTROL_DC_FLUSH_ENABLE | PIPE_CONTROL_FLUSH_ENABLE); - return gen8_emit_fini_breadcrumb_footer(request, cs); + return gen8_emit_fini_breadcrumb_tail(request, cs); } /* @@ -4336,7 +4863,7 @@ static u32 *gen12_emit_preempt_busywait(struct i915_request *request, u32 *cs) } static __always_inline u32* -gen12_emit_fini_breadcrumb_footer(struct i915_request *request, u32 *cs) +gen12_emit_fini_breadcrumb_tail(struct i915_request *request, u32 *cs) { *cs++ = MI_USER_INTERRUPT; @@ -4350,33 +4877,29 @@ gen12_emit_fini_breadcrumb_footer(struct i915_request *request, u32 *cs) return gen8_emit_wa_tail(request, cs); } -static u32 *gen12_emit_fini_breadcrumb(struct i915_request *request, u32 *cs) +static u32 *gen12_emit_fini_breadcrumb(struct i915_request *rq, u32 *cs) 
{ - cs = gen8_emit_ggtt_write(cs, - request->fence.seqno, - i915_request_active_timeline(request)->hwsp_offset, - 0); - - return gen12_emit_fini_breadcrumb_footer(request, cs); + return gen12_emit_fini_breadcrumb_tail(rq, emit_xcs_breadcrumb(rq, cs)); } static u32 * gen12_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) { - cs = gen8_emit_ggtt_write_rcs(cs, - request->fence.seqno, - i915_request_active_timeline(request)->hwsp_offset, - PIPE_CONTROL_CS_STALL | - PIPE_CONTROL_TILE_CACHE_FLUSH | - PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | - PIPE_CONTROL_DEPTH_CACHE_FLUSH | - /* Wa_1409600907:tgl */ - PIPE_CONTROL_DEPTH_STALL | - PIPE_CONTROL_DC_FLUSH_ENABLE | - PIPE_CONTROL_FLUSH_ENABLE | - PIPE_CONTROL_HDC_PIPELINE_FLUSH); + cs = gen12_emit_ggtt_write_rcs(cs, + request->fence.seqno, + i915_request_active_timeline(request)->hwsp_offset, + PIPE_CONTROL0_HDC_PIPELINE_FLUSH, + PIPE_CONTROL_CS_STALL | + PIPE_CONTROL_TILE_CACHE_FLUSH | + PIPE_CONTROL_FLUSH_L3 | + PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | + PIPE_CONTROL_DEPTH_CACHE_FLUSH | + /* Wa_1409600907:tgl */ + PIPE_CONTROL_DEPTH_STALL | + PIPE_CONTROL_DC_FLUSH_ENABLE | + PIPE_CONTROL_FLUSH_ENABLE); - return gen12_emit_fini_breadcrumb_footer(request, cs); + return gen12_emit_fini_breadcrumb_tail(request, cs); } static void execlists_park(struct intel_engine_cs *engine) @@ -4428,6 +4951,8 @@ static void execlists_shutdown(struct intel_engine_cs *engine) static void execlists_release(struct intel_engine_cs *engine) { + engine->sanitize = NULL; /* no longer in control, nothing to sanitize */ + execlists_shutdown(engine); intel_engine_cleanup_common(engine); @@ -4447,9 +4972,10 @@ logical_ring_default_vfuncs(struct intel_engine_cs *engine) engine->emit_flush = gen8_emit_flush; engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb; engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb; - if (INTEL_GEN(engine->i915) >= 12) + if (INTEL_GEN(engine->i915) >= 12) { engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb; - + engine->emit_flush = gen12_emit_flush; + } engine->set_default_submission = intel_execlists_set_default_submission; if (INTEL_GEN(engine->i915) < 11) { @@ -4530,7 +5056,7 @@ int intel_execlists_submission_setup(struct intel_engine_cs *engine) * because we only expect rare glitches but nothing * critical to prevent us from using GPU */ - DRM_ERROR("WA batch buffer initialization failed\n"); + drm_err(&i915->drm, "WA batch buffer initialization failed\n"); if (HAS_LOGICAL_RING_ELSQ(i915)) { execlists->submit_reg = uncore->regs + @@ -4558,48 +5084,13 @@ int intel_execlists_submission_setup(struct intel_engine_cs *engine) execlists->ccid |= engine->class << (GEN11_ENGINE_CLASS_SHIFT - 32); } - reset_csb_pointers(engine); - /* Finally, take ownership and responsibility for cleanup! 
*/ + engine->sanitize = execlists_sanitize; engine->release = execlists_release; return 0; } -static u32 intel_lr_indirect_ctx_offset(const struct intel_engine_cs *engine) -{ - u32 indirect_ctx_offset; - - switch (INTEL_GEN(engine->i915)) { - default: - MISSING_CASE(INTEL_GEN(engine->i915)); - /* fall through */ - case 12: - indirect_ctx_offset = - GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; - break; - case 11: - indirect_ctx_offset = - GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; - break; - case 10: - indirect_ctx_offset = - GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; - break; - case 9: - indirect_ctx_offset = - GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; - break; - case 8: - indirect_ctx_offset = - GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT; - break; - } - - return indirect_ctx_offset; -} - - static void init_common_reg_state(u32 * const regs, const struct intel_engine_cs *engine, const struct intel_ring *ring, @@ -4617,30 +5108,27 @@ static void init_common_reg_state(u32 * const regs, regs[CTX_CONTEXT_CONTROL] = ctl; regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID; + regs[CTX_TIMESTAMP] = 0; } static void init_wa_bb_reg_state(u32 * const regs, - const struct intel_engine_cs *engine, - u32 pos_bb_per_ctx) + const struct intel_engine_cs *engine) { const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx; if (wa_ctx->per_ctx.size) { const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma); - regs[pos_bb_per_ctx] = + GEM_BUG_ON(lrc_ring_wa_bb_per_ctx(engine) == -1); + regs[lrc_ring_wa_bb_per_ctx(engine) + 1] = (ggtt_offset + wa_ctx->per_ctx.offset) | 0x01; } if (wa_ctx->indirect_ctx.size) { - const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma); - - regs[pos_bb_per_ctx + 2] = - (ggtt_offset + wa_ctx->indirect_ctx.offset) | - (wa_ctx->indirect_ctx.size / CACHELINE_BYTES); - - regs[pos_bb_per_ctx + 4] = - intel_lr_indirect_ctx_offset(engine) << 6; + lrc_ring_setup_indirect_ctx(regs, engine, + i915_ggtt_offset(wa_ctx->vma) + + wa_ctx->indirect_ctx.offset, + wa_ctx->indirect_ctx.size); } } @@ -4689,10 +5177,7 @@ static void execlists_init_reg_state(u32 *regs, init_common_reg_state(regs, engine, ring, inhibit); init_ppgtt_reg_state(regs, vm_alias(ce->vm)); - init_wa_bb_reg_state(regs, engine, - INTEL_GEN(engine->i915) >= 12 ? - GEN12_CTX_BB_PER_CTX_PTR : - CTX_BB_PER_CTX_PTR); + init_wa_bb_reg_state(regs, engine); __reset_stop_ring(regs, engine); } @@ -4705,29 +5190,18 @@ populate_lr_context(struct intel_context *ce, { bool inhibit = true; void *vaddr; - int ret; vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB); if (IS_ERR(vaddr)) { - ret = PTR_ERR(vaddr); - DRM_DEBUG_DRIVER("Could not map object pages! (%d)\n", ret); - return ret; + drm_dbg(&engine->i915->drm, "Could not map object pages!\n"); + return PTR_ERR(vaddr); } set_redzone(vaddr, engine); if (engine->default_state) { - void *defaults; - - defaults = i915_gem_object_pin_map(engine->default_state, - I915_MAP_WB); - if (IS_ERR(defaults)) { - ret = PTR_ERR(defaults); - goto err_unpin_ctx; - } - - memcpy(vaddr, defaults, engine->context_size); - i915_gem_object_unpin_map(engine->default_state); + shmem_read(engine->default_state, 0, + vaddr, engine->context_size); __set_bit(CONTEXT_VALID_BIT, &ce->flags); inhibit = false; } @@ -4739,14 +5213,12 @@ populate_lr_context(struct intel_context *ce, * The second page of the context object contains some registers which * must be set up prior to the first execution. 
*/ - execlists_init_reg_state(vaddr + LRC_STATE_PN * PAGE_SIZE, + execlists_init_reg_state(vaddr + LRC_STATE_OFFSET, ce, engine, ring, inhibit); - ret = 0; -err_unpin_ctx: __i915_gem_object_flush_map(ctx_obj, 0, engine->context_size); i915_gem_object_unpin_map(ctx_obj); - return ret; + return 0; } static int __execlists_context_alloc(struct intel_context *ce, @@ -4764,6 +5236,11 @@ static int __execlists_context_alloc(struct intel_context *ce, if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) context_size += I915_GTT_PAGE_SIZE; /* for redzone */ + if (INTEL_GEN(engine->i915) == 12) { + ce->wa_bb_page = context_size / PAGE_SIZE; + context_size += PAGE_SIZE; + } + ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size); if (IS_ERR(ctx_obj)) return PTR_ERR(ctx_obj); @@ -4803,7 +5280,8 @@ static int __execlists_context_alloc(struct intel_context *ce, ret = populate_lr_context(ce, ctx_obj, engine, ring); if (ret) { - DRM_DEBUG_DRIVER("Failed to populate LRC: %d\n", ret); + drm_dbg(&engine->i915->drm, + "Failed to populate LRC: %d\n", ret); goto error_ring_free; } @@ -4856,6 +5334,8 @@ static void virtual_context_destroy(struct kref *kref) __execlists_context_fini(&ve->context); intel_context_fini(&ve->context); + intel_engine_free_request_pool(&ve->base); + kfree(ve->bonds); kfree(ve); } @@ -4980,12 +5460,15 @@ static void virtual_submission_tasklet(unsigned long data) return; local_irq_disable(); - for (n = 0; READ_ONCE(ve->request) && n < ve->num_siblings; n++) { - struct intel_engine_cs *sibling = ve->siblings[n]; + for (n = 0; n < ve->num_siblings; n++) { + struct intel_engine_cs *sibling = READ_ONCE(ve->siblings[n]); struct ve_node * const node = &ve->nodes[sibling->id]; struct rb_node **parent, *rb; bool first; + if (!READ_ONCE(ve->request)) + break; /* already handled by a sibling's tasklet */ + if (unlikely(!(mask & sibling->mask))) { if (!RB_EMPTY_NODE(&node->rb)) { spin_lock(&sibling->active.lock); @@ -5036,10 +5519,8 @@ static void virtual_submission_tasklet(unsigned long data) submit_engine: GEM_BUG_ON(RB_EMPTY_NODE(&node->rb)); node->prio = prio; - if (first && prio > sibling->execlists.queue_priority_hint) { - sibling->execlists.queue_priority_hint = prio; + if (first && prio > sibling->execlists.queue_priority_hint) tasklet_hi_schedule(&sibling->execlists.tasklet); - } spin_unlock(&sibling->active.lock); } |