Diffstat (limited to 'drivers/gpu/drm/i915/intel_lrc.c')
-rw-r--r-- | drivers/gpu/drm/i915/intel_lrc.c | 1001
1 file changed, 378 insertions, 623 deletions
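
Most of the churn in the diff below comes from one refactor: emitters no longer take a (batch, index) pair and call wa_ctx_emit(), and intel_ring_begin() no longer pairs with intel_ring_emit(); instead each emitter receives a u32 * cursor, writes dwords through it, and returns the advanced pointer, which lets intel_init_workaround_bb() record each workaround batch buffer's offset and size generically. A minimal standalone sketch of that pattern follows (plain C; the opcodes, names, and sizes are placeholders for illustration, not the driver's API):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

typedef uint32_t u32;

/* Each emitter writes its dwords and returns the advanced pointer,
 * replacing the old "batch[index++] = ..." bookkeeping. */
static u32 *emit_indirectctx_bb(u32 *batch)
{
	*batch++ = 0x01000000;		/* placeholder command */
	*batch++ = 0x00000000;		/* placeholder operand */
	while ((uintptr_t)batch % 64)	/* pad to a cacheline, as the real BBs do */
		*batch++ = 0;		/* MI_NOOP stand-in */
	return batch;
}

static u32 *emit_perctx_bb(u32 *batch)
{
	*batch++ = 0x05000000;		/* batch-buffer-end stand-in */
	return batch;
}

typedef u32 *(*wa_bb_func_t)(u32 *batch);

int main(void)
{
	static u32 page[1024];		/* stand-in for the kmap'd WA context page */
	wa_bb_func_t fn[] = { emit_indirectctx_bb, emit_perctx_bb };
	struct { size_t offset, size; } wa_bb[2];
	u32 *batch = page, *ptr = page;

	for (unsigned int i = 0; i < 2; i++) {
		wa_bb[i].offset = (size_t)((char *)ptr - (char *)batch);	/* byte offset, as in the patch */
		ptr = fn[i](ptr);
		wa_bb[i].size = (size_t)((char *)ptr - (char *)batch) - wa_bb[i].offset;
		printf("wa_bb[%u]: offset %zu, size %zu bytes\n",
		       i, wa_bb[i].offset, wa_bb[i].size);
	}
	return 0;
}

The same convention is what allows the per-Gen gen8_/gen9_init_*_bb() hooks in the diff to be collected into the wa_bb_fn[] table and emitted in a single loop.
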
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c index ebf8023d21e6..c8f7c631fc1f 100644 --- a/drivers/gpu/drm/i915/intel_lrc.c +++ b/drivers/gpu/drm/i915/intel_lrc.c @@ -190,13 +190,7 @@ #define CTX_R_PWR_CLK_STATE 0x42 #define CTX_GPGPU_CSR_BASE_ADDRESS 0x44 -#define GEN8_CTX_VALID (1<<0) -#define GEN8_CTX_FORCE_PD_RESTORE (1<<1) -#define GEN8_CTX_FORCE_RESTORE (1<<2) -#define GEN8_CTX_L3LLC_COHERENT (1<<5) -#define GEN8_CTX_PRIVILEGE (1<<8) - -#define ASSIGN_CTX_REG(reg_state, pos, reg, val) do { \ +#define CTX_REG(reg_state, pos, reg, val) do { \ (reg_state)[(pos)+0] = i915_mmio_reg_offset(reg); \ (reg_state)[(pos)+1] = (val); \ } while (0) @@ -212,14 +206,6 @@ reg_state[CTX_PDP0_LDW + 1] = lower_32_bits(px_dma(&ppgtt->pml4)); \ } while (0) -enum { - FAULT_AND_HANG = 0, - FAULT_AND_HALT, /* Debug only */ - FAULT_AND_STREAM, - FAULT_AND_CONTINUE /* Unsupported */ -}; -#define GEN8_CTX_ID_SHIFT 32 -#define GEN8_CTX_ID_WIDTH 21 #define GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT 0x17 #define GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT 0x26 @@ -267,30 +253,6 @@ int intel_sanitize_enable_execlists(struct drm_i915_private *dev_priv, int enabl return 0; } -static void -logical_ring_init_platform_invariants(struct intel_engine_cs *engine) -{ - struct drm_i915_private *dev_priv = engine->i915; - - engine->disable_lite_restore_wa = - IS_BXT_REVID(dev_priv, 0, BXT_REVID_A1) && - (engine->id == VCS || engine->id == VCS2); - - engine->ctx_desc_template = GEN8_CTX_VALID; - if (IS_GEN8(dev_priv)) - engine->ctx_desc_template |= GEN8_CTX_L3LLC_COHERENT; - engine->ctx_desc_template |= GEN8_CTX_PRIVILEGE; - - /* TODO: WaDisableLiteRestore when we start using semaphore - * signalling between Command Streamers */ - /* ring->ctx_desc_template |= GEN8_CTX_FORCE_RESTORE; */ - - /* WaEnableForceRestoreInCtxtDescForVCS:skl */ - /* WaEnableForceRestoreInCtxtDescForVCS:bxt */ - if (engine->disable_lite_restore_wa) - engine->ctx_desc_template |= GEN8_CTX_FORCE_RESTORE; -} - /** * intel_lr_context_descriptor_update() - calculate & cache the descriptor * descriptor for a pinned context @@ -304,7 +266,7 @@ logical_ring_init_platform_invariants(struct intel_engine_cs *engine) * * This is what a descriptor looks like, from LSB to MSB:: * - * bits 0-11: flags, GEN8_CTX_* (cached in ctx_desc_template) + * bits 0-11: flags, GEN8_CTX_* (cached in ctx->desc_template) * bits 12-31: LRCA, GTT address of (the HWSP of) this context * bits 32-52: ctx ID, a globally unique tag * bits 53-54: mbz, reserved for use by hardware @@ -319,8 +281,7 @@ intel_lr_context_descriptor_update(struct i915_gem_context *ctx, BUILD_BUG_ON(MAX_CONTEXT_HW_ID > (1<<GEN8_CTX_ID_WIDTH)); - desc = ctx->desc_template; /* bits 3-4 */ - desc |= engine->ctx_desc_template; /* bits 0-11 */ + desc = ctx->desc_template; /* bits 0-11 */ desc |= i915_ggtt_offset(ce->state) + LRC_PPHWSP_PN * PAGE_SIZE; /* bits 12-31 */ desc |= (u64)ctx->hw_id << GEN8_CTX_ID_SHIFT; /* bits 32-52 */ @@ -345,7 +306,8 @@ execlists_context_status_change(struct drm_i915_gem_request *rq, if (!IS_ENABLED(CONFIG_DRM_I915_GVT)) return; - atomic_notifier_call_chain(&rq->ctx->status_notifier, status, rq); + atomic_notifier_call_chain(&rq->engine->context_status_notifier, + status, rq); } static void @@ -364,6 +326,7 @@ static u64 execlists_update_context(struct drm_i915_gem_request *rq) rq->ctx->ppgtt ?: rq->i915->mm.aliasing_ppgtt; u32 *reg_state = ce->lrc_reg_state; + assert_ring_tail_valid(rq->ring, rq->tail); reg_state[CTX_RING_TAIL+1] = rq->tail; /* True 32b 
PPGTT with dynamic page allocation: update PDP @@ -371,7 +334,7 @@ static u64 execlists_update_context(struct drm_i915_gem_request *rq) * PML4 is allocated during ppgtt init, so this is not needed * in 48-bit mode. */ - if (ppgtt && !USES_FULL_48BIT_PPGTT(ppgtt->base.dev)) + if (ppgtt && !i915_vm_is_48bit(&ppgtt->base)) execlists_update_context_pdps(ppgtt, reg_state); return ce->lrc_desc; @@ -385,17 +348,20 @@ static void execlists_submit_ports(struct intel_engine_cs *engine) dev_priv->regs + i915_mmio_reg_offset(RING_ELSP(engine)); u64 desc[2]; + GEM_BUG_ON(port[0].count > 1); if (!port[0].count) execlists_context_status_change(port[0].request, INTEL_CONTEXT_SCHEDULE_IN); desc[0] = execlists_update_context(port[0].request); - engine->preempt_wa = port[0].count++; /* bdw only? fixed on skl? */ + GEM_DEBUG_EXEC(port[0].context_id = upper_32_bits(desc[0])); + port[0].count++; if (port[1].request) { GEM_BUG_ON(port[1].count); execlists_context_status_change(port[1].request, INTEL_CONTEXT_SCHEDULE_IN); desc[1] = execlists_update_context(port[1].request); + GEM_DEBUG_EXEC(port[1].context_id = upper_32_bits(desc[1])); port[1].count = 1; } else { desc[1] = 0; @@ -433,7 +399,6 @@ static void execlists_dequeue(struct intel_engine_cs *engine) { struct drm_i915_gem_request *last; struct execlist_port *port = engine->execlist_port; - unsigned long flags; struct rb_node *rb; bool submit = false; @@ -470,7 +435,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) * and context switches) submission. */ - spin_lock_irqsave(&engine->timeline->lock, flags); + spin_lock_irq(&engine->timeline->lock); rb = engine->execlist_first; while (rb) { struct drm_i915_gem_request *cursor = @@ -514,6 +479,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) cursor->priotree.priority = INT_MAX; __i915_gem_request_submit(cursor); + trace_i915_gem_request_in(cursor, port - engine->execlist_port); last = cursor; submit = true; } @@ -521,7 +487,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) i915_gem_request_assign(&port->request, last); engine->execlist_first = rb; } - spin_unlock_irqrestore(&engine->timeline->lock, flags); + spin_unlock_irq(&engine->timeline->lock); if (submit) execlists_submit_ports(engine); @@ -532,37 +498,11 @@ static bool execlists_elsp_idle(struct intel_engine_cs *engine) return !engine->execlist_port[0].request; } -/** - * intel_execlists_idle() - Determine if all engine submission ports are idle - * @dev_priv: i915 device private - * - * Return true if there are no requests pending on any of the submission ports - * of any engines. 
- */ -bool intel_execlists_idle(struct drm_i915_private *dev_priv) +static bool execlists_elsp_ready(const struct intel_engine_cs *engine) { - struct intel_engine_cs *engine; - enum intel_engine_id id; - - if (!i915.enable_execlists) - return true; - - for_each_engine(engine, dev_priv, id) - if (!execlists_elsp_idle(engine)) - return false; + const struct execlist_port *port = engine->execlist_port; - return true; -} - -static bool execlists_elsp_ready(struct intel_engine_cs *engine) -{ - int port; - - port = 1; /* wait for a free slot */ - if (engine->disable_lite_restore_wa || engine->preempt_wa) - port = 0; /* wait for GPU to be idle before continuing */ - - return !engine->execlist_port[port].request; + return port[0].count + port[1].count < 2; } /* @@ -577,44 +517,80 @@ static void intel_lrc_irq_handler(unsigned long data) intel_uncore_forcewake_get(dev_priv, engine->fw_domains); - if (!execlists_elsp_idle(engine)) { + /* Prefer doing test_and_clear_bit() as a two stage operation to avoid + * imposing the cost of a locked atomic transaction when submitting a + * new request (outside of the context-switch interrupt). + */ + while (test_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted)) { u32 __iomem *csb_mmio = dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine)); u32 __iomem *buf = dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_BUF_LO(engine, 0)); - unsigned int csb, head, tail; - - csb = readl(csb_mmio); - head = GEN8_CSB_READ_PTR(csb); - tail = GEN8_CSB_WRITE_PTR(csb); - if (tail < head) - tail += GEN8_CSB_ENTRIES; - while (head < tail) { - unsigned int idx = ++head % GEN8_CSB_ENTRIES; - unsigned int status = readl(buf + 2 * idx); + unsigned int head, tail; + + /* The write will be ordered by the uncached read (itself + * a memory barrier), so we do not need another in the form + * of a locked instruction. The race between the interrupt + * handler and the split test/clear is harmless as we order + * our clear before the CSB read. If the interrupt arrived + * first between the test and the clear, we read the updated + * CSB and clear the bit. If the interrupt arrives as we read + * the CSB or later (i.e. after we had cleared the bit) the bit + * is set and we do a new loop. + */ + __clear_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted); + head = readl(csb_mmio); + tail = GEN8_CSB_WRITE_PTR(head); + head = GEN8_CSB_READ_PTR(head); + while (head != tail) { + unsigned int status; + + if (++head == GEN8_CSB_ENTRIES) + head = 0; + + /* We are flying near dragons again. + * + * We hold a reference to the request in execlist_port[] + * but no more than that. We are operating in softirq + * context and so cannot hold any mutex or sleep. That + * prevents us stopping the requests we are processing + * in port[] from being retired simultaneously (the + * breadcrumb will be complete before we see the + * context-switch). As we only hold the reference to the + * request, any pointer chasing underneath the request + * is subject to a potential use-after-free. Thus we + * store all of the bookkeeping within port[] as + * required, and avoid using unguarded pointers beneath + * request itself. The same applies to the atomic + * status notifier. 
+ */ + status = readl(buf + 2 * head); if (!(status & GEN8_CTX_STATUS_COMPLETED_MASK)) continue; + /* Check the context/desc id for this event matches */ + GEM_DEBUG_BUG_ON(readl(buf + 2 * head + 1) != + port[0].context_id); + GEM_BUG_ON(port[0].count == 0); if (--port[0].count == 0) { GEM_BUG_ON(status & GEN8_CTX_STATUS_PREEMPTED); + GEM_BUG_ON(!i915_gem_request_completed(port[0].request)); execlists_context_status_change(port[0].request, INTEL_CONTEXT_SCHEDULE_OUT); + trace_i915_gem_request_out(port[0].request); i915_gem_request_put(port[0].request); port[0] = port[1]; memset(&port[1], 0, sizeof(port[1])); - - engine->preempt_wa = false; } GEM_BUG_ON(port[0].count == 0 && !(status & GEN8_CTX_STATUS_ACTIVE_IDLE)); } - writel(_MASKED_FIELD(GEN8_CSB_READ_PTR_MASK, - GEN8_CSB_WRITE_PTR(csb) << 8), + writel(_MASKED_FIELD(GEN8_CSB_READ_PTR_MASK, head << 8), csb_mmio); } @@ -658,10 +634,11 @@ static void execlists_submit_request(struct drm_i915_gem_request *request) /* Will be called from irq-context when using foreign fences. */ spin_lock_irqsave(&engine->timeline->lock, flags); - if (insert_request(&request->priotree, &engine->execlist_queue)) + if (insert_request(&request->priotree, &engine->execlist_queue)) { engine->execlist_first = &request->priotree.node; - if (execlists_elsp_idle(engine)) - tasklet_hi_schedule(&engine->irq_tasklet); + if (execlists_elsp_ready(engine)) + tasklet_hi_schedule(&engine->irq_tasklet); + } spin_unlock_irqrestore(&engine->timeline->lock, flags); } @@ -669,15 +646,14 @@ static void execlists_submit_request(struct drm_i915_gem_request *request) static struct intel_engine_cs * pt_lock_engine(struct i915_priotree *pt, struct intel_engine_cs *locked) { - struct intel_engine_cs *engine; + struct intel_engine_cs *engine = + container_of(pt, struct drm_i915_gem_request, priotree)->engine; + + GEM_BUG_ON(!locked); - engine = container_of(pt, - struct drm_i915_gem_request, - priotree)->engine; if (engine != locked) { - if (locked) - spin_unlock_irq(&locked->timeline->lock); - spin_lock_irq(&engine->timeline->lock); + spin_unlock(&locked->timeline->lock); + spin_lock(&engine->timeline->lock); } return engine; @@ -685,7 +661,7 @@ pt_lock_engine(struct i915_priotree *pt, struct intel_engine_cs *locked) static void execlists_schedule(struct drm_i915_gem_request *request, int prio) { - struct intel_engine_cs *engine = NULL; + struct intel_engine_cs *engine; struct i915_dependency *dep, *p; struct i915_dependency stack; LIST_HEAD(dfs); @@ -719,26 +695,23 @@ static void execlists_schedule(struct drm_i915_gem_request *request, int prio) list_for_each_entry_safe(dep, p, &dfs, dfs_link) { struct i915_priotree *pt = dep->signaler; - list_for_each_entry(p, &pt->signalers_list, signal_link) + /* Within an engine, there can be no cycle, but we may + * refer to the same dependency chain multiple times + * (redundant dependencies are not eliminated) and across + * engines. + */ + list_for_each_entry(p, &pt->signalers_list, signal_link) { + GEM_BUG_ON(p->signaler->priority < pt->priority); if (prio > READ_ONCE(p->signaler->priority)) list_move_tail(&p->dfs_link, &dfs); + } list_safe_reset_next(dep, p, dfs_link); - if (!RB_EMPTY_NODE(&pt->node)) - continue; - - engine = pt_lock_engine(pt, engine); - - /* If it is not already in the rbtree, we can update the - * priority inplace and skip over it (and its dependencies) - * if it is referenced *again* as we descend the dfs. 
- */ - if (prio > pt->priority && RB_EMPTY_NODE(&pt->node)) { - pt->priority = prio; - list_del_init(&dep->dfs_link); - } } + engine = request->engine; + spin_lock_irq(&engine->timeline->lock); + /* Fifo and depth-first replacement ensure our deps execute before us */ list_for_each_entry_safe_reverse(dep, p, &dfs, dfs_link) { struct i915_priotree *pt = dep->signaler; @@ -750,16 +723,15 @@ static void execlists_schedule(struct drm_i915_gem_request *request, int prio) if (prio <= pt->priority) continue; - GEM_BUG_ON(RB_EMPTY_NODE(&pt->node)); - pt->priority = prio; - rb_erase(&pt->node, &engine->execlist_queue); - if (insert_request(pt, &engine->execlist_queue)) - engine->execlist_first = &pt->node; + if (!RB_EMPTY_NODE(&pt->node)) { + rb_erase(&pt->node, &engine->execlist_queue); + if (insert_request(pt, &engine->execlist_queue)) + engine->execlist_first = &pt->node; + } } - if (engine) - spin_unlock_irq(&engine->timeline->lock); + spin_unlock_irq(&engine->timeline->lock); /* XXX Do we need to preempt to make room for us and our deps? */ } @@ -776,6 +748,7 @@ static int execlists_context_pin(struct intel_engine_cs *engine, if (ce->pin_count++) return 0; + GEM_BUG_ON(!ce->pin_count); /* no overflow please! */ if (!ce->state) { ret = execlists_context_deferred_alloc(ctx, engine); @@ -784,11 +757,9 @@ static int execlists_context_pin(struct intel_engine_cs *engine, } GEM_BUG_ON(!ce->state); - flags = PIN_GLOBAL; + flags = PIN_GLOBAL | PIN_HIGH; if (ctx->ggtt_offset_bias) flags |= PIN_OFFSET_BIAS | ctx->ggtt_offset_bias; - if (i915_gem_context_is_kernel(ctx)) - flags |= PIN_HIGH; ret = i915_vma_pin(ce->state, 0, GEN8_LR_CONTEXT_ALIGN, flags); if (ret) @@ -847,6 +818,7 @@ static int execlists_request_alloc(struct drm_i915_gem_request *request) { struct intel_engine_cs *engine = request->engine; struct intel_context *ce = &request->ctx->engine[engine->id]; + u32 *cs; int ret; GEM_BUG_ON(!ce->pin_count); @@ -871,9 +843,11 @@ static int execlists_request_alloc(struct drm_i915_gem_request *request) goto err; } - ret = intel_ring_begin(request, 0); - if (ret) + cs = intel_ring_begin(request, 0); + if (IS_ERR(cs)) { + ret = PTR_ERR(cs); goto err_unreserve; + } if (!ce->initialised) { ret = engine->init_context(request); @@ -900,51 +874,6 @@ err: return ret; } -static int intel_logical_ring_workarounds_emit(struct drm_i915_gem_request *req) -{ - int ret, i; - struct intel_ring *ring = req->ring; - struct i915_workarounds *w = &req->i915->workarounds; - - if (w->count == 0) - return 0; - - ret = req->engine->emit_flush(req, EMIT_BARRIER); - if (ret) - return ret; - - ret = intel_ring_begin(req, w->count * 2 + 2); - if (ret) - return ret; - - intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(w->count)); - for (i = 0; i < w->count; i++) { - intel_ring_emit_reg(ring, w->reg[i].addr); - intel_ring_emit(ring, w->reg[i].value); - } - intel_ring_emit(ring, MI_NOOP); - - intel_ring_advance(ring); - - ret = req->engine->emit_flush(req, EMIT_BARRIER); - if (ret) - return ret; - - return 0; -} - -#define wa_ctx_emit(batch, index, cmd) \ - do { \ - int __index = (index)++; \ - if (WARN_ON(__index >= (PAGE_SIZE / sizeof(uint32_t)))) { \ - return -ENOSPC; \ - } \ - batch[__index] = (cmd); \ - } while (0) - -#define wa_ctx_emit_reg(batch, index, reg) \ - wa_ctx_emit((batch), (index), i915_mmio_reg_offset(reg)) - /* * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after * PIPE_CONTROL instruction. 
This is required for the flush to happen correctly @@ -961,56 +890,29 @@ static int intel_logical_ring_workarounds_emit(struct drm_i915_gem_request *req) * This WA is also required for Gen9 so extracting as a function avoids * code duplication. */ -static inline int gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, - uint32_t *batch, - uint32_t index) +static u32 * +gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch) { - uint32_t l3sqc4_flush = (0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES); - - wa_ctx_emit(batch, index, (MI_STORE_REGISTER_MEM_GEN8 | - MI_SRM_LRM_GLOBAL_GTT)); - wa_ctx_emit_reg(batch, index, GEN8_L3SQCREG4); - wa_ctx_emit(batch, index, i915_ggtt_offset(engine->scratch) + 256); - wa_ctx_emit(batch, index, 0); - - wa_ctx_emit(batch, index, MI_LOAD_REGISTER_IMM(1)); - wa_ctx_emit_reg(batch, index, GEN8_L3SQCREG4); - wa_ctx_emit(batch, index, l3sqc4_flush); - - wa_ctx_emit(batch, index, GFX_OP_PIPE_CONTROL(6)); - wa_ctx_emit(batch, index, (PIPE_CONTROL_CS_STALL | - PIPE_CONTROL_DC_FLUSH_ENABLE)); - wa_ctx_emit(batch, index, 0); - wa_ctx_emit(batch, index, 0); - wa_ctx_emit(batch, index, 0); - wa_ctx_emit(batch, index, 0); - - wa_ctx_emit(batch, index, (MI_LOAD_REGISTER_MEM_GEN8 | - MI_SRM_LRM_GLOBAL_GTT)); - wa_ctx_emit_reg(batch, index, GEN8_L3SQCREG4); - wa_ctx_emit(batch, index, i915_ggtt_offset(engine->scratch) + 256); - wa_ctx_emit(batch, index, 0); - - return index; -} + *batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT; + *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4); + *batch++ = i915_ggtt_offset(engine->scratch) + 256; + *batch++ = 0; -static inline uint32_t wa_ctx_start(struct i915_wa_ctx_bb *wa_ctx, - uint32_t offset, - uint32_t start_alignment) -{ - return wa_ctx->offset = ALIGN(offset, start_alignment); -} + *batch++ = MI_LOAD_REGISTER_IMM(1); + *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4); + *batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES; -static inline int wa_ctx_end(struct i915_wa_ctx_bb *wa_ctx, - uint32_t offset, - uint32_t size_alignment) -{ - wa_ctx->size = offset - wa_ctx->offset; + batch = gen8_emit_pipe_control(batch, + PIPE_CONTROL_CS_STALL | + PIPE_CONTROL_DC_FLUSH_ENABLE, + 0); - WARN(wa_ctx->size % size_alignment, - "wa_ctx_bb failed sanity checks: size %d is not aligned to %d\n", - wa_ctx->size, size_alignment); - return 0; + *batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT; + *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4); + *batch++ = i915_ggtt_offset(engine->scratch) + 256; + *batch++ = 0; + + return batch; } /* @@ -1028,42 +930,28 @@ static inline int wa_ctx_end(struct i915_wa_ctx_bb *wa_ctx, * MI_BATCH_BUFFER_END will be added to perctx batch and both of them together * makes a complete batch buffer. 
*/ -static int gen8_init_indirectctx_bb(struct intel_engine_cs *engine, - struct i915_wa_ctx_bb *wa_ctx, - uint32_t *batch, - uint32_t *offset) +static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) { - uint32_t scratch_addr; - uint32_t index = wa_ctx_start(wa_ctx, *offset, CACHELINE_DWORDS); - /* WaDisableCtxRestoreArbitration:bdw,chv */ - wa_ctx_emit(batch, index, MI_ARB_ON_OFF | MI_ARB_DISABLE); + *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */ - if (IS_BROADWELL(engine->i915)) { - int rc = gen8_emit_flush_coherentl3_wa(engine, batch, index); - if (rc < 0) - return rc; - index = rc; - } + if (IS_BROADWELL(engine->i915)) + batch = gen8_emit_flush_coherentl3_wa(engine, batch); /* WaClearSlmSpaceAtContextSwitch:bdw,chv */ /* Actual scratch location is at 128 bytes offset */ - scratch_addr = i915_ggtt_offset(engine->scratch) + 2 * CACHELINE_BYTES; - - wa_ctx_emit(batch, index, GFX_OP_PIPE_CONTROL(6)); - wa_ctx_emit(batch, index, (PIPE_CONTROL_FLUSH_L3 | - PIPE_CONTROL_GLOBAL_GTT_IVB | - PIPE_CONTROL_CS_STALL | - PIPE_CONTROL_QW_WRITE)); - wa_ctx_emit(batch, index, scratch_addr); - wa_ctx_emit(batch, index, 0); - wa_ctx_emit(batch, index, 0); - wa_ctx_emit(batch, index, 0); + batch = gen8_emit_pipe_control(batch, + PIPE_CONTROL_FLUSH_L3 | + PIPE_CONTROL_GLOBAL_GTT_IVB | + PIPE_CONTROL_CS_STALL | + PIPE_CONTROL_QW_WRITE, + i915_ggtt_offset(engine->scratch) + + 2 * CACHELINE_BYTES); /* Pad to end of cacheline */ - while (index % CACHELINE_DWORDS) - wa_ctx_emit(batch, index, MI_NOOP); + while ((unsigned long)batch % CACHELINE_BYTES) + *batch++ = MI_NOOP; /* * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because @@ -1071,7 +959,7 @@ static int gen8_init_indirectctx_bb(struct intel_engine_cs *engine, * in the register CTX_RCS_INDIRECT_CTX */ - return wa_ctx_end(wa_ctx, *offset = index, CACHELINE_DWORDS); + return batch; } /* @@ -1083,65 +971,40 @@ static int gen8_init_indirectctx_bb(struct intel_engine_cs *engine, * This batch is terminated with MI_BATCH_BUFFER_END and so we need not add padding * to align it with cacheline as padding after MI_BATCH_BUFFER_END is redundant. 
*/ -static int gen8_init_perctx_bb(struct intel_engine_cs *engine, - struct i915_wa_ctx_bb *wa_ctx, - uint32_t *batch, - uint32_t *offset) +static u32 *gen8_init_perctx_bb(struct intel_engine_cs *engine, u32 *batch) { - uint32_t index = wa_ctx_start(wa_ctx, *offset, CACHELINE_DWORDS); - /* WaDisableCtxRestoreArbitration:bdw,chv */ - wa_ctx_emit(batch, index, MI_ARB_ON_OFF | MI_ARB_ENABLE); - - wa_ctx_emit(batch, index, MI_BATCH_BUFFER_END); + *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; + *batch++ = MI_BATCH_BUFFER_END; - return wa_ctx_end(wa_ctx, *offset = index, 1); + return batch; } -static int gen9_init_indirectctx_bb(struct intel_engine_cs *engine, - struct i915_wa_ctx_bb *wa_ctx, - uint32_t *batch, - uint32_t *offset) +static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) { - int ret; - struct drm_i915_private *dev_priv = engine->i915; - uint32_t index = wa_ctx_start(wa_ctx, *offset, CACHELINE_DWORDS); - - /* WaDisableCtxRestoreArbitration:bxt */ - if (IS_BXT_REVID(dev_priv, 0, BXT_REVID_A1)) - wa_ctx_emit(batch, index, MI_ARB_ON_OFF | MI_ARB_DISABLE); - - /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt */ - ret = gen8_emit_flush_coherentl3_wa(engine, batch, index); - if (ret < 0) - return ret; - index = ret; + /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */ + batch = gen8_emit_flush_coherentl3_wa(engine, batch); - /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl */ - wa_ctx_emit(batch, index, MI_LOAD_REGISTER_IMM(1)); - wa_ctx_emit_reg(batch, index, COMMON_SLICE_CHICKEN2); - wa_ctx_emit(batch, index, _MASKED_BIT_DISABLE( - GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE)); - wa_ctx_emit(batch, index, MI_NOOP); + /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */ + *batch++ = MI_LOAD_REGISTER_IMM(1); + *batch++ = i915_mmio_reg_offset(COMMON_SLICE_CHICKEN2); + *batch++ = _MASKED_BIT_DISABLE( + GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE); + *batch++ = MI_NOOP; /* WaClearSlmSpaceAtContextSwitch:kbl */ /* Actual scratch location is at 128 bytes offset */ - if (IS_KBL_REVID(dev_priv, 0, KBL_REVID_A0)) { - u32 scratch_addr = - i915_ggtt_offset(engine->scratch) + 2 * CACHELINE_BYTES; - - wa_ctx_emit(batch, index, GFX_OP_PIPE_CONTROL(6)); - wa_ctx_emit(batch, index, (PIPE_CONTROL_FLUSH_L3 | - PIPE_CONTROL_GLOBAL_GTT_IVB | - PIPE_CONTROL_CS_STALL | - PIPE_CONTROL_QW_WRITE)); - wa_ctx_emit(batch, index, scratch_addr); - wa_ctx_emit(batch, index, 0); - wa_ctx_emit(batch, index, 0); - wa_ctx_emit(batch, index, 0); + if (IS_KBL_REVID(engine->i915, 0, KBL_REVID_A0)) { + batch = gen8_emit_pipe_control(batch, + PIPE_CONTROL_FLUSH_L3 | + PIPE_CONTROL_GLOBAL_GTT_IVB | + PIPE_CONTROL_CS_STALL | + PIPE_CONTROL_QW_WRITE, + i915_ggtt_offset(engine->scratch) + + 2 * CACHELINE_BYTES); } - /* WaMediaPoolStateCmdInWABB:bxt */ + /* WaMediaPoolStateCmdInWABB:bxt,glk */ if (HAS_POOLED_EU(engine->i915)) { /* * EU pool configuration is setup along with golden context @@ -1156,73 +1019,37 @@ static int gen9_init_indirectctx_bb(struct intel_engine_cs *engine, * possible configurations, to avoid duplication they are * not shown here again. 
*/ - u32 eu_pool_config = 0x00777000; - wa_ctx_emit(batch, index, GEN9_MEDIA_POOL_STATE); - wa_ctx_emit(batch, index, GEN9_MEDIA_POOL_ENABLE); - wa_ctx_emit(batch, index, eu_pool_config); - wa_ctx_emit(batch, index, 0); - wa_ctx_emit(batch, index, 0); - wa_ctx_emit(batch, index, 0); + *batch++ = GEN9_MEDIA_POOL_STATE; + *batch++ = GEN9_MEDIA_POOL_ENABLE; + *batch++ = 0x00777000; + *batch++ = 0; + *batch++ = 0; + *batch++ = 0; } /* Pad to end of cacheline */ - while (index % CACHELINE_DWORDS) - wa_ctx_emit(batch, index, MI_NOOP); + while ((unsigned long)batch % CACHELINE_BYTES) + *batch++ = MI_NOOP; - return wa_ctx_end(wa_ctx, *offset = index, CACHELINE_DWORDS); + return batch; } -static int gen9_init_perctx_bb(struct intel_engine_cs *engine, - struct i915_wa_ctx_bb *wa_ctx, - uint32_t *batch, - uint32_t *offset) +static u32 *gen9_init_perctx_bb(struct intel_engine_cs *engine, u32 *batch) { - uint32_t index = wa_ctx_start(wa_ctx, *offset, CACHELINE_DWORDS); - - /* WaSetDisablePixMaskCammingAndRhwoInCommonSliceChicken:bxt */ - if (IS_BXT_REVID(engine->i915, 0, BXT_REVID_A1)) { - wa_ctx_emit(batch, index, MI_LOAD_REGISTER_IMM(1)); - wa_ctx_emit_reg(batch, index, GEN9_SLICE_COMMON_ECO_CHICKEN0); - wa_ctx_emit(batch, index, - _MASKED_BIT_ENABLE(DISABLE_PIXEL_MASK_CAMMING)); - wa_ctx_emit(batch, index, MI_NOOP); - } - - /* WaClearTdlStateAckDirtyBits:bxt */ - if (IS_BXT_REVID(engine->i915, 0, BXT_REVID_B0)) { - wa_ctx_emit(batch, index, MI_LOAD_REGISTER_IMM(4)); - - wa_ctx_emit_reg(batch, index, GEN8_STATE_ACK); - wa_ctx_emit(batch, index, _MASKED_BIT_DISABLE(GEN9_SUBSLICE_TDL_ACK_BITS)); - - wa_ctx_emit_reg(batch, index, GEN9_STATE_ACK_SLICE1); - wa_ctx_emit(batch, index, _MASKED_BIT_DISABLE(GEN9_SUBSLICE_TDL_ACK_BITS)); - - wa_ctx_emit_reg(batch, index, GEN9_STATE_ACK_SLICE2); - wa_ctx_emit(batch, index, _MASKED_BIT_DISABLE(GEN9_SUBSLICE_TDL_ACK_BITS)); + *batch++ = MI_BATCH_BUFFER_END; - wa_ctx_emit_reg(batch, index, GEN7_ROW_CHICKEN2); - /* dummy write to CS, mask bits are 0 to ensure the register is not modified */ - wa_ctx_emit(batch, index, 0x0); - wa_ctx_emit(batch, index, MI_NOOP); - } - - /* WaDisableCtxRestoreArbitration:bxt */ - if (IS_BXT_REVID(engine->i915, 0, BXT_REVID_A1)) - wa_ctx_emit(batch, index, MI_ARB_ON_OFF | MI_ARB_ENABLE); - - wa_ctx_emit(batch, index, MI_BATCH_BUFFER_END); - - return wa_ctx_end(wa_ctx, *offset = index, 1); + return batch; } -static int lrc_setup_wa_ctx_obj(struct intel_engine_cs *engine, u32 size) +#define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE) + +static int lrc_setup_wa_ctx(struct intel_engine_cs *engine) { struct drm_i915_gem_object *obj; struct i915_vma *vma; int err; - obj = i915_gem_object_create(engine->i915, PAGE_ALIGN(size)); + obj = i915_gem_object_create(engine->i915, CTX_WA_BB_OBJ_SIZE); if (IS_ERR(obj)) return PTR_ERR(obj); @@ -1244,82 +1071,79 @@ err: return err; } -static void lrc_destroy_wa_ctx_obj(struct intel_engine_cs *engine) +static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine) { i915_vma_unpin_and_release(&engine->wa_ctx.vma); } +typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch); + static int intel_init_workaround_bb(struct intel_engine_cs *engine) { struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx; - uint32_t *batch; - uint32_t offset; + struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx, + &wa_ctx->per_ctx }; + wa_bb_func_t wa_bb_fn[2]; struct page *page; + void *batch, *batch_ptr; + unsigned int i; int ret; - WARN_ON(engine->id != RCS); + if (WARN_ON(engine->id != RCS || !engine->scratch)) + 
return -EINVAL; - /* update this when WA for higher Gen are added */ - if (INTEL_GEN(engine->i915) > 9) { - DRM_ERROR("WA batch buffer is not initialized for Gen%d\n", - INTEL_GEN(engine->i915)); + switch (INTEL_GEN(engine->i915)) { + case 9: + wa_bb_fn[0] = gen9_init_indirectctx_bb; + wa_bb_fn[1] = gen9_init_perctx_bb; + break; + case 8: + wa_bb_fn[0] = gen8_init_indirectctx_bb; + wa_bb_fn[1] = gen8_init_perctx_bb; + break; + default: + MISSING_CASE(INTEL_GEN(engine->i915)); return 0; } - /* some WA perform writes to scratch page, ensure it is valid */ - if (!engine->scratch) { - DRM_ERROR("scratch page not allocated for %s\n", engine->name); - return -EINVAL; - } - - ret = lrc_setup_wa_ctx_obj(engine, PAGE_SIZE); + ret = lrc_setup_wa_ctx(engine); if (ret) { DRM_DEBUG_DRIVER("Failed to setup context WA page: %d\n", ret); return ret; } page = i915_gem_object_get_dirty_page(wa_ctx->vma->obj, 0); - batch = kmap_atomic(page); - offset = 0; - - if (IS_GEN8(engine->i915)) { - ret = gen8_init_indirectctx_bb(engine, - &wa_ctx->indirect_ctx, - batch, - &offset); - if (ret) - goto out; - - ret = gen8_init_perctx_bb(engine, - &wa_ctx->per_ctx, - batch, - &offset); - if (ret) - goto out; - } else if (IS_GEN9(engine->i915)) { - ret = gen9_init_indirectctx_bb(engine, - &wa_ctx->indirect_ctx, - batch, - &offset); - if (ret) - goto out; + batch = batch_ptr = kmap_atomic(page); - ret = gen9_init_perctx_bb(engine, - &wa_ctx->per_ctx, - batch, - &offset); - if (ret) - goto out; + /* + * Emit the two workaround batch buffers, recording the offset from the + * start of the workaround batch buffer object for each and their + * respective sizes. + */ + for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) { + wa_bb[i]->offset = batch_ptr - batch; + if (WARN_ON(!IS_ALIGNED(wa_bb[i]->offset, CACHELINE_BYTES))) { + ret = -EINVAL; + break; + } + batch_ptr = wa_bb_fn[i](engine, batch_ptr); + wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset); } -out: + BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE); + kunmap_atomic(batch); if (ret) - lrc_destroy_wa_ctx_obj(engine); + lrc_destroy_wa_ctx(engine); return ret; } +static u32 port_seqno(struct execlist_port *port) +{ + return port->request ? 
port->request->global_seqno : 0; +} + static int gen8_init_common_ring(struct intel_engine_cs *engine) { struct drm_i915_private *dev_priv = engine->i915; @@ -1334,7 +1158,6 @@ static int gen8_init_common_ring(struct intel_engine_cs *engine) I915_WRITE(RING_HWSTAM(engine->mmio_base), 0xffffffff); I915_WRITE(RING_MODE_GEN7(engine), - _MASKED_BIT_DISABLE(GFX_REPLAY_MODE) | _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE)); I915_WRITE(RING_HWS_PGA(engine->mmio_base), engine->status_page.ggtt_offset); @@ -1343,7 +1166,12 @@ static int gen8_init_common_ring(struct intel_engine_cs *engine) DRM_DEBUG_DRIVER("Execlists enabled for %s\n", engine->name); /* After a GPU reset, we may have requests to replay */ - if (!execlists_elsp_idle(engine)) { + clear_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted); + if (!i915.enable_guc_submission && !execlists_elsp_idle(engine)) { + DRM_DEBUG_DRIVER("Restarting %s from requests [0x%x, 0x%x]\n", + engine->name, + port_seqno(&engine->execlist_port[0]), + port_seqno(&engine->execlist_port[1])); engine->execlist_port[0].count = 0; engine->execlist_port[1].count = 0; execlists_submit_ports(engine); @@ -1388,7 +1216,6 @@ static int gen9_init_render_ring(struct intel_engine_cs *engine) static void reset_common_ring(struct intel_engine_cs *engine, struct drm_i915_gem_request *request) { - struct drm_i915_private *dev_priv = engine->i915; struct execlist_port *port = engine->execlist_port; struct intel_context *ce; @@ -1422,14 +1249,9 @@ static void reset_common_ring(struct intel_engine_cs *engine, ce->lrc_reg_state[CTX_RING_HEAD+1] = request->postfix; request->ring->head = request->postfix; - request->ring->last_retired_head = -1; intel_ring_update_space(request->ring); - if (i915.enable_guc_submission) - return; - /* Catch up with any missed context-switch interrupts */ - I915_WRITE(RING_CONTEXT_STATUS_PTR(engine), _MASKED_FIELD(0xffff, 0)); if (request->ctx != port[0].request->ctx) { i915_gem_request_put(port[0].request); port[0] = port[1]; @@ -1439,43 +1261,45 @@ static void reset_common_ring(struct intel_engine_cs *engine, GEM_BUG_ON(request->ctx != port[0].request->ctx); /* Reset WaIdleLiteRestore:bdw,skl as well */ - request->tail = request->wa_tail - WA_TAIL_DWORDS * sizeof(u32); + request->tail = + intel_ring_wrap(request->ring, + request->wa_tail - WA_TAIL_DWORDS*sizeof(u32)); + assert_ring_tail_valid(request->ring, request->tail); } static int intel_logical_ring_emit_pdps(struct drm_i915_gem_request *req) { struct i915_hw_ppgtt *ppgtt = req->ctx->ppgtt; - struct intel_ring *ring = req->ring; struct intel_engine_cs *engine = req->engine; - const int num_lri_cmds = GEN8_LEGACY_PDPES * 2; - int i, ret; + const int num_lri_cmds = GEN8_3LVL_PDPES * 2; + u32 *cs; + int i; - ret = intel_ring_begin(req, num_lri_cmds * 2 + 2); - if (ret) - return ret; + cs = intel_ring_begin(req, num_lri_cmds * 2 + 2); + if (IS_ERR(cs)) + return PTR_ERR(cs); - intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(num_lri_cmds)); - for (i = GEN8_LEGACY_PDPES - 1; i >= 0; i--) { + *cs++ = MI_LOAD_REGISTER_IMM(num_lri_cmds); + for (i = GEN8_3LVL_PDPES - 1; i >= 0; i--) { const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i); - intel_ring_emit_reg(ring, GEN8_RING_PDP_UDW(engine, i)); - intel_ring_emit(ring, upper_32_bits(pd_daddr)); - intel_ring_emit_reg(ring, GEN8_RING_PDP_LDW(engine, i)); - intel_ring_emit(ring, lower_32_bits(pd_daddr)); + *cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(engine, i)); + *cs++ = upper_32_bits(pd_daddr); + *cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(engine, i)); + 
*cs++ = lower_32_bits(pd_daddr); } - intel_ring_emit(ring, MI_NOOP); - intel_ring_advance(ring); + *cs++ = MI_NOOP; + intel_ring_advance(req, cs); return 0; } static int gen8_emit_bb_start(struct drm_i915_gem_request *req, u64 offset, u32 len, - unsigned int dispatch_flags) + const unsigned int flags) { - struct intel_ring *ring = req->ring; - bool ppgtt = !(dispatch_flags & I915_DISPATCH_SECURE); + u32 *cs; int ret; /* Don't rely in hw updating PDPs, specially in lite-restore. @@ -1485,30 +1309,28 @@ static int gen8_emit_bb_start(struct drm_i915_gem_request *req, * not idle). PML4 is allocated during ppgtt init so this is * not needed in 48-bit.*/ if (req->ctx->ppgtt && - (intel_engine_flag(req->engine) & req->ctx->ppgtt->pd_dirty_rings)) { - if (!USES_FULL_48BIT_PPGTT(req->i915) && - !intel_vgpu_active(req->i915)) { - ret = intel_logical_ring_emit_pdps(req); - if (ret) - return ret; - } + (intel_engine_flag(req->engine) & req->ctx->ppgtt->pd_dirty_rings) && + !i915_vm_is_48bit(&req->ctx->ppgtt->base) && + !intel_vgpu_active(req->i915)) { + ret = intel_logical_ring_emit_pdps(req); + if (ret) + return ret; req->ctx->ppgtt->pd_dirty_rings &= ~intel_engine_flag(req->engine); } - ret = intel_ring_begin(req, 4); - if (ret) - return ret; + cs = intel_ring_begin(req, 4); + if (IS_ERR(cs)) + return PTR_ERR(cs); /* FIXME(BDW): Address space and security selectors. */ - intel_ring_emit(ring, MI_BATCH_BUFFER_START_GEN8 | - (ppgtt<<8) | - (dispatch_flags & I915_DISPATCH_RS ? - MI_BATCH_RESOURCE_STREAMER : 0)); - intel_ring_emit(ring, lower_32_bits(offset)); - intel_ring_emit(ring, upper_32_bits(offset)); - intel_ring_emit(ring, MI_NOOP); - intel_ring_advance(ring); + *cs++ = MI_BATCH_BUFFER_START_GEN8 | + (flags & I915_DISPATCH_SECURE ? 0 : BIT(8)) | + (flags & I915_DISPATCH_RS ? 
MI_BATCH_RESOURCE_STREAMER : 0); + *cs++ = lower_32_bits(offset); + *cs++ = upper_32_bits(offset); + *cs++ = MI_NOOP; + intel_ring_advance(req, cs); return 0; } @@ -1529,13 +1351,11 @@ static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine) static int gen8_emit_flush(struct drm_i915_gem_request *request, u32 mode) { - struct intel_ring *ring = request->ring; - u32 cmd; - int ret; + u32 cmd, *cs; - ret = intel_ring_begin(request, 4); - if (ret) - return ret; + cs = intel_ring_begin(request, 4); + if (IS_ERR(cs)) + return PTR_ERR(cs); cmd = MI_FLUSH_DW + 1; @@ -1552,13 +1372,11 @@ static int gen8_emit_flush(struct drm_i915_gem_request *request, u32 mode) cmd |= MI_INVALIDATE_BSD; } - intel_ring_emit(ring, cmd); - intel_ring_emit(ring, - I915_GEM_HWS_SCRATCH_ADDR | - MI_FLUSH_DW_USE_GTT); - intel_ring_emit(ring, 0); /* upper addr */ - intel_ring_emit(ring, 0); /* value */ - intel_ring_advance(ring); + *cs++ = cmd; + *cs++ = I915_GEM_HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT; + *cs++ = 0; /* upper addr */ + *cs++ = 0; /* value */ + intel_ring_advance(request, cs); return 0; } @@ -1566,13 +1384,11 @@ static int gen8_emit_flush(struct drm_i915_gem_request *request, u32 mode) static int gen8_emit_flush_render(struct drm_i915_gem_request *request, u32 mode) { - struct intel_ring *ring = request->ring; struct intel_engine_cs *engine = request->engine; u32 scratch_addr = i915_ggtt_offset(engine->scratch) + 2 * CACHELINE_BYTES; bool vf_flush_wa = false, dc_flush_wa = false; - u32 flags = 0; - int ret; + u32 *cs, flags = 0; int len; flags |= PIPE_CONTROL_CS_STALL; @@ -1614,62 +1430,25 @@ static int gen8_emit_flush_render(struct drm_i915_gem_request *request, if (dc_flush_wa) len += 12; - ret = intel_ring_begin(request, len); - if (ret) - return ret; + cs = intel_ring_begin(request, len); + if (IS_ERR(cs)) + return PTR_ERR(cs); - if (vf_flush_wa) { - intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(6)); - intel_ring_emit(ring, 0); - intel_ring_emit(ring, 0); - intel_ring_emit(ring, 0); - intel_ring_emit(ring, 0); - intel_ring_emit(ring, 0); - } + if (vf_flush_wa) + cs = gen8_emit_pipe_control(cs, 0, 0); - if (dc_flush_wa) { - intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(6)); - intel_ring_emit(ring, PIPE_CONTROL_DC_FLUSH_ENABLE); - intel_ring_emit(ring, 0); - intel_ring_emit(ring, 0); - intel_ring_emit(ring, 0); - intel_ring_emit(ring, 0); - } + if (dc_flush_wa) + cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE, + 0); - intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(6)); - intel_ring_emit(ring, flags); - intel_ring_emit(ring, scratch_addr); - intel_ring_emit(ring, 0); - intel_ring_emit(ring, 0); - intel_ring_emit(ring, 0); - - if (dc_flush_wa) { - intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(6)); - intel_ring_emit(ring, PIPE_CONTROL_CS_STALL); - intel_ring_emit(ring, 0); - intel_ring_emit(ring, 0); - intel_ring_emit(ring, 0); - intel_ring_emit(ring, 0); - } + cs = gen8_emit_pipe_control(cs, flags, scratch_addr); - intel_ring_advance(ring); + if (dc_flush_wa) + cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0); - return 0; -} + intel_ring_advance(request, cs); -static void bxt_a_seqno_barrier(struct intel_engine_cs *engine) -{ - /* - * On BXT A steppings there is a HW coherency issue whereby the - * MI_STORE_DATA_IMM storing the completed request's seqno - * occasionally doesn't invalidate the CPU cache. Work around this by - * clflushing the corresponding cacheline whenever the caller wants - * the coherency to be guaranteed. 
Note that this cacheline is known - * to be clean at this point, since we only write it in - * bxt_a_set_seqno(), where we also do a clflush after the write. So - * this clflush in practice becomes an invalidate operation. - */ - intel_flush_status_page(engine, I915_GEM_HWS_INDEX); + return 0; } /* @@ -1677,34 +1456,34 @@ static void bxt_a_seqno_barrier(struct intel_engine_cs *engine) * used as a workaround for not being allowed to do lite * restore with HEAD==TAIL (WaIdleLiteRestore). */ -static void gen8_emit_wa_tail(struct drm_i915_gem_request *request, u32 *out) +static void gen8_emit_wa_tail(struct drm_i915_gem_request *request, u32 *cs) { - *out++ = MI_NOOP; - *out++ = MI_NOOP; - request->wa_tail = intel_ring_offset(request->ring, out); + *cs++ = MI_NOOP; + *cs++ = MI_NOOP; + request->wa_tail = intel_ring_offset(request, cs); } -static void gen8_emit_breadcrumb(struct drm_i915_gem_request *request, - u32 *out) +static void gen8_emit_breadcrumb(struct drm_i915_gem_request *request, u32 *cs) { /* w/a: bit 5 needs to be zero for MI_FLUSH_DW address. */ BUILD_BUG_ON(I915_GEM_HWS_INDEX_ADDR & (1 << 5)); - *out++ = (MI_FLUSH_DW + 1) | MI_FLUSH_DW_OP_STOREDW; - *out++ = intel_hws_seqno_address(request->engine) | MI_FLUSH_DW_USE_GTT; - *out++ = 0; - *out++ = request->global_seqno; - *out++ = MI_USER_INTERRUPT; - *out++ = MI_NOOP; - request->tail = intel_ring_offset(request->ring, out); + *cs++ = (MI_FLUSH_DW + 1) | MI_FLUSH_DW_OP_STOREDW; + *cs++ = intel_hws_seqno_address(request->engine) | MI_FLUSH_DW_USE_GTT; + *cs++ = 0; + *cs++ = request->global_seqno; + *cs++ = MI_USER_INTERRUPT; + *cs++ = MI_NOOP; + request->tail = intel_ring_offset(request, cs); + assert_ring_tail_valid(request->ring, request->tail); - gen8_emit_wa_tail(request, out); + gen8_emit_wa_tail(request, cs); } static const int gen8_emit_breadcrumb_sz = 6 + WA_TAIL_DWORDS; static void gen8_emit_breadcrumb_render(struct drm_i915_gem_request *request, - u32 *out) + u32 *cs) { /* We're using qword write, seqno should be aligned to 8 bytes. */ BUILD_BUG_ON(I915_GEM_HWS_INDEX & 1); @@ -1713,20 +1492,20 @@ static void gen8_emit_breadcrumb_render(struct drm_i915_gem_request *request, * need a prior CS_STALL, which is emitted by the flush * following the batch. */ - *out++ = GFX_OP_PIPE_CONTROL(6); - *out++ = (PIPE_CONTROL_GLOBAL_GTT_IVB | - PIPE_CONTROL_CS_STALL | - PIPE_CONTROL_QW_WRITE); - *out++ = intel_hws_seqno_address(request->engine); - *out++ = 0; - *out++ = request->global_seqno; + *cs++ = GFX_OP_PIPE_CONTROL(6); + *cs++ = PIPE_CONTROL_GLOBAL_GTT_IVB | PIPE_CONTROL_CS_STALL | + PIPE_CONTROL_QW_WRITE; + *cs++ = intel_hws_seqno_address(request->engine); + *cs++ = 0; + *cs++ = request->global_seqno; /* We're thrashing one dword of HWS. 
*/ - *out++ = 0; - *out++ = MI_USER_INTERRUPT; - *out++ = MI_NOOP; - request->tail = intel_ring_offset(request->ring, out); + *cs++ = 0; + *cs++ = MI_USER_INTERRUPT; + *cs++ = MI_NOOP; + request->tail = intel_ring_offset(request, cs); + assert_ring_tail_valid(request->ring, request->tail); - gen8_emit_wa_tail(request, out); + gen8_emit_wa_tail(request, cs); } static const int gen8_emit_breadcrumb_render_sz = 8 + WA_TAIL_DWORDS; @@ -1735,7 +1514,7 @@ static int gen8_init_rcs_context(struct drm_i915_gem_request *req) { int ret; - ret = intel_logical_ring_workarounds_emit(req); + ret = intel_ring_workarounds_emit(req); if (ret) return ret; @@ -1781,21 +1560,17 @@ void intel_logical_ring_cleanup(struct intel_engine_cs *engine) intel_engine_cleanup_common(engine); - lrc_destroy_wa_ctx_obj(engine); + lrc_destroy_wa_ctx(engine); engine->i915 = NULL; dev_priv->engine[engine->id] = NULL; kfree(engine); } -void intel_execlists_enable_submission(struct drm_i915_private *dev_priv) +static void execlists_set_default_submission(struct intel_engine_cs *engine) { - struct intel_engine_cs *engine; - enum intel_engine_id id; - - for_each_engine(engine, dev_priv, id) { - engine->submit_request = execlists_submit_request; - engine->schedule = execlists_schedule; - } + engine->submit_request = execlists_submit_request; + engine->schedule = execlists_schedule; + engine->irq_tasklet.func = intel_lrc_irq_handler; } static void @@ -1813,14 +1588,12 @@ logical_ring_default_vfuncs(struct intel_engine_cs *engine) engine->emit_flush = gen8_emit_flush; engine->emit_breadcrumb = gen8_emit_breadcrumb; engine->emit_breadcrumb_sz = gen8_emit_breadcrumb_sz; - engine->submit_request = execlists_submit_request; - engine->schedule = execlists_schedule; + + engine->set_default_submission = execlists_set_default_submission; engine->irq_enable = gen8_logical_ring_enable_irq; engine->irq_disable = gen8_logical_ring_disable_irq; engine->emit_bb_start = gen8_emit_bb_start; - if (IS_BXT_REVID(engine->i915, 0, BXT_REVID_A1)) - engine->irq_seqno_barrier = bxt_a_seqno_barrier; } static inline void @@ -1877,7 +1650,6 @@ logical_ring_setup(struct intel_engine_cs *engine) tasklet_init(&engine->irq_tasklet, intel_lrc_irq_handler, (unsigned long)engine); - logical_ring_init_platform_invariants(engine); logical_ring_default_vfuncs(engine); logical_ring_default_irqs(engine); } @@ -2015,105 +1787,89 @@ static u32 intel_lr_indirect_ctx_offset(struct intel_engine_cs *engine) return indirect_ctx_offset; } -static void execlists_init_reg_state(u32 *reg_state, +static void execlists_init_reg_state(u32 *regs, struct i915_gem_context *ctx, struct intel_engine_cs *engine, struct intel_ring *ring) { struct drm_i915_private *dev_priv = engine->i915; struct i915_hw_ppgtt *ppgtt = ctx->ppgtt ?: dev_priv->mm.aliasing_ppgtt; + u32 base = engine->mmio_base; + bool rcs = engine->id == RCS; + + /* A context is actually a big batch buffer with several + * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The + * values we are setting here are only for the first context restore: + * on a subsequent save, the GPU will recreate this batchbuffer with new + * values (including all the missing MI_LOAD_REGISTER_IMM commands that + * we are not initializing here). + */ + regs[CTX_LRI_HEADER_0] = MI_LOAD_REGISTER_IMM(rcs ? 14 : 11) | + MI_LRI_FORCE_POSTED; + + CTX_REG(regs, CTX_CONTEXT_CONTROL, RING_CONTEXT_CONTROL(engine), + _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH | + CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT | + (HAS_RESOURCE_STREAMER(dev_priv) ? 
+ CTX_CTRL_RS_CTX_ENABLE : 0))); + CTX_REG(regs, CTX_RING_HEAD, RING_HEAD(base), 0); + CTX_REG(regs, CTX_RING_TAIL, RING_TAIL(base), 0); + CTX_REG(regs, CTX_RING_BUFFER_START, RING_START(base), 0); + CTX_REG(regs, CTX_RING_BUFFER_CONTROL, RING_CTL(base), + RING_CTL_SIZE(ring->size) | RING_VALID); + CTX_REG(regs, CTX_BB_HEAD_U, RING_BBADDR_UDW(base), 0); + CTX_REG(regs, CTX_BB_HEAD_L, RING_BBADDR(base), 0); + CTX_REG(regs, CTX_BB_STATE, RING_BBSTATE(base), RING_BB_PPGTT); + CTX_REG(regs, CTX_SECOND_BB_HEAD_U, RING_SBBADDR_UDW(base), 0); + CTX_REG(regs, CTX_SECOND_BB_HEAD_L, RING_SBBADDR(base), 0); + CTX_REG(regs, CTX_SECOND_BB_STATE, RING_SBBSTATE(base), 0); + if (rcs) { + CTX_REG(regs, CTX_BB_PER_CTX_PTR, RING_BB_PER_CTX_PTR(base), 0); + CTX_REG(regs, CTX_RCS_INDIRECT_CTX, RING_INDIRECT_CTX(base), 0); + CTX_REG(regs, CTX_RCS_INDIRECT_CTX_OFFSET, + RING_INDIRECT_CTX_OFFSET(base), 0); - /* A context is actually a big batch buffer with several MI_LOAD_REGISTER_IMM - * commands followed by (reg, value) pairs. The values we are setting here are - * only for the first context restore: on a subsequent save, the GPU will - * recreate this batchbuffer with new values (including all the missing - * MI_LOAD_REGISTER_IMM commands that we are not initializing here). */ - reg_state[CTX_LRI_HEADER_0] = - MI_LOAD_REGISTER_IMM(engine->id == RCS ? 14 : 11) | MI_LRI_FORCE_POSTED; - ASSIGN_CTX_REG(reg_state, CTX_CONTEXT_CONTROL, - RING_CONTEXT_CONTROL(engine), - _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH | - CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT | - (HAS_RESOURCE_STREAMER(dev_priv) ? - CTX_CTRL_RS_CTX_ENABLE : 0))); - ASSIGN_CTX_REG(reg_state, CTX_RING_HEAD, RING_HEAD(engine->mmio_base), - 0); - ASSIGN_CTX_REG(reg_state, CTX_RING_TAIL, RING_TAIL(engine->mmio_base), - 0); - ASSIGN_CTX_REG(reg_state, CTX_RING_BUFFER_START, - RING_START(engine->mmio_base), 0); - ASSIGN_CTX_REG(reg_state, CTX_RING_BUFFER_CONTROL, - RING_CTL(engine->mmio_base), - RING_CTL_SIZE(ring->size) | RING_VALID); - ASSIGN_CTX_REG(reg_state, CTX_BB_HEAD_U, - RING_BBADDR_UDW(engine->mmio_base), 0); - ASSIGN_CTX_REG(reg_state, CTX_BB_HEAD_L, - RING_BBADDR(engine->mmio_base), 0); - ASSIGN_CTX_REG(reg_state, CTX_BB_STATE, - RING_BBSTATE(engine->mmio_base), - RING_BB_PPGTT); - ASSIGN_CTX_REG(reg_state, CTX_SECOND_BB_HEAD_U, - RING_SBBADDR_UDW(engine->mmio_base), 0); - ASSIGN_CTX_REG(reg_state, CTX_SECOND_BB_HEAD_L, - RING_SBBADDR(engine->mmio_base), 0); - ASSIGN_CTX_REG(reg_state, CTX_SECOND_BB_STATE, - RING_SBBSTATE(engine->mmio_base), 0); - if (engine->id == RCS) { - ASSIGN_CTX_REG(reg_state, CTX_BB_PER_CTX_PTR, - RING_BB_PER_CTX_PTR(engine->mmio_base), 0); - ASSIGN_CTX_REG(reg_state, CTX_RCS_INDIRECT_CTX, - RING_INDIRECT_CTX(engine->mmio_base), 0); - ASSIGN_CTX_REG(reg_state, CTX_RCS_INDIRECT_CTX_OFFSET, - RING_INDIRECT_CTX_OFFSET(engine->mmio_base), 0); if (engine->wa_ctx.vma) { struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx; u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma); - reg_state[CTX_RCS_INDIRECT_CTX+1] = - (ggtt_offset + wa_ctx->indirect_ctx.offset * sizeof(uint32_t)) | - (wa_ctx->indirect_ctx.size / CACHELINE_DWORDS); + regs[CTX_RCS_INDIRECT_CTX + 1] = + (ggtt_offset + wa_ctx->indirect_ctx.offset) | + (wa_ctx->indirect_ctx.size / CACHELINE_BYTES); - reg_state[CTX_RCS_INDIRECT_CTX_OFFSET+1] = + regs[CTX_RCS_INDIRECT_CTX_OFFSET + 1] = intel_lr_indirect_ctx_offset(engine) << 6; - reg_state[CTX_BB_PER_CTX_PTR+1] = - (ggtt_offset + wa_ctx->per_ctx.offset * sizeof(uint32_t)) | - 0x01; + regs[CTX_BB_PER_CTX_PTR + 1] = + 
(ggtt_offset + wa_ctx->per_ctx.offset) | 0x01; } } - reg_state[CTX_LRI_HEADER_1] = MI_LOAD_REGISTER_IMM(9) | MI_LRI_FORCE_POSTED; - ASSIGN_CTX_REG(reg_state, CTX_CTX_TIMESTAMP, - RING_CTX_TIMESTAMP(engine->mmio_base), 0); + + regs[CTX_LRI_HEADER_1] = MI_LOAD_REGISTER_IMM(9) | MI_LRI_FORCE_POSTED; + + CTX_REG(regs, CTX_CTX_TIMESTAMP, RING_CTX_TIMESTAMP(base), 0); /* PDP values well be assigned later if needed */ - ASSIGN_CTX_REG(reg_state, CTX_PDP3_UDW, GEN8_RING_PDP_UDW(engine, 3), - 0); - ASSIGN_CTX_REG(reg_state, CTX_PDP3_LDW, GEN8_RING_PDP_LDW(engine, 3), - 0); - ASSIGN_CTX_REG(reg_state, CTX_PDP2_UDW, GEN8_RING_PDP_UDW(engine, 2), - 0); - ASSIGN_CTX_REG(reg_state, CTX_PDP2_LDW, GEN8_RING_PDP_LDW(engine, 2), - 0); - ASSIGN_CTX_REG(reg_state, CTX_PDP1_UDW, GEN8_RING_PDP_UDW(engine, 1), - 0); - ASSIGN_CTX_REG(reg_state, CTX_PDP1_LDW, GEN8_RING_PDP_LDW(engine, 1), - 0); - ASSIGN_CTX_REG(reg_state, CTX_PDP0_UDW, GEN8_RING_PDP_UDW(engine, 0), - 0); - ASSIGN_CTX_REG(reg_state, CTX_PDP0_LDW, GEN8_RING_PDP_LDW(engine, 0), - 0); - - if (ppgtt && USES_FULL_48BIT_PPGTT(ppgtt->base.dev)) { + CTX_REG(regs, CTX_PDP3_UDW, GEN8_RING_PDP_UDW(engine, 3), 0); + CTX_REG(regs, CTX_PDP3_LDW, GEN8_RING_PDP_LDW(engine, 3), 0); + CTX_REG(regs, CTX_PDP2_UDW, GEN8_RING_PDP_UDW(engine, 2), 0); + CTX_REG(regs, CTX_PDP2_LDW, GEN8_RING_PDP_LDW(engine, 2), 0); + CTX_REG(regs, CTX_PDP1_UDW, GEN8_RING_PDP_UDW(engine, 1), 0); + CTX_REG(regs, CTX_PDP1_LDW, GEN8_RING_PDP_LDW(engine, 1), 0); + CTX_REG(regs, CTX_PDP0_UDW, GEN8_RING_PDP_UDW(engine, 0), 0); + CTX_REG(regs, CTX_PDP0_LDW, GEN8_RING_PDP_LDW(engine, 0), 0); + + if (ppgtt && i915_vm_is_48bit(&ppgtt->base)) { /* 64b PPGTT (48bit canonical) * PDP0_DESCRIPTOR contains the base address to PML4 and * other PDP Descriptors are ignored. */ - ASSIGN_CTX_PML4(ppgtt, reg_state); + ASSIGN_CTX_PML4(ppgtt, regs); } - if (engine->id == RCS) { - reg_state[CTX_LRI_HEADER_2] = MI_LOAD_REGISTER_IMM(1); - ASSIGN_CTX_REG(reg_state, CTX_R_PWR_CLK_STATE, GEN8_R_PWR_CLK_STATE, - make_rpcs(dev_priv)); + if (rcs) { + regs[CTX_LRI_HEADER_2] = MI_LOAD_REGISTER_IMM(1); + CTX_REG(regs, CTX_R_PWR_CLK_STATE, GEN8_R_PWR_CLK_STATE, + make_rpcs(dev_priv)); } } @@ -2281,7 +2037,6 @@ void intel_lr_context_resume(struct drm_i915_private *dev_priv) i915_gem_object_unpin_map(ce->state->obj); ce->ring->head = ce->ring->tail = 0; - ce->ring->last_retired_head = -1; intel_ring_update_space(ce->ring); } } |
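
As an aid to reading the intel_lr_context_descriptor_update() hunk near the top of the diff, its doc comment gives the descriptor layout: flags in bits 0-11 (cached in ctx->desc_template), the LRCA in bits 12-31, and the context ID in bits 32-52. A standalone sketch of how those fields pack into the 64-bit descriptor (illustrative helper only; the shift/width values match the GEN8_CTX_ID_* definitions the hunk references):

#include <stdint.h>

#define GEN8_CTX_ID_SHIFT 32	/* referenced by the BUILD_BUG_ON() in the hunk above */
#define GEN8_CTX_ID_WIDTH 21

/* Illustrative helper: pack a GEN8+ execlists context descriptor. */
static uint64_t pack_lrc_descriptor(uint32_t flags,	/* bits 0-11: GEN8_CTX_* (ctx->desc_template) */
				    uint32_t lrca,	/* bits 12-31: GGTT address of the context image */
				    uint32_t ctx_id)	/* bits 32-52: globally unique tag (ctx->hw_id) */
{
	uint64_t desc = 0;

	desc |= flags & 0xfffu;			/* low 12 bits of flags */
	desc |= lrca & 0xfffff000u;		/* LRCA is 4 KiB aligned */
	desc |= (uint64_t)(ctx_id & ((1u << GEN8_CTX_ID_WIDTH) - 1)) << GEN8_CTX_ID_SHIFT;

	return desc;	/* bits 53-63 stay zero: per the doc comment they are mbz or hardware-owned */
}
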