diff options
Diffstat (limited to 'drivers/gpu/drm/i915/intel_lrc.c')
-rw-r--r-- | drivers/gpu/drm/i915/intel_lrc.c | 707 |
1 files changed, 441 insertions, 266 deletions
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c index 6f972e6ec663..d36e25607435 100644 --- a/drivers/gpu/drm/i915/intel_lrc.c +++ b/drivers/gpu/drm/i915/intel_lrc.c @@ -208,8 +208,9 @@ /* Typical size of the average request (2 pipecontrols and a MI_BB) */ #define EXECLISTS_REQUEST_SIZE 64 /* bytes */ - #define WA_TAIL_DWORDS 2 +#define WA_TAIL_BYTES (sizeof(u32) * WA_TAIL_DWORDS) +#define PREEMPT_ID 0x1 static int execlists_context_deferred_alloc(struct i915_gem_context *ctx, struct intel_engine_cs *engine); @@ -243,8 +244,7 @@ int intel_sanitize_enable_execlists(struct drm_i915_private *dev_priv, int enabl return 0; if (HAS_LOGICAL_RING_CONTEXTS(dev_priv) && - USES_PPGTT(dev_priv) && - i915.use_mmio_flip >= 0) + USES_PPGTT(dev_priv)) return 1; return 0; @@ -279,17 +279,110 @@ intel_lr_context_descriptor_update(struct i915_gem_context *ctx, BUILD_BUG_ON(MAX_CONTEXT_HW_ID > (1<<GEN8_CTX_ID_WIDTH)); desc = ctx->desc_template; /* bits 0-11 */ - desc |= i915_ggtt_offset(ce->state) + LRC_PPHWSP_PN * PAGE_SIZE; + desc |= i915_ggtt_offset(ce->state) + LRC_HEADER_PAGES * PAGE_SIZE; /* bits 12-31 */ desc |= (u64)ctx->hw_id << GEN8_CTX_ID_SHIFT; /* bits 32-52 */ ce->lrc_desc = desc; } -uint64_t intel_lr_context_descriptor(struct i915_gem_context *ctx, - struct intel_engine_cs *engine) +static struct i915_priolist * +lookup_priolist(struct intel_engine_cs *engine, + struct i915_priotree *pt, + int prio) +{ + struct intel_engine_execlists * const execlists = &engine->execlists; + struct i915_priolist *p; + struct rb_node **parent, *rb; + bool first = true; + + if (unlikely(execlists->no_priolist)) + prio = I915_PRIORITY_NORMAL; + +find_priolist: + /* most positive priority is scheduled first, equal priorities fifo */ + rb = NULL; + parent = &execlists->queue.rb_node; + while (*parent) { + rb = *parent; + p = rb_entry(rb, typeof(*p), node); + if (prio > p->priority) { + parent = &rb->rb_left; + } else if (prio < p->priority) { + parent = 
&rb->rb_right; + first = false; + } else { + return p; + } + } + + if (prio == I915_PRIORITY_NORMAL) { + p = &execlists->default_priolist; + } else { + p = kmem_cache_alloc(engine->i915->priorities, GFP_ATOMIC); + /* Convert an allocation failure to a priority bump */ + if (unlikely(!p)) { + prio = I915_PRIORITY_NORMAL; /* recurses just once */ + + /* To maintain ordering with all rendering, after an + * allocation failure we have to disable all scheduling. + * Requests will then be executed in fifo, and schedule + * will ensure that dependencies are emitted in fifo. + * There will be still some reordering with existing + * requests, so if userspace lied about their + * dependencies that reordering may be visible. + */ + execlists->no_priolist = true; + goto find_priolist; + } + } + + p->priority = prio; + INIT_LIST_HEAD(&p->requests); + rb_link_node(&p->node, rb, parent); + rb_insert_color(&p->node, &execlists->queue); + + if (first) + execlists->first = &p->node; + + return ptr_pack_bits(p, first, 1); +} + +static void unwind_wa_tail(struct drm_i915_gem_request *rq) { - return ctx->engine[engine->id].lrc_desc; + rq->tail = intel_ring_wrap(rq->ring, rq->wa_tail - WA_TAIL_BYTES); + assert_ring_tail_valid(rq->ring, rq->tail); +} + +static void unwind_incomplete_requests(struct intel_engine_cs *engine) +{ + struct drm_i915_gem_request *rq, *rn; + struct i915_priolist *uninitialized_var(p); + int last_prio = I915_PRIORITY_INVALID; + + lockdep_assert_held(&engine->timeline->lock); + + list_for_each_entry_safe_reverse(rq, rn, + &engine->timeline->requests, + link) { + if (i915_gem_request_completed(rq)) + return; + + __i915_gem_request_unsubmit(rq); + unwind_wa_tail(rq); + + GEM_BUG_ON(rq->priotree.priority == I915_PRIORITY_INVALID); + if (rq->priotree.priority != last_prio) { + p = lookup_priolist(engine, + &rq->priotree, + rq->priotree.priority); + p = ptr_mask_bits(p, 1); + + last_prio = rq->priotree.priority; + } + + list_add(&rq->priotree.link, &p->requests); + } } 
static inline void @@ -336,14 +429,20 @@ static u64 execlists_update_context(struct drm_i915_gem_request *rq) return ce->lrc_desc; } +static inline void elsp_write(u64 desc, u32 __iomem *elsp) +{ + writel(upper_32_bits(desc), elsp); + writel(lower_32_bits(desc), elsp); +} + static void execlists_submit_ports(struct intel_engine_cs *engine) { - struct execlist_port *port = engine->execlist_port; + struct execlist_port *port = engine->execlists.port; u32 __iomem *elsp = engine->i915->regs + i915_mmio_reg_offset(RING_ELSP(engine)); unsigned int n; - for (n = ARRAY_SIZE(engine->execlist_port); n--; ) { + for (n = execlists_num_ports(&engine->execlists); n--; ) { struct drm_i915_gem_request *rq; unsigned int count; u64 desc; @@ -361,8 +460,7 @@ static void execlists_submit_ports(struct intel_engine_cs *engine) desc = 0; } - writel(upper_32_bits(desc), elsp); - writel(lower_32_bits(desc), elsp); + elsp_write(desc, elsp); } } @@ -395,25 +493,43 @@ static void port_assign(struct execlist_port *port, port_set(port, port_pack(i915_gem_request_get(rq), port_count(port))); } +static void inject_preempt_context(struct intel_engine_cs *engine) +{ + struct intel_context *ce = + &engine->i915->preempt_context->engine[engine->id]; + u32 __iomem *elsp = + engine->i915->regs + i915_mmio_reg_offset(RING_ELSP(engine)); + unsigned int n; + + GEM_BUG_ON(engine->i915->preempt_context->hw_id != PREEMPT_ID); + GEM_BUG_ON(!IS_ALIGNED(ce->ring->size, WA_TAIL_BYTES)); + + memset(ce->ring->vaddr + ce->ring->tail, 0, WA_TAIL_BYTES); + ce->ring->tail += WA_TAIL_BYTES; + ce->ring->tail &= (ce->ring->size - 1); + ce->lrc_reg_state[CTX_RING_TAIL+1] = ce->ring->tail; + + for (n = execlists_num_ports(&engine->execlists); --n; ) + elsp_write(0, elsp); + + elsp_write(ce->lrc_desc, elsp); +} + +static bool can_preempt(struct intel_engine_cs *engine) +{ + return INTEL_INFO(engine->i915)->has_logical_ring_preemption; +} + static void execlists_dequeue(struct intel_engine_cs *engine) { - struct 
drm_i915_gem_request *last; - struct execlist_port *port = engine->execlist_port; + struct intel_engine_execlists * const execlists = &engine->execlists; + struct execlist_port *port = execlists->port; + const struct execlist_port * const last_port = + &execlists->port[execlists->port_mask]; + struct drm_i915_gem_request *last = port_request(port); struct rb_node *rb; bool submit = false; - last = port_request(port); - if (last) - /* WaIdleLiteRestore:bdw,skl - * Apply the wa NOOPs to prevent ring:HEAD == req:TAIL - * as we resubmit the request. See gen8_emit_breadcrumb() - * for where we prepare the padding after the end of the - * request. - */ - last->tail = last->wa_tail; - - GEM_BUG_ON(port_isset(&port[1])); - /* Hardware submission is through 2 ports. Conceptually each port * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is * static for a context, and unique to each, so we only execute @@ -436,9 +552,68 @@ static void execlists_dequeue(struct intel_engine_cs *engine) */ spin_lock_irq(&engine->timeline->lock); - rb = engine->execlist_first; - GEM_BUG_ON(rb_first(&engine->execlist_queue) != rb); - while (rb) { + rb = execlists->first; + GEM_BUG_ON(rb_first(&execlists->queue) != rb); + if (!rb) + goto unlock; + + if (last) { + /* + * Don't resubmit or switch until all outstanding + * preemptions (lite-restore) are seen. Then we + * know the next preemption status we see corresponds + * to this ELSP update. + */ + if (port_count(&port[0]) > 1) + goto unlock; + + if (can_preempt(engine) && + rb_entry(rb, struct i915_priolist, node)->priority > + max(last->priotree.priority, 0)) { + /* + * Switch to our empty preempt context so + * the state of the GPU is known (idle). + */ + inject_preempt_context(engine); + execlists_set_active(execlists, + EXECLISTS_ACTIVE_PREEMPT); + goto unlock; + } else { + /* + * In theory, we could coalesce more requests onto + * the second port (the first port is active, with + * no preemptions pending). 
However, that means we + * then have to deal with the possible lite-restore + * of the second port (as we submit the ELSP, there + * may be a context-switch) but also we may complete + * the resubmission before the context-switch. Ergo, + * coalescing onto the second port will cause a + * preemption event, but we cannot predict whether + * that will affect port[0] or port[1]. + * + * If the second port is already active, we can wait + * until the next context-switch before contemplating + * new requests. The GPU will be busy and we should be + * able to resubmit the new ELSP before it idles, + * avoiding pipeline bubbles (momentary pauses where + * the driver is unable to keep up the supply of new + * work). + */ + if (port_count(&port[1])) + goto unlock; + + /* WaIdleLiteRestore:bdw,skl + * Apply the wa NOOPs to prevent + * ring:HEAD == req:TAIL as we resubmit the + * request. See gen8_emit_breadcrumb() for + * where we prepare the padding after the + * end of the request. + */ + last->tail = last->wa_tail; + } + } + + do { struct i915_priolist *p = rb_entry(rb, typeof(*p), node); struct drm_i915_gem_request *rq, *rn; @@ -460,7 +635,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) * combine this request with the last, then we * are done. 
*/ - if (port != engine->execlist_port) { + if (port == last_port) { __list_del_many(&p->requests, &rq->priotree.link); goto done; @@ -485,38 +660,108 @@ static void execlists_dequeue(struct intel_engine_cs *engine) if (submit) port_assign(port, last); port++; + + GEM_BUG_ON(port_isset(port)); } INIT_LIST_HEAD(&rq->priotree.link); - rq->priotree.priority = INT_MAX; - __i915_gem_request_submit(rq); - trace_i915_gem_request_in(rq, port_index(port, engine)); + trace_i915_gem_request_in(rq, port_index(port, execlists)); last = rq; submit = true; } rb = rb_next(rb); - rb_erase(&p->node, &engine->execlist_queue); + rb_erase(&p->node, &execlists->queue); INIT_LIST_HEAD(&p->requests); if (p->priority != I915_PRIORITY_NORMAL) kmem_cache_free(engine->i915->priorities, p); - } + } while (rb); done: - engine->execlist_first = rb; + execlists->first = rb; if (submit) port_assign(port, last); +unlock: spin_unlock_irq(&engine->timeline->lock); - if (submit) + if (submit) { + execlists_set_active(execlists, EXECLISTS_ACTIVE_USER); execlists_submit_ports(engine); + } +} + +static void +execlist_cancel_port_requests(struct intel_engine_execlists *execlists) +{ + struct execlist_port *port = execlists->port; + unsigned int num_ports = execlists_num_ports(execlists); + + while (num_ports-- && port_isset(port)) { + struct drm_i915_gem_request *rq = port_request(port); + + GEM_BUG_ON(!execlists->active); + execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_PREEMPTED); + i915_gem_request_put(rq); + + memset(port, 0, sizeof(*port)); + port++; + } } -static bool execlists_elsp_ready(const struct intel_engine_cs *engine) +static void execlists_cancel_requests(struct intel_engine_cs *engine) { - const struct execlist_port *port = engine->execlist_port; + struct intel_engine_execlists * const execlists = &engine->execlists; + struct drm_i915_gem_request *rq, *rn; + struct rb_node *rb; + unsigned long flags; - return port_count(&port[0]) + port_count(&port[1]) < 2; + 
spin_lock_irqsave(&engine->timeline->lock, flags); + + /* Cancel the requests on the HW and clear the ELSP tracker. */ + execlist_cancel_port_requests(execlists); + + /* Mark all executing requests as skipped. */ + list_for_each_entry(rq, &engine->timeline->requests, link) { + GEM_BUG_ON(!rq->global_seqno); + if (!i915_gem_request_completed(rq)) + dma_fence_set_error(&rq->fence, -EIO); + } + + /* Flush the queued requests to the timeline list (for retiring). */ + rb = execlists->first; + while (rb) { + struct i915_priolist *p = rb_entry(rb, typeof(*p), node); + + list_for_each_entry_safe(rq, rn, &p->requests, priotree.link) { + INIT_LIST_HEAD(&rq->priotree.link); + + dma_fence_set_error(&rq->fence, -EIO); + __i915_gem_request_submit(rq); + } + + rb = rb_next(rb); + rb_erase(&p->node, &execlists->queue); + INIT_LIST_HEAD(&p->requests); + if (p->priority != I915_PRIORITY_NORMAL) + kmem_cache_free(engine->i915->priorities, p); + } + + /* Remaining _unready_ requests will be nop'ed when submitted */ + + + execlists->queue = RB_ROOT; + execlists->first = NULL; + GEM_BUG_ON(port_isset(execlists->port)); + + /* + * The port is checked prior to scheduling a tasklet, but + * just in case we have suspended the tasklet to do the + * wedging make sure that when it wakes, it decides there + * is no work to do by clearing the irq_posted bit. 
+ */ + clear_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted); + + spin_unlock_irqrestore(&engine->timeline->lock, flags); } /* @@ -525,8 +770,9 @@ static bool execlists_elsp_ready(const struct intel_engine_cs *engine) */ static void intel_lrc_irq_handler(unsigned long data) { - struct intel_engine_cs *engine = (struct intel_engine_cs *)data; - struct execlist_port *port = engine->execlist_port; + struct intel_engine_cs * const engine = (struct intel_engine_cs *)data; + struct intel_engine_execlists * const execlists = &engine->execlists; + struct execlist_port * const port = execlists->port; struct drm_i915_private *dev_priv = engine->i915; /* We can skip acquiring intel_runtime_pm_get() here as it was taken @@ -538,19 +784,24 @@ static void intel_lrc_irq_handler(unsigned long data) */ GEM_BUG_ON(!dev_priv->gt.awake); - intel_uncore_forcewake_get(dev_priv, engine->fw_domains); + intel_uncore_forcewake_get(dev_priv, execlists->fw_domains); /* Prefer doing test_and_clear_bit() as a two stage operation to avoid * imposing the cost of a locked atomic transaction when submitting a * new request (outside of the context-switch interrupt). */ while (test_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted)) { - u32 __iomem *csb_mmio = - dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine)); - u32 __iomem *buf = - dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_BUF_LO(engine, 0)); + /* The HWSP contains a (cacheable) mirror of the CSB */ + const u32 *buf = + &engine->status_page.page_addr[I915_HWS_CSB_BUF0_INDEX]; unsigned int head, tail; + if (unlikely(execlists->csb_use_mmio)) { + buf = (u32 * __force) + (dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_BUF_LO(engine, 0))); + execlists->csb_head = -1; /* force mmio read of CSB ptrs */ + } + /* The write will be ordered by the uncached read (itself * a memory barrier), so we do not need another in the form * of a locked instruction. 
The race between the interrupt @@ -562,9 +813,20 @@ static void intel_lrc_irq_handler(unsigned long data) * is set and we do a new loop. */ __clear_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted); - head = readl(csb_mmio); - tail = GEN8_CSB_WRITE_PTR(head); - head = GEN8_CSB_READ_PTR(head); + if (unlikely(execlists->csb_head == -1)) { /* following a reset */ + head = readl(dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine))); + tail = GEN8_CSB_WRITE_PTR(head); + head = GEN8_CSB_READ_PTR(head); + execlists->csb_head = head; + } else { + const int write_idx = + intel_hws_csb_write_index(dev_priv) - + I915_HWS_CSB_BUF0_INDEX; + + head = execlists->csb_head; + tail = READ_ONCE(buf[write_idx]); + } + while (head != tail) { struct drm_i915_gem_request *rq; unsigned int status; @@ -590,13 +852,35 @@ static void intel_lrc_irq_handler(unsigned long data) * status notifier. */ - status = readl(buf + 2 * head); + status = READ_ONCE(buf[2 * head]); /* maybe mmio! */ if (!(status & GEN8_CTX_STATUS_COMPLETED_MASK)) continue; + if (status & GEN8_CTX_STATUS_ACTIVE_IDLE && + buf[2*head + 1] == PREEMPT_ID) { + execlist_cancel_port_requests(execlists); + + spin_lock_irq(&engine->timeline->lock); + unwind_incomplete_requests(engine); + spin_unlock_irq(&engine->timeline->lock); + + GEM_BUG_ON(!execlists_is_active(execlists, + EXECLISTS_ACTIVE_PREEMPT)); + execlists_clear_active(execlists, + EXECLISTS_ACTIVE_PREEMPT); + continue; + } + + if (status & GEN8_CTX_STATUS_PREEMPTED && + execlists_is_active(execlists, + EXECLISTS_ACTIVE_PREEMPT)) + continue; + + GEM_BUG_ON(!execlists_is_active(execlists, + EXECLISTS_ACTIVE_USER)); + /* Check the context/desc id for this event matches */ - GEM_DEBUG_BUG_ON(readl(buf + 2 * head + 1) != - port->context_id); + GEM_DEBUG_BUG_ON(buf[2 * head + 1] != port->context_id); rq = port_unpack(port, &count); GEM_BUG_ON(count == 0); @@ -608,8 +892,7 @@ static void intel_lrc_irq_handler(unsigned long data) trace_i915_gem_request_out(rq); 
i915_gem_request_put(rq); - port[0] = port[1]; - memset(&port[1], 0, sizeof(port[1])); + execlists_port_complete(execlists, port); } else { port_set(port, port_pack(rq, count)); } @@ -617,80 +900,33 @@ static void intel_lrc_irq_handler(unsigned long data) /* After the final element, the hw should be idle */ GEM_BUG_ON(port_count(port) == 0 && !(status & GEN8_CTX_STATUS_ACTIVE_IDLE)); + if (port_count(port) == 0) + execlists_clear_active(execlists, + EXECLISTS_ACTIVE_USER); } - writel(_MASKED_FIELD(GEN8_CSB_READ_PTR_MASK, head << 8), - csb_mmio); + if (head != execlists->csb_head) { + execlists->csb_head = head; + writel(_MASKED_FIELD(GEN8_CSB_READ_PTR_MASK, head << 8), + dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine))); + } } - if (execlists_elsp_ready(engine)) + if (!execlists_is_active(execlists, EXECLISTS_ACTIVE_PREEMPT)) execlists_dequeue(engine); - intel_uncore_forcewake_put(dev_priv, engine->fw_domains); + intel_uncore_forcewake_put(dev_priv, execlists->fw_domains); } -static bool -insert_request(struct intel_engine_cs *engine, - struct i915_priotree *pt, - int prio) +static void insert_request(struct intel_engine_cs *engine, + struct i915_priotree *pt, + int prio) { - struct i915_priolist *p; - struct rb_node **parent, *rb; - bool first = true; - - if (unlikely(engine->no_priolist)) - prio = I915_PRIORITY_NORMAL; - -find_priolist: - /* most positive priority is scheduled first, equal priorities fifo */ - rb = NULL; - parent = &engine->execlist_queue.rb_node; - while (*parent) { - rb = *parent; - p = rb_entry(rb, typeof(*p), node); - if (prio > p->priority) { - parent = &rb->rb_left; - } else if (prio < p->priority) { - parent = &rb->rb_right; - first = false; - } else { - list_add_tail(&pt->link, &p->requests); - return false; - } - } - - if (prio == I915_PRIORITY_NORMAL) { - p = &engine->default_priolist; - } else { - p = kmem_cache_alloc(engine->i915->priorities, GFP_ATOMIC); - /* Convert an allocation failure to a priority bump */ - 
if (unlikely(!p)) { - prio = I915_PRIORITY_NORMAL; /* recurses just once */ - - /* To maintain ordering with all rendering, after an - * allocation failure we have to disable all scheduling. - * Requests will then be executed in fifo, and schedule - * will ensure that dependencies are emitted in fifo. - * There will be still some reordering with existing - * requests, so if userspace lied about their - * dependencies that reordering may be visible. - */ - engine->no_priolist = true; - goto find_priolist; - } - } + struct i915_priolist *p = lookup_priolist(engine, pt, prio); - p->priority = prio; - rb_link_node(&p->node, rb, parent); - rb_insert_color(&p->node, &engine->execlist_queue); - - INIT_LIST_HEAD(&p->requests); - list_add_tail(&pt->link, &p->requests); - - if (first) - engine->execlist_first = &p->node; - - return first; + list_add_tail(&pt->link, &ptr_mask_bits(p, 1)->requests); + if (ptr_unmask_bits(p, 1)) + tasklet_hi_schedule(&engine->execlists.irq_tasklet); } static void execlists_submit_request(struct drm_i915_gem_request *request) @@ -701,24 +937,23 @@ static void execlists_submit_request(struct drm_i915_gem_request *request) /* Will be called from irq-context when using foreign fences. 
*/ spin_lock_irqsave(&engine->timeline->lock, flags); - if (insert_request(engine, - &request->priotree, - request->priotree.priority)) { - if (execlists_elsp_ready(engine)) - tasklet_hi_schedule(&engine->irq_tasklet); - } + insert_request(engine, &request->priotree, request->priotree.priority); - GEM_BUG_ON(!engine->execlist_first); + GEM_BUG_ON(!engine->execlists.first); GEM_BUG_ON(list_empty(&request->priotree.link)); spin_unlock_irqrestore(&engine->timeline->lock, flags); } +static struct drm_i915_gem_request *pt_to_request(struct i915_priotree *pt) +{ + return container_of(pt, struct drm_i915_gem_request, priotree); +} + static struct intel_engine_cs * pt_lock_engine(struct i915_priotree *pt, struct intel_engine_cs *locked) { - struct intel_engine_cs *engine = - container_of(pt, struct drm_i915_gem_request, priotree)->engine; + struct intel_engine_cs *engine = pt_to_request(pt)->engine; GEM_BUG_ON(!locked); @@ -737,6 +972,8 @@ static void execlists_schedule(struct drm_i915_gem_request *request, int prio) struct i915_dependency stack; LIST_HEAD(dfs); + GEM_BUG_ON(prio == I915_PRIORITY_INVALID); + if (prio <= READ_ONCE(request->priotree.priority)) return; @@ -772,6 +1009,9 @@ static void execlists_schedule(struct drm_i915_gem_request *request, int prio) * engines. */ list_for_each_entry(p, &pt->signalers_list, signal_link) { + if (i915_gem_request_completed(pt_to_request(p->signaler))) + continue; + GEM_BUG_ON(p->signaler->priority < pt->priority); if (prio > READ_ONCE(p->signaler->priority)) list_move_tail(&p->dfs_link, &dfs); @@ -785,7 +1025,7 @@ static void execlists_schedule(struct drm_i915_gem_request *request, int prio) * execlists_submit_request()), we can set our own priority and skip * acquiring the engine locks. 
*/ - if (request->priotree.priority == INT_MIN) { + if (request->priotree.priority == I915_PRIORITY_INVALID) { GEM_BUG_ON(!list_empty(&request->priotree.link)); request->priotree.priority = prio; if (stack.dfs_link.next == stack.dfs_link.prev) @@ -815,8 +1055,6 @@ static void execlists_schedule(struct drm_i915_gem_request *request, int prio) } spin_unlock_irq(&engine->timeline->lock); - - /* XXX Do we need to preempt to make room for us and our deps? */ } static struct intel_ring * @@ -866,6 +1104,7 @@ execlists_context_pin(struct intel_engine_cs *engine, i915_ggtt_offset(ce->ring->vma); ce->state->obj->mm.dirty = true; + ce->state->obj->pin_global++; i915_gem_context_get(ctx); out: @@ -893,6 +1132,7 @@ static void execlists_context_unpin(struct intel_engine_cs *engine, intel_ring_unpin(ce->ring); + ce->state->obj->pin_global--; i915_gem_object_unpin_map(ce->state->obj); i915_vma_unpin(ce->state); @@ -914,27 +1154,14 @@ static int execlists_request_alloc(struct drm_i915_gem_request *request) */ request->reserved_space += EXECLISTS_REQUEST_SIZE; - if (i915.enable_guc_submission) { - /* - * Check that the GuC has space for the request before - * going any further, as the i915_add_request() call - * later on mustn't fail ... 
- */ - ret = i915_guc_wq_reserve(request); - if (ret) - goto err; - } - cs = intel_ring_begin(request, 0); - if (IS_ERR(cs)) { - ret = PTR_ERR(cs); - goto err_unreserve; - } + if (IS_ERR(cs)) + return PTR_ERR(cs); if (!ce->initialised) { ret = engine->init_context(request); if (ret) - goto err_unreserve; + return ret; ce->initialised = true; } @@ -948,12 +1175,6 @@ static int execlists_request_alloc(struct drm_i915_gem_request *request) request->reserved_space -= EXECLISTS_REQUEST_SIZE; return 0; - -err_unreserve: - if (i915.enable_guc_submission) - i915_guc_wq_unreserve(request); -err: - return ret; } /* @@ -1031,6 +1252,8 @@ static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) i915_ggtt_offset(engine->scratch) + 2 * CACHELINE_BYTES); + *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; + /* Pad to end of cacheline */ while ((unsigned long)batch % CACHELINE_BYTES) *batch++ = MI_NOOP; @@ -1044,26 +1267,10 @@ static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) return batch; } -/* - * This batch is started immediately after indirect_ctx batch. Since we ensure - * that indirect_ctx ends on a cacheline this batch is aligned automatically. - * - * The number of DWORDS written are returned using this field. - * - * This batch is terminated with MI_BATCH_BUFFER_END and so we need not add padding - * to align it with cacheline as padding after MI_BATCH_BUFFER_END is redundant. 
- */ -static u32 *gen8_init_perctx_bb(struct intel_engine_cs *engine, u32 *batch) -{ - /* WaDisableCtxRestoreArbitration:bdw,chv */ - *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; - *batch++ = MI_BATCH_BUFFER_END; - - return batch; -} - static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) { + *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE; + /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */ batch = gen8_emit_flush_coherentl3_wa(engine, batch); @@ -1109,6 +1316,8 @@ static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) *batch++ = 0; } + *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; + /* Pad to end of cacheline */ while ((unsigned long)batch % CACHELINE_BYTES) *batch++ = MI_NOOP; @@ -1116,13 +1325,6 @@ static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) return batch; } -static u32 *gen9_init_perctx_bb(struct intel_engine_cs *engine, u32 *batch) -{ - *batch++ = MI_BATCH_BUFFER_END; - - return batch; -} - #define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE) static int lrc_setup_wa_ctx(struct intel_engine_cs *engine) @@ -1175,13 +1377,15 @@ static int intel_init_workaround_bb(struct intel_engine_cs *engine) return -EINVAL; switch (INTEL_GEN(engine->i915)) { + case 10: + return 0; case 9: wa_bb_fn[0] = gen9_init_indirectctx_bb; - wa_bb_fn[1] = gen9_init_perctx_bb; + wa_bb_fn[1] = NULL; break; case 8: wa_bb_fn[0] = gen8_init_indirectctx_bb; - wa_bb_fn[1] = gen8_init_perctx_bb; + wa_bb_fn[1] = NULL; break; default: MISSING_CASE(INTEL_GEN(engine->i915)); @@ -1208,7 +1412,8 @@ static int intel_init_workaround_bb(struct intel_engine_cs *engine) ret = -EINVAL; break; } - batch_ptr = wa_bb_fn[i](engine, batch_ptr); + if (wa_bb_fn[i]) + batch_ptr = wa_bb_fn[i](engine, batch_ptr); wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset); } @@ -1232,9 +1437,7 @@ static u8 gtiir[] = { static int gen8_init_common_ring(struct intel_engine_cs *engine) { struct drm_i915_private *dev_priv = engine->i915; - struct 
execlist_port *port = engine->execlist_port; - unsigned int n; - bool submit; + struct intel_engine_execlists * const execlists = &engine->execlists; int ret; ret = intel_mocs_init_engine(engine); @@ -1267,24 +1470,12 @@ static int gen8_init_common_ring(struct intel_engine_cs *engine) I915_WRITE(GEN8_GT_IIR(gtiir[engine->id]), GT_CONTEXT_SWITCH_INTERRUPT << engine->irq_shift); clear_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted); + execlists->csb_head = -1; + execlists->active = 0; /* After a GPU reset, we may have requests to replay */ - submit = false; - for (n = 0; n < ARRAY_SIZE(engine->execlist_port); n++) { - if (!port_isset(&port[n])) - break; - - DRM_DEBUG_DRIVER("Restarting %s:%d from 0x%x\n", - engine->name, n, - port_request(&port[n])->global_seqno); - - /* Discard the current inflight count */ - port_set(&port[n], port_request(&port[n])); - submit = true; - } - - if (submit && !i915.enable_guc_submission) - execlists_submit_ports(engine); + if (!i915_modparams.enable_guc_submission && execlists->first) + tasklet_schedule(&execlists->irq_tasklet); return 0; } @@ -1325,9 +1516,11 @@ static int gen9_init_render_ring(struct intel_engine_cs *engine) static void reset_common_ring(struct intel_engine_cs *engine, struct drm_i915_gem_request *request) { - struct execlist_port *port = engine->execlist_port; + struct intel_engine_execlists * const execlists = &engine->execlists; struct intel_context *ce; - unsigned int n; + unsigned long flags; + + spin_lock_irqsave(&engine->timeline->lock, flags); /* * Catch up with any missed context-switch interrupts. @@ -1338,20 +1531,12 @@ static void reset_common_ring(struct intel_engine_cs *engine, * guessing the missed context-switch events by looking at what * requests were completed. 
*/ - if (!request) { - for (n = 0; n < ARRAY_SIZE(engine->execlist_port); n++) - i915_gem_request_put(port_request(&port[n])); - memset(engine->execlist_port, 0, sizeof(engine->execlist_port)); - return; - } + execlist_cancel_port_requests(execlists); - if (request->ctx != port_request(port)->ctx) { - i915_gem_request_put(port_request(port)); - port[0] = port[1]; - memset(&port[1], 0, sizeof(port[1])); - } + /* Push back any incomplete requests for replay after the reset. */ + unwind_incomplete_requests(engine); - GEM_BUG_ON(request->ctx != port_request(port)->ctx); + spin_unlock_irqrestore(&engine->timeline->lock, flags); /* If the request was innocent, we leave the request in the ELSP * and will try to replay it on restarting. The context image may @@ -1363,7 +1548,7 @@ static void reset_common_ring(struct intel_engine_cs *engine, * and have to at least restore the RING register in the context * image back to the expected values to skip over the guilty request. */ - if (request->fence.error != -EIO) + if (!request || request->fence.error != -EIO) return; /* We want a simple context + ring to execute the breadcrumb update. 
@@ -1386,10 +1571,7 @@ static void reset_common_ring(struct intel_engine_cs *engine, intel_ring_update_space(request->ring); /* Reset WaIdleLiteRestore:bdw,skl as well */ - request->tail = - intel_ring_wrap(request->ring, - request->wa_tail - WA_TAIL_DWORDS*sizeof(u32)); - assert_ring_tail_valid(request->ring, request->tail); + unwind_wa_tail(request); } static int intel_logical_ring_emit_pdps(struct drm_i915_gem_request *req) @@ -1448,13 +1630,31 @@ static int gen8_emit_bb_start(struct drm_i915_gem_request *req, if (IS_ERR(cs)) return PTR_ERR(cs); + /* + * WaDisableCtxRestoreArbitration:bdw,chv + * + * We don't need to perform MI_ARB_ENABLE as often as we do (in + * particular all the gen that do not need the w/a at all!), if we + * took care to make sure that on every switch into this context + * (both ordinary and for preemption) that arbitration was enabled + * we would be fine. However, there doesn't seem to be a downside to + * being paranoid and making sure it is set before each batch and + * every context-switch. + * + * Note that if we fail to enable arbitration before the request + * is complete, then we do not see the context-switch interrupt and + * the engine hangs (with RING_HEAD == RING_TAIL). + * + * That satisfies both the GPGPU w/a and our heavy-handed paranoia. + */ + *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; + /* FIXME(BDW): Address space and security selectors. */ *cs++ = MI_BATCH_BUFFER_START_GEN8 | (flags & I915_DISPATCH_SECURE ? 0 : BIT(8)) | (flags & I915_DISPATCH_RS ? MI_BATCH_RESOURCE_STREAMER : 0); *cs++ = lower_32_bits(offset); *cs++ = upper_32_bits(offset); - *cs++ = MI_NOOP; intel_ring_advance(req, cs); return 0; @@ -1583,7 +1783,8 @@ static int gen8_emit_flush_render(struct drm_i915_gem_request *request, */ static void gen8_emit_wa_tail(struct drm_i915_gem_request *request, u32 *cs) { - *cs++ = MI_NOOP; + /* Ensure there's always at least one preemption point per-request.
*/ + *cs++ = MI_ARB_CHECK; *cs++ = MI_NOOP; request->wa_tail = intel_ring_offset(request, cs); } @@ -1604,7 +1805,6 @@ static void gen8_emit_breadcrumb(struct drm_i915_gem_request *request, u32 *cs) gen8_emit_wa_tail(request, cs); } - static const int gen8_emit_breadcrumb_sz = 6 + WA_TAIL_DWORDS; static void gen8_emit_breadcrumb_render(struct drm_i915_gem_request *request, @@ -1632,7 +1832,6 @@ static void gen8_emit_breadcrumb_render(struct drm_i915_gem_request *request, gen8_emit_wa_tail(request, cs); } - static const int gen8_emit_breadcrumb_render_sz = 8 + WA_TAIL_DWORDS; static int gen8_init_rcs_context(struct drm_i915_gem_request *req) @@ -1666,8 +1865,8 @@ void intel_logical_ring_cleanup(struct intel_engine_cs *engine) * Tasklet cannot be active at this point due intel_mark_active/idle * so this is just for documentation. */ - if (WARN_ON(test_bit(TASKLET_STATE_SCHED, &engine->irq_tasklet.state))) - tasklet_kill(&engine->irq_tasklet); + if (WARN_ON(test_bit(TASKLET_STATE_SCHED, &engine->execlists.irq_tasklet.state))) + tasklet_kill(&engine->execlists.irq_tasklet); dev_priv = engine->i915; @@ -1678,11 +1877,6 @@ void intel_logical_ring_cleanup(struct intel_engine_cs *engine) if (engine->cleanup) engine->cleanup(engine); - if (engine->status_page.vma) { - i915_gem_object_unpin_map(engine->status_page.vma->obj); - engine->status_page.vma = NULL; - } - intel_engine_cleanup_common(engine); lrc_destroy_wa_ctx(engine); @@ -1694,8 +1888,9 @@ void intel_logical_ring_cleanup(struct intel_engine_cs *engine) static void execlists_set_default_submission(struct intel_engine_cs *engine) { engine->submit_request = execlists_submit_request; + engine->cancel_requests = execlists_cancel_requests; engine->schedule = execlists_schedule; - engine->irq_tasklet.func = intel_lrc_irq_handler; + engine->execlists.irq_tasklet.func = intel_lrc_irq_handler; } static void @@ -1729,24 +1924,6 @@ logical_ring_default_irqs(struct intel_engine_cs *engine) engine->irq_keep_mask = 
GT_CONTEXT_SWITCH_INTERRUPT << shift; } -static int -lrc_setup_hws(struct intel_engine_cs *engine, struct i915_vma *vma) -{ - const int hws_offset = LRC_PPHWSP_PN * PAGE_SIZE; - void *hws; - - /* The HWSP is part of the default context object in LRC mode. */ - hws = i915_gem_object_pin_map(vma->obj, I915_MAP_WB); - if (IS_ERR(hws)) - return PTR_ERR(hws); - - engine->status_page.page_addr = hws + hws_offset; - engine->status_page.ggtt_offset = i915_ggtt_offset(vma) + hws_offset; - engine->status_page.vma = vma; - - return 0; -} - static void logical_ring_setup(struct intel_engine_cs *engine) { @@ -1770,32 +1947,23 @@ logical_ring_setup(struct intel_engine_cs *engine) RING_CONTEXT_STATUS_BUF_BASE(engine), FW_REG_READ); - engine->fw_domains = fw_domains; + engine->execlists.fw_domains = fw_domains; - tasklet_init(&engine->irq_tasklet, + tasklet_init(&engine->execlists.irq_tasklet, intel_lrc_irq_handler, (unsigned long)engine); logical_ring_default_vfuncs(engine); logical_ring_default_irqs(engine); } -static int -logical_ring_init(struct intel_engine_cs *engine) +static int logical_ring_init(struct intel_engine_cs *engine) { - struct i915_gem_context *dctx = engine->i915->kernel_context; int ret; ret = intel_engine_init_common(engine); if (ret) goto error; - /* And setup the hardware status page. 
*/ - ret = lrc_setup_hws(engine, dctx->engine[engine->id].state); - if (ret) { - DRM_ERROR("Failed to set up hws %s: %d\n", engine->name, ret); - goto error; - } - return 0; error: @@ -1953,13 +2121,12 @@ static void execlists_init_reg_state(u32 *regs, CTX_REG(regs, CTX_SECOND_BB_HEAD_L, RING_SBBADDR(base), 0); CTX_REG(regs, CTX_SECOND_BB_STATE, RING_SBBSTATE(base), 0); if (rcs) { - CTX_REG(regs, CTX_BB_PER_CTX_PTR, RING_BB_PER_CTX_PTR(base), 0); + struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx; + CTX_REG(regs, CTX_RCS_INDIRECT_CTX, RING_INDIRECT_CTX(base), 0); CTX_REG(regs, CTX_RCS_INDIRECT_CTX_OFFSET, RING_INDIRECT_CTX_OFFSET(base), 0); - - if (engine->wa_ctx.vma) { - struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx; + if (wa_ctx->indirect_ctx.size) { u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma); regs[CTX_RCS_INDIRECT_CTX + 1] = @@ -1968,6 +2135,11 @@ static void execlists_init_reg_state(u32 *regs, regs[CTX_RCS_INDIRECT_CTX_OFFSET + 1] = intel_lr_indirect_ctx_offset(engine) << 6; + } + + CTX_REG(regs, CTX_BB_PER_CTX_PTR, RING_BB_PER_CTX_PTR(base), 0); + if (wa_ctx->per_ctx.size) { + u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma); regs[CTX_BB_PER_CTX_PTR + 1] = (ggtt_offset + wa_ctx->per_ctx.offset) | 0x01; @@ -2052,8 +2224,11 @@ static int execlists_context_deferred_alloc(struct i915_gem_context *ctx, context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE); - /* One extra page as the sharing data between driver and GuC */ - context_size += PAGE_SIZE * LRC_PPHWSP_PN; + /* + * Before the actual start of the context image, we insert a few pages + * for our own use and for sharing with the GuC. + */ + context_size += LRC_HEADER_PAGES * PAGE_SIZE; ctx_obj = i915_gem_object_create(ctx->i915, context_size); if (IS_ERR(ctx_obj)) { |