diff options
Diffstat (limited to 'drivers/gpu/drm/i915')
35 files changed, 1049 insertions, 462 deletions
diff --git a/drivers/gpu/drm/i915/display/intel_ddi.c b/drivers/gpu/drm/i915/display/intel_ddi.c index aa22465bb56e..0575a1eea2a1 100644 --- a/drivers/gpu/drm/i915/display/intel_ddi.c +++ b/drivers/gpu/drm/i915/display/intel_ddi.c @@ -2579,14 +2579,14 @@ static void icl_ddi_vswing_sequence(struct intel_encoder *encoder, static void tgl_dkl_phy_ddi_vswing_sequence(struct intel_encoder *encoder, int link_clock, - u32 level) + u32 level, enum intel_output_type type) { struct drm_i915_private *dev_priv = to_i915(encoder->base.dev); enum tc_port tc_port = intel_port_to_tc(dev_priv, encoder->port); const struct tgl_dkl_phy_ddi_buf_trans *ddi_translations; u32 n_entries, val, ln, dpcnt_mask, dpcnt_val; - if (encoder->type == INTEL_OUTPUT_HDMI) { + if (type == INTEL_OUTPUT_HDMI) { n_entries = ARRAY_SIZE(tgl_dkl_phy_hdmi_ddi_trans); ddi_translations = tgl_dkl_phy_hdmi_ddi_trans; } else { @@ -2638,7 +2638,7 @@ static void tgl_ddi_vswing_sequence(struct intel_encoder *encoder, if (intel_phy_is_combo(dev_priv, phy)) icl_combo_phy_ddi_vswing_sequence(encoder, level, type); else - tgl_dkl_phy_ddi_vswing_sequence(encoder, link_clock, level); + tgl_dkl_phy_ddi_vswing_sequence(encoder, link_clock, level, type); } static u32 translate_signal_level(struct intel_dp *intel_dp, int signal_levels) @@ -2987,7 +2987,7 @@ icl_program_mg_dp_mode(struct intel_digital_port *intel_dig_port, ln1 = intel_de_read(dev_priv, MG_DP_MODE(1, tc_port)); } - ln0 &= ~(MG_DP_MODE_CFG_DP_X1_MODE | MG_DP_MODE_CFG_DP_X1_MODE); + ln0 &= ~(MG_DP_MODE_CFG_DP_X1_MODE | MG_DP_MODE_CFG_DP_X2_MODE); ln1 &= ~(MG_DP_MODE_CFG_DP_X1_MODE | MG_DP_MODE_CFG_DP_X2_MODE); /* DPPATC */ @@ -3472,7 +3472,9 @@ static void intel_ddi_post_disable_dp(struct intel_atomic_state *state, INTEL_OUTPUT_DP_MST); enum phy phy = intel_port_to_phy(dev_priv, encoder->port); - intel_dp_set_infoframes(encoder, false, old_crtc_state, old_conn_state); + if (!is_mst) + intel_dp_set_infoframes(encoder, false, + old_crtc_state, old_conn_state); /* * Power down sink before disabling the port, otherwise we end diff --git a/drivers/gpu/drm/i915/display/intel_display.c b/drivers/gpu/drm/i915/display/intel_display.c index 9ea1a397d1b5..26996e1839e2 100644 --- a/drivers/gpu/drm/i915/display/intel_display.c +++ b/drivers/gpu/drm/i915/display/intel_display.c @@ -3822,6 +3822,17 @@ skl_check_main_ccs_coordinates(struct intel_plane_state *plane_state, return true; } +unsigned int +intel_plane_fence_y_offset(const struct intel_plane_state *plane_state) +{ + int x = 0, y = 0; + + intel_plane_adjust_aligned_offset(&x, &y, plane_state, 0, + plane_state->color_plane[0].offset, 0); + + return y; +} + static int skl_check_main_surface(struct intel_plane_state *plane_state) { struct drm_i915_private *dev_priv = to_i915(plane_state->uapi.plane->dev); diff --git a/drivers/gpu/drm/i915/display/intel_display.h b/drivers/gpu/drm/i915/display/intel_display.h index efb4da205ea2..3a06f72c9859 100644 --- a/drivers/gpu/drm/i915/display/intel_display.h +++ b/drivers/gpu/drm/i915/display/intel_display.h @@ -608,6 +608,7 @@ unsigned int i9xx_plane_max_stride(struct intel_plane *plane, u32 pixel_format, u64 modifier, unsigned int rotation); int bdw_get_pipemisc_bpp(struct intel_crtc *crtc); +unsigned int intel_plane_fence_y_offset(const struct intel_plane_state *plane_state); struct intel_display_error_state * intel_display_capture_error_state(struct drm_i915_private *dev_priv); diff --git a/drivers/gpu/drm/i915/display/intel_dp_mst.c b/drivers/gpu/drm/i915/display/intel_dp_mst.c index d18b406f2a7d..f29e51ce489c 100644 --- a/drivers/gpu/drm/i915/display/intel_dp_mst.c +++ b/drivers/gpu/drm/i915/display/intel_dp_mst.c @@ -397,6 +397,14 @@ static void intel_mst_post_disable_dp(struct intel_atomic_state *state, */ drm_dp_send_power_updown_phy(&intel_dp->mst_mgr, connector->port, false); + + /* + * BSpec 4287: disable DIP after the transcoder is disabled and before + * the transcoder clock select is set to none. + */ + if (last_mst_stream) + intel_dp_set_infoframes(&intel_dig_port->base, false, + old_crtc_state, NULL); /* * From TGL spec: "If multi-stream slave transcoder: Configure * Transcoder Clock Select to direct no clock to the transcoder" diff --git a/drivers/gpu/drm/i915/display/intel_fbc.c b/drivers/gpu/drm/i915/display/intel_fbc.c index 1c26673acb2d..412572f88b67 100644 --- a/drivers/gpu/drm/i915/display/intel_fbc.c +++ b/drivers/gpu/drm/i915/display/intel_fbc.c @@ -48,19 +48,6 @@ #include "intel_frontbuffer.h" /* - * In some platforms where the CRTC's x:0/y:0 coordinates doesn't match the - * frontbuffer's x:0/y:0 coordinates we lie to the hardware about the plane's - * origin so the x and y offsets can actually fit the registers. As a - * consequence, the fence doesn't really start exactly at the display plane - * address we program because it starts at the real start of the buffer, so we - * have to take this into consideration here. - */ -static unsigned int get_crtc_fence_y_offset(struct intel_fbc *fbc) -{ - return fbc->state_cache.plane.y - fbc->state_cache.plane.adjusted_y; -} - -/* * For SKL+, the plane source size used by the hardware is based on the value we * write to the PLANE_SIZE register. For BDW-, the hardware looks at the value * we wrote to PIPESRC. @@ -141,7 +128,7 @@ static void i8xx_fbc_activate(struct drm_i915_private *dev_priv) fbc_ctl2 |= FBC_CTL_CPU_FENCE; intel_de_write(dev_priv, FBC_CONTROL2, fbc_ctl2); intel_de_write(dev_priv, FBC_FENCE_OFF, - params->crtc.fence_y_offset); + params->fence_y_offset); } /* enable it... */ @@ -175,7 +162,7 @@ static void g4x_fbc_activate(struct drm_i915_private *dev_priv) if (params->fence_id >= 0) { dpfc_ctl |= DPFC_CTL_FENCE_EN | params->fence_id; intel_de_write(dev_priv, DPFC_FENCE_YOFF, - params->crtc.fence_y_offset); + params->fence_y_offset); } else { intel_de_write(dev_priv, DPFC_FENCE_YOFF, 0); } @@ -243,7 +230,7 @@ static void ilk_fbc_activate(struct drm_i915_private *dev_priv) intel_de_write(dev_priv, SNB_DPFC_CTL_SA, SNB_CPU_FENCE_ENABLE | params->fence_id); intel_de_write(dev_priv, DPFC_CPU_FENCE_OFFSET, - params->crtc.fence_y_offset); + params->fence_y_offset); } } else { if (IS_GEN(dev_priv, 6)) { @@ -253,7 +240,7 @@ static void ilk_fbc_activate(struct drm_i915_private *dev_priv) } intel_de_write(dev_priv, ILK_DPFC_FENCE_YOFF, - params->crtc.fence_y_offset); + params->fence_y_offset); /* enable it... */ intel_de_write(dev_priv, ILK_DPFC_CONTROL, dpfc_ctl | DPFC_CTL_EN); @@ -320,7 +307,7 @@ static void gen7_fbc_activate(struct drm_i915_private *dev_priv) intel_de_write(dev_priv, SNB_DPFC_CTL_SA, SNB_CPU_FENCE_ENABLE | params->fence_id); intel_de_write(dev_priv, DPFC_CPU_FENCE_OFFSET, - params->crtc.fence_y_offset); + params->fence_y_offset); } else if (dev_priv->ggtt.num_fences) { intel_de_write(dev_priv, SNB_DPFC_CTL_SA, 0); intel_de_write(dev_priv, DPFC_CPU_FENCE_OFFSET, 0); @@ -631,8 +618,8 @@ static bool rotation_is_valid(struct drm_i915_private *dev_priv, /* * For some reason, the hardware tracking starts looking at whatever we * programmed as the display plane base address register. It does not look at - * the X and Y offset registers. That's why we look at the crtc->adjusted{x,y} - * variables instead of just looking at the pipe/plane size. + * the X and Y offset registers. That's why we include the src x/y offsets + * instead of just looking at the plane size. */ static bool intel_fbc_hw_tracking_covers_screen(struct intel_crtc *crtc) { @@ -705,7 +692,6 @@ static void intel_fbc_update_state_cache(struct intel_crtc *crtc, cache->plane.src_h = drm_rect_height(&plane_state->uapi.src) >> 16; cache->plane.adjusted_x = plane_state->color_plane[0].x; cache->plane.adjusted_y = plane_state->color_plane[0].y; - cache->plane.y = plane_state->uapi.src.y1 >> 16; cache->plane.pixel_blend_mode = plane_state->hw.pixel_blend_mode; @@ -713,6 +699,8 @@ static void intel_fbc_update_state_cache(struct intel_crtc *crtc, cache->fb.stride = fb->pitches[0]; cache->fb.modifier = fb->modifier; + cache->fence_y_offset = intel_plane_fence_y_offset(plane_state); + drm_WARN_ON(&dev_priv->drm, plane_state->flags & PLANE_HAS_FENCE && !plane_state->vma->fence); @@ -731,6 +719,25 @@ static bool intel_fbc_cfb_size_changed(struct drm_i915_private *dev_priv) fbc->compressed_fb.size * fbc->threshold; } +static u16 intel_fbc_gen9_wa_cfb_stride(struct drm_i915_private *dev_priv) +{ + struct intel_fbc *fbc = &dev_priv->fbc; + struct intel_fbc_state_cache *cache = &fbc->state_cache; + + if ((IS_GEN9_BC(dev_priv) || IS_BROXTON(dev_priv)) && + cache->fb.modifier != I915_FORMAT_MOD_X_TILED) + return DIV_ROUND_UP(cache->plane.src_w, 32 * fbc->threshold) * 8; + else + return 0; +} + +static bool intel_fbc_gen9_wa_cfb_stride_changed(struct drm_i915_private *dev_priv) +{ + struct intel_fbc *fbc = &dev_priv->fbc; + + return fbc->params.gen9_wa_cfb_stride != intel_fbc_gen9_wa_cfb_stride(dev_priv); +} + static bool intel_fbc_can_enable(struct drm_i915_private *dev_priv) { struct intel_fbc *fbc = &dev_priv->fbc; @@ -883,12 +890,13 @@ static void intel_fbc_get_reg_params(struct intel_crtc *crtc, memset(params, 0, sizeof(*params)); params->fence_id = cache->fence_id; + params->fence_y_offset = cache->fence_y_offset; params->crtc.pipe = crtc->pipe; params->crtc.i9xx_plane = to_intel_plane(crtc->base.primary)->i9xx_plane; - params->crtc.fence_y_offset = get_crtc_fence_y_offset(fbc); params->fb.format = cache->fb.format; + params->fb.modifier = cache->fb.modifier; params->fb.stride = cache->fb.stride; params->cfb_size = intel_fbc_calculate_cfb_size(dev_priv, cache); @@ -918,6 +926,9 @@ static bool intel_fbc_can_flip_nuke(const struct intel_crtc_state *crtc_state) if (params->fb.format != cache->fb.format) return false; + if (params->fb.modifier != cache->fb.modifier) + return false; + if (params->fb.stride != cache->fb.stride) return false; @@ -1197,7 +1208,8 @@ void intel_fbc_enable(struct intel_atomic_state *state, if (fbc->crtc) { if (fbc->crtc != crtc || - !intel_fbc_cfb_size_changed(dev_priv)) + (!intel_fbc_cfb_size_changed(dev_priv) && + !intel_fbc_gen9_wa_cfb_stride_changed(dev_priv))) goto out; __intel_fbc_disable(dev_priv); @@ -1219,12 +1231,7 @@ void intel_fbc_enable(struct intel_atomic_state *state, goto out; } - if ((IS_GEN9_BC(dev_priv) || IS_BROXTON(dev_priv)) && - plane_state->hw.fb->modifier != I915_FORMAT_MOD_X_TILED) - cache->gen9_wa_cfb_stride = - DIV_ROUND_UP(cache->plane.src_w, 32 * fbc->threshold) * 8; - else - cache->gen9_wa_cfb_stride = 0; + cache->gen9_wa_cfb_stride = intel_fbc_gen9_wa_cfb_stride(dev_priv); drm_dbg_kms(&dev_priv->drm, "Enabling FBC on pipe %c\n", pipe_name(crtc->pipe)); diff --git a/drivers/gpu/drm/i915/display/intel_hdmi.c b/drivers/gpu/drm/i915/display/intel_hdmi.c index 010f37240710..95b6d9457910 100644 --- a/drivers/gpu/drm/i915/display/intel_hdmi.c +++ b/drivers/gpu/drm/i915/display/intel_hdmi.c @@ -2867,19 +2867,13 @@ intel_hdmi_connector_register(struct drm_connector *connector) return ret; } -static void intel_hdmi_destroy(struct drm_connector *connector) +static void intel_hdmi_connector_unregister(struct drm_connector *connector) { struct cec_notifier *n = intel_attached_hdmi(to_intel_connector(connector))->cec_notifier; cec_notifier_conn_unregister(n); - intel_connector_destroy(connector); -} - -static void intel_hdmi_connector_unregister(struct drm_connector *connector) -{ intel_hdmi_remove_i2c_symlink(connector); - intel_connector_unregister(connector); } @@ -2891,7 +2885,7 @@ static const struct drm_connector_funcs intel_hdmi_connector_funcs = { .atomic_set_property = intel_digital_connector_atomic_set_property, .late_register = intel_hdmi_connector_register, .early_unregister = intel_hdmi_connector_unregister, - .destroy = intel_hdmi_destroy, + .destroy = intel_connector_destroy, .atomic_destroy_state = drm_atomic_helper_connector_destroy_state, .atomic_duplicate_state = intel_digital_connector_duplicate_state, }; diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c index e4aece20bc80..52db2bde44a3 100644 --- a/drivers/gpu/drm/i915/gt/intel_context.c +++ b/drivers/gpu/drm/i915/gt/intel_context.c @@ -204,25 +204,25 @@ static int __ring_active(struct intel_ring *ring) { int err; - err = i915_active_acquire(&ring->vma->active); + err = intel_ring_pin(ring); if (err) return err; - err = intel_ring_pin(ring); + err = i915_active_acquire(&ring->vma->active); if (err) - goto err_active; + goto err_pin; return 0; -err_active: - i915_active_release(&ring->vma->active); +err_pin: + intel_ring_unpin(ring); return err; } static void __ring_retire(struct intel_ring *ring) { - intel_ring_unpin(ring); i915_active_release(&ring->vma->active); + intel_ring_unpin(ring); } __i915_active_call diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c index da5b61085257..8691eb61e185 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c @@ -646,7 +646,7 @@ static int engine_setup_common(struct intel_engine_cs *engine) struct measure_breadcrumb { struct i915_request rq; struct intel_ring ring; - u32 cs[1024]; + u32 cs[2048]; }; static int measure_breadcrumb_dw(struct intel_context *ce) @@ -668,6 +668,8 @@ static int measure_breadcrumb_dw(struct intel_context *ce) frame->ring.vaddr = frame->cs; frame->ring.size = sizeof(frame->cs); + frame->ring.wrap = + BITS_PER_TYPE(frame->ring.size) - ilog2(frame->ring.size); frame->ring.effective_size = frame->ring.size; intel_ring_update_space(&frame->ring); frame->rq.ring = &frame->ring; diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c index 87e6c5bdd2dc..cb07e1d2a353 100644 --- a/drivers/gpu/drm/i915/gt/intel_lrc.c +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c @@ -1134,6 +1134,13 @@ __unwind_incomplete_requests(struct intel_engine_cs *engine) list_move(&rq->sched.link, pl); set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags); + /* Check in case we rollback so far we wrap [size/2] */ + if (intel_ring_direction(rq->ring, + intel_ring_wrap(rq->ring, + rq->tail), + rq->ring->tail) > 0) + rq->context->lrc.desc |= CTX_DESC_FORCE_RESTORE; + active = rq; } else { struct intel_engine_cs *owner = rq->context->engine; @@ -1498,8 +1505,9 @@ static u64 execlists_update_context(struct i915_request *rq) * HW has a tendency to ignore us rewinding the TAIL to the end of * an earlier request. */ + GEM_BUG_ON(ce->lrc_reg_state[CTX_RING_TAIL] != rq->ring->tail); + prev = rq->ring->tail; tail = intel_ring_set_tail(rq->ring, rq->tail); - prev = ce->lrc_reg_state[CTX_RING_TAIL]; if (unlikely(intel_ring_direction(rq->ring, tail, prev) <= 0)) desc |= CTX_DESC_FORCE_RESTORE; ce->lrc_reg_state[CTX_RING_TAIL] = tail; @@ -1895,7 +1903,8 @@ static void defer_active(struct intel_engine_cs *engine) static bool need_timeslice(const struct intel_engine_cs *engine, - const struct i915_request *rq) + const struct i915_request *rq, + const struct rb_node *rb) { int hint; @@ -1903,9 +1912,28 @@ need_timeslice(const struct intel_engine_cs *engine, return false; hint = engine->execlists.queue_priority_hint; + + if (rb) { + const struct virtual_engine *ve = + rb_entry(rb, typeof(*ve), nodes[engine->id].rb); + const struct intel_engine_cs *inflight = + intel_context_inflight(&ve->context); + + if (!inflight || inflight == engine) { + struct i915_request *next; + + rcu_read_lock(); + next = READ_ONCE(ve->request); + if (next) + hint = max(hint, rq_prio(next)); + rcu_read_unlock(); + } + } + if (!list_is_last(&rq->sched.link, &engine->active.requests)) hint = max(hint, rq_prio(list_next_entry(rq, sched.link))); + GEM_BUG_ON(hint >= I915_PRIORITY_UNPREEMPTABLE); return hint >= effective_prio(rq); } @@ -1977,10 +2005,9 @@ static void set_timeslice(struct intel_engine_cs *engine) set_timer_ms(&engine->execlists.timer, duration); } -static void start_timeslice(struct intel_engine_cs *engine) +static void start_timeslice(struct intel_engine_cs *engine, int prio) { struct intel_engine_execlists *execlists = &engine->execlists; - const int prio = queue_prio(execlists); unsigned long duration; if (!intel_engine_has_timeslices(engine)) @@ -2140,7 +2167,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) __unwind_incomplete_requests(engine); last = NULL; - } else if (need_timeslice(engine, last) && + } else if (need_timeslice(engine, last, rb) && timeslice_expired(execlists, last)) { if (i915_request_completed(last)) { tasklet_hi_schedule(&execlists->tasklet); @@ -2188,7 +2215,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) * Even if ELSP[1] is occupied and not worthy * of timeslices, our queue might be. */ - start_timeslice(engine); + start_timeslice(engine, queue_prio(execlists)); return; } } @@ -2223,7 +2250,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) if (last && !can_merge_rq(last, rq)) { spin_unlock(&ve->base.active.lock); - start_timeslice(engine); + start_timeslice(engine, rq_prio(rq)); return; /* leave this for another sibling */ } @@ -4739,6 +4766,14 @@ static int gen12_emit_flush(struct i915_request *request, u32 mode) return 0; } +static void assert_request_valid(struct i915_request *rq) +{ + struct intel_ring *ring __maybe_unused = rq->ring; + + /* Can we unwind this request without appearing to go forwards? */ + GEM_BUG_ON(intel_ring_direction(ring, rq->wa_tail, rq->head) <= 0); +} + /* * Reserve space for 2 NOOPs at the end of each request to be * used as a workaround for not being allowed to do lite @@ -4751,6 +4786,9 @@ static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs) *cs++ = MI_NOOP; request->wa_tail = intel_ring_offset(request, cs); + /* Check that entire request is less than half the ring */ + assert_request_valid(request); + return cs; } @@ -5358,13 +5396,8 @@ static void virtual_engine_initial_hint(struct virtual_engine *ve) * typically be the first we inspect for submission. */ swp = prandom_u32_max(ve->num_siblings); - if (!swp) - return; - - swap(ve->siblings[swp], ve->siblings[0]); - if (!intel_engine_has_relative_mmio(ve->siblings[0])) - virtual_update_register_offsets(ve->context.lrc_reg_state, - ve->siblings[0]); + if (swp) + swap(ve->siblings[swp], ve->siblings[0]); } static int virtual_context_alloc(struct intel_context *ce) @@ -5377,15 +5410,9 @@ static int virtual_context_alloc(struct intel_context *ce) static int virtual_context_pin(struct intel_context *ce) { struct virtual_engine *ve = container_of(ce, typeof(*ve), context); - int err; /* Note: we must use a real engine class for setting up reg state */ - err = __execlists_context_pin(ce, ve->siblings[0]); - if (err) - return err; - - virtual_engine_initial_hint(ve); - return 0; + return __execlists_context_pin(ce, ve->siblings[0]); } static void virtual_context_enter(struct intel_context *ce) @@ -5650,6 +5677,7 @@ intel_execlists_create_virtual(struct intel_engine_cs **siblings, intel_engine_init_active(&ve->base, ENGINE_VIRTUAL); intel_engine_init_breadcrumbs(&ve->base); intel_engine_init_execlists(&ve->base); + ve->base.breadcrumbs.irq_armed = true; /* fake HW, used for irq_work */ ve->base.cops = &virtual_context_ops; ve->base.request_alloc = execlists_request_alloc; @@ -5731,6 +5759,7 @@ intel_execlists_create_virtual(struct intel_engine_cs **siblings, ve->base.flags |= I915_ENGINE_IS_VIRTUAL; + virtual_engine_initial_hint(ve); return &ve->context; err_put: diff --git a/drivers/gpu/drm/i915/gt/intel_ring.c b/drivers/gpu/drm/i915/gt/intel_ring.c index 8cda1b7e17ba..bdb324167ef3 100644 --- a/drivers/gpu/drm/i915/gt/intel_ring.c +++ b/drivers/gpu/drm/i915/gt/intel_ring.c @@ -315,3 +315,7 @@ int intel_ring_cacheline_align(struct i915_request *rq) GEM_BUG_ON(rq->ring->emit & (CACHELINE_BYTES - 1)); return 0; } + +#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) +#include "selftest_ring.c" +#endif diff --git a/drivers/gpu/drm/i915/gt/intel_workarounds.c b/drivers/gpu/drm/i915/gt/intel_workarounds.c index 90a2b9e399b0..85d2bef51524 100644 --- a/drivers/gpu/drm/i915/gt/intel_workarounds.c +++ b/drivers/gpu/drm/i915/gt/intel_workarounds.c @@ -179,6 +179,12 @@ wa_write_or(struct i915_wa_list *wal, i915_reg_t reg, u32 set) } static void +wa_write_clr(struct i915_wa_list *wal, i915_reg_t reg, u32 clr) +{ + wa_write_masked_or(wal, reg, clr, 0); +} + +static void wa_masked_en(struct i915_wa_list *wal, i915_reg_t reg, u32 val) { wa_add(wal, reg, 0, _MASKED_BIT_ENABLE(val), val); @@ -687,6 +693,227 @@ int intel_engine_emit_ctx_wa(struct i915_request *rq) } static void +gen4_gt_workarounds_init(struct drm_i915_private *i915, + struct i915_wa_list *wal) +{ + /* WaDisable_RenderCache_OperationalFlush:gen4,ilk */ + wa_masked_dis(wal, CACHE_MODE_0, RC_OP_FLUSH_ENABLE); +} + +static void +g4x_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal) +{ + gen4_gt_workarounds_init(i915, wal); + + /* WaDisableRenderCachePipelinedFlush:g4x,ilk */ + wa_masked_en(wal, CACHE_MODE_0, CM0_PIPELINED_RENDER_FLUSH_DISABLE); +} + +static void +ilk_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal) +{ + g4x_gt_workarounds_init(i915, wal); + + wa_masked_en(wal, _3D_CHICKEN2, _3D_CHICKEN2_WM_READ_PIPELINED); +} + +static void +snb_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal) +{ + /* WaDisableHiZPlanesWhenMSAAEnabled:snb */ + wa_masked_en(wal, + _3D_CHICKEN, + _3D_CHICKEN_HIZ_PLANE_DISABLE_MSAA_4X_SNB); + + /* WaDisable_RenderCache_OperationalFlush:snb */ + wa_masked_dis(wal, CACHE_MODE_0, RC_OP_FLUSH_ENABLE); + + /* + * BSpec recommends 8x4 when MSAA is used, + * however in practice 16x4 seems fastest. + * + * Note that PS/WM thread counts depend on the WIZ hashing + * disable bit, which we don't touch here, but it's good + * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM). + */ + wa_add(wal, + GEN6_GT_MODE, 0, + _MASKED_FIELD(GEN6_WIZ_HASHING_MASK, GEN6_WIZ_HASHING_16x4), + GEN6_WIZ_HASHING_16x4); + + wa_masked_dis(wal, CACHE_MODE_0, CM0_STC_EVICT_DISABLE_LRA_SNB); + + wa_masked_en(wal, + _3D_CHICKEN3, + /* WaStripsFansDisableFastClipPerformanceFix:snb */ + _3D_CHICKEN3_SF_DISABLE_FASTCLIP_CULL | + /* + * Bspec says: + * "This bit must be set if 3DSTATE_CLIP clip mode is set + * to normal and 3DSTATE_SF number of SF output attributes + * is more than 16." + */ + _3D_CHICKEN3_SF_DISABLE_PIPELINED_ATTR_FETCH); +} + +static void +ivb_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal) +{ + /* WaDisableEarlyCull:ivb */ + wa_masked_en(wal, _3D_CHICKEN3, _3D_CHICKEN_SF_DISABLE_OBJEND_CULL); + + /* WaDisablePSDDualDispatchEnable:ivb */ + if (IS_IVB_GT1(i915)) + wa_masked_en(wal, + GEN7_HALF_SLICE_CHICKEN1, + GEN7_PSD_SINGLE_PORT_DISPATCH_ENABLE); + + /* WaDisable_RenderCache_OperationalFlush:ivb */ + wa_masked_dis(wal, CACHE_MODE_0_GEN7, RC_OP_FLUSH_ENABLE); + + /* Apply the WaDisableRHWOOptimizationForRenderHang:ivb workaround. */ + wa_masked_dis(wal, + GEN7_COMMON_SLICE_CHICKEN1, + GEN7_CSC1_RHWO_OPT_DISABLE_IN_RCC); + + /* WaApplyL3ControlAndL3ChickenMode:ivb */ + wa_write(wal, GEN7_L3CNTLREG1, GEN7_WA_FOR_GEN7_L3_CONTROL); + wa_write(wal, GEN7_L3_CHICKEN_MODE_REGISTER, GEN7_WA_L3_CHICKEN_MODE); + + /* WaForceL3Serialization:ivb */ + wa_write_clr(wal, GEN7_L3SQCREG4, L3SQ_URB_READ_CAM_MATCH_DISABLE); + + /* + * WaVSThreadDispatchOverride:ivb,vlv + * + * This actually overrides the dispatch + * mode for all thread types. + */ + wa_write_masked_or(wal, GEN7_FF_THREAD_MODE, + GEN7_FF_SCHED_MASK, + GEN7_FF_TS_SCHED_HW | + GEN7_FF_VS_SCHED_HW | + GEN7_FF_DS_SCHED_HW); + + if (0) { /* causes HiZ corruption on ivb:gt1 */ + /* enable HiZ Raw Stall Optimization */ + wa_masked_dis(wal, CACHE_MODE_0_GEN7, HIZ_RAW_STALL_OPT_DISABLE); + } + + /* WaDisable4x2SubspanOptimization:ivb */ + wa_masked_en(wal, CACHE_MODE_1, PIXEL_SUBSPAN_COLLECT_OPT_DISABLE); + + /* + * BSpec recommends 8x4 when MSAA is used, + * however in practice 16x4 seems fastest. + * + * Note that PS/WM thread counts depend on the WIZ hashing + * disable bit, which we don't touch here, but it's good + * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM). + */ + wa_add(wal, GEN7_GT_MODE, 0, + _MASKED_FIELD(GEN6_WIZ_HASHING_MASK, GEN6_WIZ_HASHING_16x4), + GEN6_WIZ_HASHING_16x4); +} + +static void +vlv_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal) +{ + /* WaDisableEarlyCull:vlv */ + wa_masked_en(wal, _3D_CHICKEN3, _3D_CHICKEN_SF_DISABLE_OBJEND_CULL); + + /* WaPsdDispatchEnable:vlv */ + /* WaDisablePSDDualDispatchEnable:vlv */ + wa_masked_en(wal, + GEN7_HALF_SLICE_CHICKEN1, + GEN7_MAX_PS_THREAD_DEP | + GEN7_PSD_SINGLE_PORT_DISPATCH_ENABLE); + + /* WaDisable_RenderCache_OperationalFlush:vlv */ + wa_masked_dis(wal, CACHE_MODE_0_GEN7, RC_OP_FLUSH_ENABLE); + + /* WaForceL3Serialization:vlv */ + wa_write_clr(wal, GEN7_L3SQCREG4, L3SQ_URB_READ_CAM_MATCH_DISABLE); + + /* + * WaVSThreadDispatchOverride:ivb,vlv + * + * This actually overrides the dispatch + * mode for all thread types. + */ + wa_write_masked_or(wal, + GEN7_FF_THREAD_MODE, + GEN7_FF_SCHED_MASK, + GEN7_FF_TS_SCHED_HW | + GEN7_FF_VS_SCHED_HW | + GEN7_FF_DS_SCHED_HW); + + /* + * BSpec says this must be set, even though + * WaDisable4x2SubspanOptimization isn't listed for VLV. + */ + wa_masked_en(wal, CACHE_MODE_1, PIXEL_SUBSPAN_COLLECT_OPT_DISABLE); + + /* + * BSpec recommends 8x4 when MSAA is used, + * however in practice 16x4 seems fastest. + * + * Note that PS/WM thread counts depend on the WIZ hashing + * disable bit, which we don't touch here, but it's good + * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM). + */ + wa_add(wal, GEN7_GT_MODE, 0, + _MASKED_FIELD(GEN6_WIZ_HASHING_MASK, GEN6_WIZ_HASHING_16x4), + GEN6_WIZ_HASHING_16x4); + + /* + * WaIncreaseL3CreditsForVLVB0:vlv + * This is the hardware default actually. + */ + wa_write(wal, GEN7_L3SQCREG1, VLV_B0_WA_L3SQCREG1_VALUE); +} + +static void +hsw_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal) +{ + /* L3 caching of data atomics doesn't work -- disable it. */ + wa_write(wal, HSW_SCRATCH1, HSW_SCRATCH1_L3_DATA_ATOMICS_DISABLE); + + wa_add(wal, + HSW_ROW_CHICKEN3, 0, + _MASKED_BIT_ENABLE(HSW_ROW_CHICKEN3_L3_GLOBAL_ATOMICS_DISABLE), + 0 /* XXX does this reg exist? */); + + /* WaVSRefCountFullforceMissDisable:hsw */ + wa_write_clr(wal, GEN7_FF_THREAD_MODE, GEN7_FF_VS_REF_CNT_FFME); + + wa_masked_dis(wal, + CACHE_MODE_0_GEN7, + /* WaDisable_RenderCache_OperationalFlush:hsw */ + RC_OP_FLUSH_ENABLE | + /* enable HiZ Raw Stall Optimization */ + HIZ_RAW_STALL_OPT_DISABLE); + + /* WaDisable4x2SubspanOptimization:hsw */ + wa_masked_en(wal, CACHE_MODE_1, PIXEL_SUBSPAN_COLLECT_OPT_DISABLE); + + /* + * BSpec recommends 8x4 when MSAA is used, + * however in practice 16x4 seems fastest. + * + * Note that PS/WM thread counts depend on the WIZ hashing + * disable bit, which we don't touch here, but it's good + * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM). + */ + wa_add(wal, GEN7_GT_MODE, 0, + _MASKED_FIELD(GEN6_WIZ_HASHING_MASK, GEN6_WIZ_HASHING_16x4), + GEN6_WIZ_HASHING_16x4); + + /* WaSampleCChickenBitEnable:hsw */ + wa_masked_en(wal, HALF_SLICE_CHICKEN3, HSW_SAMPLE_C_PERFORMANCE); +} + +static void gen9_gt_workarounds_init(struct drm_i915_private *i915, struct i915_wa_list *wal) { /* WaDisableKillLogic:bxt,skl,kbl */ @@ -963,6 +1190,20 @@ gt_init_workarounds(struct drm_i915_private *i915, struct i915_wa_list *wal) bxt_gt_workarounds_init(i915, wal); else if (IS_SKYLAKE(i915)) skl_gt_workarounds_init(i915, wal); + else if (IS_HASWELL(i915)) + hsw_gt_workarounds_init(i915, wal); + else if (IS_VALLEYVIEW(i915)) + vlv_gt_workarounds_init(i915, wal); + else if (IS_IVYBRIDGE(i915)) + ivb_gt_workarounds_init(i915, wal); + else if (IS_GEN(i915, 6)) + snb_gt_workarounds_init(i915, wal); + else if (IS_GEN(i915, 5)) + ilk_gt_workarounds_init(i915, wal); + else if (IS_G4X(i915)) + g4x_gt_workarounds_init(i915, wal); + else if (IS_GEN(i915, 4)) + gen4_gt_workarounds_init(i915, wal); else if (INTEL_GEN(i915) <= 8) return; else diff --git a/drivers/gpu/drm/i915/gt/selftest_hangcheck.c b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c index 2b2efff6e19d..4aa4cc917d8b 100644 --- a/drivers/gpu/drm/i915/gt/selftest_hangcheck.c +++ b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c @@ -310,22 +310,20 @@ static bool wait_until_running(struct hang *h, struct i915_request *rq) 1000)); } -static void engine_heartbeat_disable(struct intel_engine_cs *engine, - unsigned long *saved) +static void engine_heartbeat_disable(struct intel_engine_cs *engine) { - *saved = engine->props.heartbeat_interval_ms; engine->props.heartbeat_interval_ms = 0; intel_engine_pm_get(engine); intel_engine_park_heartbeat(engine); } -static void engine_heartbeat_enable(struct intel_engine_cs *engine, - unsigned long saved) +static void engine_heartbeat_enable(struct intel_engine_cs *engine) { intel_engine_pm_put(engine); - engine->props.heartbeat_interval_ms = saved; + engine->props.heartbeat_interval_ms = + engine->defaults.heartbeat_interval_ms; } static int igt_hang_sanitycheck(void *arg) @@ -473,7 +471,6 @@ static int igt_reset_nop_engine(void *arg) for_each_engine(engine, gt, id) { unsigned int reset_count, reset_engine_count, count; struct intel_context *ce; - unsigned long heartbeat; IGT_TIMEOUT(end_time); int err; @@ -485,7 +482,7 @@ static int igt_reset_nop_engine(void *arg) reset_engine_count = i915_reset_engine_count(global, engine); count = 0; - engine_heartbeat_disable(engine, &heartbeat); + engine_heartbeat_disable(engine); set_bit(I915_RESET_ENGINE + id, >->reset.flags); do { int i; @@ -529,7 +526,7 @@ static int igt_reset_nop_engine(void *arg) } } while (time_before(jiffies, end_time)); clear_bit(I915_RESET_ENGINE + id, >->reset.flags); - engine_heartbeat_enable(engine, heartbeat); + engine_heartbeat_enable(engine); pr_info("%s(%s): %d resets\n", __func__, engine->name, count); @@ -564,7 +561,6 @@ static int __igt_reset_engine(struct intel_gt *gt, bool active) for_each_engine(engine, gt, id) { unsigned int reset_count, reset_engine_count; - unsigned long heartbeat; IGT_TIMEOUT(end_time); if (active && !intel_engine_can_store_dword(engine)) @@ -580,7 +576,7 @@ static int __igt_reset_engine(struct intel_gt *gt, bool active) reset_count = i915_reset_count(global); reset_engine_count = i915_reset_engine_count(global, engine); - engine_heartbeat_disable(engine, &heartbeat); + engine_heartbeat_disable(engine); set_bit(I915_RESET_ENGINE + id, >->reset.flags); do { if (active) { @@ -632,7 +628,7 @@ static int __igt_reset_engine(struct intel_gt *gt, bool active) } } while (time_before(jiffies, end_time)); clear_bit(I915_RESET_ENGINE + id, >->reset.flags); - engine_heartbeat_enable(engine, heartbeat); + engine_heartbeat_enable(engine); if (err) break; @@ -789,7 +785,6 @@ static int __igt_reset_engines(struct intel_gt *gt, struct active_engine threads[I915_NUM_ENGINES] = {}; unsigned long device = i915_reset_count(global); unsigned long count = 0, reported; - unsigned long heartbeat; IGT_TIMEOUT(end_time); if (flags & TEST_ACTIVE && @@ -832,7 +827,7 @@ static int __igt_reset_engines(struct intel_gt *gt, yield(); /* start all threads before we begin */ - engine_heartbeat_disable(engine, &heartbeat); + engine_heartbeat_disable(engine); set_bit(I915_RESET_ENGINE + id, >->reset.flags); do { struct i915_request *rq = NULL; @@ -906,7 +901,7 @@ static int __igt_reset_engines(struct intel_gt *gt, } } while (time_before(jiffies, end_time)); clear_bit(I915_RESET_ENGINE + id, >->reset.flags); - engine_heartbeat_enable(engine, heartbeat); + engine_heartbeat_enable(engine); pr_info("i915_reset_engine(%s:%s): %lu resets\n", engine->name, test_name, count); diff --git a/drivers/gpu/drm/i915/gt/selftest_lrc.c b/drivers/gpu/drm/i915/gt/selftest_lrc.c index 824f99c4cc7c..924bc01ef526 100644 --- a/drivers/gpu/drm/i915/gt/selftest_lrc.c +++ b/drivers/gpu/drm/i915/gt/selftest_lrc.c @@ -51,22 +51,20 @@ static struct i915_vma *create_scratch(struct intel_gt *gt) return vma; } -static void engine_heartbeat_disable(struct intel_engine_cs *engine, - unsigned long *saved) +static void engine_heartbeat_disable(struct intel_engine_cs *engine) { - *saved = engine->props.heartbeat_interval_ms; engine->props.heartbeat_interval_ms = 0; intel_engine_pm_get(engine); intel_engine_park_heartbeat(engine); } -static void engine_heartbeat_enable(struct intel_engine_cs *engine, - unsigned long saved) +static void engine_heartbeat_enable(struct intel_engine_cs *engine) { intel_engine_pm_put(engine); - engine->props.heartbeat_interval_ms = saved; + engine->props.heartbeat_interval_ms = + engine->defaults.heartbeat_interval_ms; } static bool is_active(struct i915_request *rq) @@ -224,7 +222,6 @@ static int live_unlite_restore(struct intel_gt *gt, int prio) struct intel_context *ce[2] = {}; struct i915_request *rq[2]; struct igt_live_test t; - unsigned long saved; int n; if (prio && !intel_engine_has_preemption(engine)) @@ -237,7 +234,7 @@ static int live_unlite_restore(struct intel_gt *gt, int prio) err = -EIO; break; } - engine_heartbeat_disable(engine, &saved); + engine_heartbeat_disable(engine); for (n = 0; n < ARRAY_SIZE(ce); n++) { struct intel_context *tmp; @@ -345,7 +342,7 @@ err_ce: intel_context_put(ce[n]); } - engine_heartbeat_enable(engine, saved); + engine_heartbeat_enable(engine); if (igt_live_test_end(&t)) err = -EIO; if (err) @@ -466,7 +463,6 @@ static int live_hold_reset(void *arg) for_each_engine(engine, gt, id) { struct intel_context *ce; - unsigned long heartbeat; struct i915_request *rq; ce = intel_context_create(engine); @@ -475,7 +471,7 @@ static int live_hold_reset(void *arg) break; } - engine_heartbeat_disable(engine, &heartbeat); + engine_heartbeat_disable(engine); rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK); if (IS_ERR(rq)) { @@ -535,7 +531,7 @@ static int live_hold_reset(void *arg) i915_request_put(rq); out: - engine_heartbeat_enable(engine, heartbeat); + engine_heartbeat_enable(engine); intel_context_put(ce); if (err) break; @@ -580,10 +576,9 @@ static int live_error_interrupt(void *arg) for_each_engine(engine, gt, id) { const struct error_phase *p; - unsigned long heartbeat; int err = 0; - engine_heartbeat_disable(engine, &heartbeat); + engine_heartbeat_disable(engine); for (p = phases; p->error[0] != GOOD; p++) { struct i915_request *client[ARRAY_SIZE(phases->error)]; @@ -682,7 +677,7 @@ out: } } - engine_heartbeat_enable(engine, heartbeat); + engine_heartbeat_enable(engine); if (err) { intel_gt_set_wedged(gt); return err; @@ -828,7 +823,7 @@ slice_semaphore_queue(struct intel_engine_cs *outer, } } - err = release_queue(outer, vma, n, INT_MAX); + err = release_queue(outer, vma, n, I915_PRIORITY_BARRIER); if (err) goto out; @@ -895,16 +890,14 @@ static int live_timeslice_preempt(void *arg) enum intel_engine_id id; for_each_engine(engine, gt, id) { - unsigned long saved; - if (!intel_engine_has_preemption(engine)) continue; memset(vaddr, 0, PAGE_SIZE); - engine_heartbeat_disable(engine, &saved); + engine_heartbeat_disable(engine); err = slice_semaphore_queue(engine, vma, count); - engine_heartbeat_enable(engine, saved); + engine_heartbeat_enable(engine); if (err) goto err_pin; @@ -1009,7 +1002,6 @@ static int live_timeslice_rewind(void *arg) enum { X = 1, Z, Y }; struct i915_request *rq[3] = {}; struct intel_context *ce; - unsigned long heartbeat; unsigned long timeslice; int i, err = 0; u32 *slot; @@ -1028,7 +1020,7 @@ static int live_timeslice_rewind(void *arg) * Expect execution/evaluation order XZY */ - engine_heartbeat_disable(engine, &heartbeat); + engine_heartbeat_disable(engine); timeslice = xchg(&engine->props.timeslice_duration_ms, 1); slot = memset32(engine->status_page.addr + 1000, 0, 4); @@ -1122,7 +1114,7 @@ err: wmb(); engine->props.timeslice_duration_ms = timeslice; - engine_heartbeat_enable(engine, heartbeat); + engine_heartbeat_enable(engine); for (i = 0; i < 3; i++) i915_request_put(rq[i]); if (igt_flush_test(gt->i915)) @@ -1202,12 +1194,11 @@ static int live_timeslice_queue(void *arg) .priority = I915_USER_PRIORITY(I915_PRIORITY_MAX), }; struct i915_request *rq, *nop; - unsigned long saved; if (!intel_engine_has_preemption(engine)) continue; - engine_heartbeat_disable(engine, &saved); + engine_heartbeat_disable(engine); memset(vaddr, 0, PAGE_SIZE); /* ELSP[0]: semaphore wait */ @@ -1284,7 +1275,7 @@ static int live_timeslice_queue(void *arg) err_rq: i915_request_put(rq); err_heartbeat: - engine_heartbeat_enable(engine, saved); + engine_heartbeat_enable(engine); if (err) break; } @@ -1298,6 +1289,121 @@ err_obj: return err; } +static int live_timeslice_nopreempt(void *arg) +{ + struct intel_gt *gt = arg; + struct intel_engine_cs *engine; + enum intel_engine_id id; + struct igt_spinner spin; + int err = 0; + + /* + * We should not timeslice into a request that is marked with + * I915_REQUEST_NOPREEMPT. + */ + if (!IS_ACTIVE(CONFIG_DRM_I915_TIMESLICE_DURATION)) + return 0; + + if (igt_spinner_init(&spin, gt)) + return -ENOMEM; + + for_each_engine(engine, gt, id) { + struct intel_context *ce; + struct i915_request *rq; + unsigned long timeslice; + + if (!intel_engine_has_preemption(engine)) + continue; + + ce = intel_context_create(engine); + if (IS_ERR(ce)) { + err = PTR_ERR(ce); + break; + } + + engine_heartbeat_disable(engine); + timeslice = xchg(&engine->props.timeslice_duration_ms, 1); + + /* Create an unpreemptible spinner */ + + rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK); + intel_context_put(ce); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto out_heartbeat; + } + + i915_request_get(rq); + i915_request_add(rq); + + if (!igt_wait_for_spinner(&spin, rq)) { + i915_request_put(rq); + err = -ETIME; + goto out_spin; + } + + set_bit(I915_FENCE_FLAG_NOPREEMPT, &rq->fence.flags); + i915_request_put(rq); + + /* Followed by a maximum priority barrier (heartbeat) */ + + ce = intel_context_create(engine); + if (IS_ERR(ce)) { + err = PTR_ERR(rq); + goto out_spin; + } + + rq = intel_context_create_request(ce); + intel_context_put(ce); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto out_spin; + } + + rq->sched.attr.priority = I915_PRIORITY_BARRIER; + i915_request_get(rq); + i915_request_add(rq); + + /* + * Wait until the barrier is in ELSP, and we know timeslicing + * will have been activated. + */ + if (wait_for_submit(engine, rq, HZ / 2)) { + i915_request_put(rq); + err = -ETIME; + goto out_spin; + } + + /* + * Since the ELSP[0] request is unpreemptible, it should not + * allow the maximum priority barrier through. Wait long + * enough to see if it is timesliced in by mistake. + */ + if (i915_request_wait(rq, 0, timeslice_threshold(engine)) >= 0) { + pr_err("%s: I915_PRIORITY_BARRIER request completed, bypassing no-preempt request\n", + engine->name); + err = -EINVAL; + } + i915_request_put(rq); + +out_spin: + igt_spinner_end(&spin); +out_heartbeat: + xchg(&engine->props.timeslice_duration_ms, timeslice); + engine_heartbeat_enable(engine); + if (err) + break; + + if (igt_flush_test(gt->i915)) { + err = -EIO; + break; + } + } + + igt_spinner_fini(&spin); + return err; +} + static int live_busywait_preempt(void *arg) { struct intel_gt *gt = arg; @@ -4153,7 +4259,6 @@ static int reset_virtual_engine(struct intel_gt *gt, { struct intel_engine_cs *engine; struct intel_context *ve; - unsigned long *heartbeat; struct igt_spinner spin; struct i915_request *rq; unsigned int n; @@ -4165,15 +4270,9 @@ static int reset_virtual_engine(struct intel_gt *gt, * descendents are not executed while the capture is in progress. */ - heartbeat = kmalloc_array(nsibling, sizeof(*heartbeat), GFP_KERNEL); - if (!heartbeat) + if (igt_spinner_init(&spin, gt)) return -ENOMEM; - if (igt_spinner_init(&spin, gt)) { - err = -ENOMEM; - goto out_free; - } - ve = intel_execlists_create_virtual(siblings, nsibling); if (IS_ERR(ve)) { err = PTR_ERR(ve); @@ -4181,7 +4280,7 @@ static int reset_virtual_engine(struct intel_gt *gt, } for (n = 0; n < nsibling; n++) - engine_heartbeat_disable(siblings[n], &heartbeat[n]); + engine_heartbeat_disable(siblings[n]); rq = igt_spinner_create_request(&spin, ve, MI_ARB_CHECK); if (IS_ERR(rq)) { @@ -4252,13 +4351,11 @@ out_rq: i915_request_put(rq); out_heartbeat: for (n = 0; n < nsibling; n++) - engine_heartbeat_enable(siblings[n], heartbeat[n]); + engine_heartbeat_enable(siblings[n]); intel_context_put(ve); out_spin: igt_spinner_fini(&spin); -out_free: - kfree(heartbeat); return err; } @@ -4314,6 +4411,7 @@ int intel_execlists_live_selftests(struct drm_i915_private *i915) SUBTEST(live_timeslice_preempt), SUBTEST(live_timeslice_rewind), SUBTEST(live_timeslice_queue), + SUBTEST(live_timeslice_nopreempt), SUBTEST(live_busywait_preempt), SUBTEST(live_preempt), SUBTEST(live_late_preempt), @@ -4932,9 +5030,7 @@ static int live_lrc_gpr(void *arg) return PTR_ERR(scratch); for_each_engine(engine, gt, id) { - unsigned long heartbeat; - - engine_heartbeat_disable(engine, &heartbeat); + engine_heartbeat_disable(engine); err = __live_lrc_gpr(engine, scratch, false); if (err) @@ -4945,7 +5041,7 @@ static int live_lrc_gpr(void *arg) goto err; err: - engine_heartbeat_enable(engine, heartbeat); + engine_heartbeat_enable(engine); if (igt_flush_test(gt->i915)) err = -EIO; if (err) @@ -5092,10 +5188,9 @@ static int live_lrc_timestamp(void *arg) */ for_each_engine(data.engine, gt, id) { - unsigned long heartbeat; int i, err = 0; - engine_heartbeat_disable(data.engine, &heartbeat); + engine_heartbeat_disable(data.engine); for (i = 0; i < ARRAY_SIZE(data.ce); i++) { struct intel_context *tmp; @@ -5128,7 +5223,7 @@ static int live_lrc_timestamp(void *arg) } err: - engine_heartbeat_enable(data.engine, heartbeat); + engine_heartbeat_enable(data.engine); for (i = 0; i < ARRAY_SIZE(data.ce); i++) { if (!data.ce[i]) break; diff --git a/drivers/gpu/drm/i915/gt/selftest_mocs.c b/drivers/gpu/drm/i915/gt/selftest_mocs.c index 8831ffee2061..63f87d8608c3 100644 --- a/drivers/gpu/drm/i915/gt/selftest_mocs.c +++ b/drivers/gpu/drm/i915/gt/selftest_mocs.c @@ -18,6 +18,20 @@ struct live_mocs { void *vaddr; }; +static struct intel_context *mocs_context_create(struct intel_engine_cs *engine) +{ + struct intel_context *ce; + + ce = intel_context_create(engine); + if (IS_ERR(ce)) + return ce; + + /* We build large requests to read the registers from the ring */ + ce->ring = __intel_context_ring_size(SZ_16K); + + return ce; +} + static int request_add_sync(struct i915_request *rq, int err) { i915_request_get(rq); @@ -301,7 +315,7 @@ static int live_mocs_clean(void *arg) for_each_engine(engine, gt, id) { struct intel_context *ce; - ce = intel_context_create(engine); + ce = mocs_context_create(engine); if (IS_ERR(ce)) { err = PTR_ERR(ce); break; @@ -395,7 +409,7 @@ static int live_mocs_reset(void *arg) for_each_engine(engine, gt, id) { struct intel_context *ce; - ce = intel_context_create(engine); + ce = mocs_context_create(engine); if (IS_ERR(ce)) { err = PTR_ERR(ce); break; diff --git a/drivers/gpu/drm/i915/gt/selftest_ring.c b/drivers/gpu/drm/i915/gt/selftest_ring.c new file mode 100644 index 000000000000..2a8c534dc125 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/selftest_ring.c @@ -0,0 +1,110 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright © 2020 Intel Corporation + */ + +static struct intel_ring *mock_ring(unsigned long sz) +{ + struct intel_ring *ring; + + ring = kzalloc(sizeof(*ring) + sz, GFP_KERNEL); + if (!ring) + return NULL; + + kref_init(&ring->ref); + ring->size = sz; + ring->wrap = BITS_PER_TYPE(ring->size) - ilog2(sz); + ring->effective_size = sz; + ring->vaddr = (void *)(ring + 1); + atomic_set(&ring->pin_count, 1); + + intel_ring_update_space(ring); + + return ring; +} + +static void mock_ring_free(struct intel_ring *ring) +{ + kfree(ring); +} + +static int check_ring_direction(struct intel_ring *ring, + u32 next, u32 prev, + int expected) +{ + int result; + + result = intel_ring_direction(ring, next, prev); + if (result < 0) + result = -1; + else if (result > 0) + result = 1; + + if (result != expected) { + pr_err("intel_ring_direction(%u, %u):%d != %d\n", + next, prev, result, expected); + return -EINVAL; + } + + return 0; +} + +static int check_ring_step(struct intel_ring *ring, u32 x, u32 step) +{ + u32 prev = x, next = intel_ring_wrap(ring, x + step); + int err = 0; + + err |= check_ring_direction(ring, next, next, 0); + err |= check_ring_direction(ring, prev, prev, 0); + err |= check_ring_direction(ring, next, prev, 1); + err |= check_ring_direction(ring, prev, next, -1); + + return err; +} + +static int check_ring_offset(struct intel_ring *ring, u32 x, u32 step) +{ + int err = 0; + + err |= check_ring_step(ring, x, step); + err |= check_ring_step(ring, intel_ring_wrap(ring, x + 1), step); + err |= check_ring_step(ring, intel_ring_wrap(ring, x - 1), step); + + return err; +} + +static int igt_ring_direction(void *dummy) +{ + struct intel_ring *ring; + unsigned int half = 2048; + int step, err = 0; + + ring = mock_ring(2 * half); + if (!ring) + return -ENOMEM; + + GEM_BUG_ON(ring->size != 2 * half); + + /* Precision of wrap detection is limited to ring->size / 2 */ + for (step = 1; step < half; step <<= 1) { + err |= check_ring_offset(ring, 0, step); + err |= check_ring_offset(ring, half, step); + } + err |= check_ring_step(ring, 0, half - 64); + + /* And check unwrapped handling for good measure */ + err |= check_ring_offset(ring, 0, 2 * half + 64); + err |= check_ring_offset(ring, 3 * half, 1); + + mock_ring_free(ring); + return err; +} + +int intel_ring_mock_selftests(void) +{ + static const struct i915_subtest tests[] = { + SUBTEST(igt_ring_direction), + }; + + return i915_subtests(tests, NULL); +} diff --git a/drivers/gpu/drm/i915/gt/selftest_rps.c b/drivers/gpu/drm/i915/gt/selftest_rps.c index 6275d69aa9cc..c91981e75ebf 100644 --- a/drivers/gpu/drm/i915/gt/selftest_rps.c +++ b/drivers/gpu/drm/i915/gt/selftest_rps.c @@ -20,24 +20,20 @@ /* Try to isolate the impact of cstates from determing frequency response */ #define CPU_LATENCY 0 /* -1 to disable pm_qos, 0 to disable cstates */ -static unsigned long engine_heartbeat_disable(struct intel_engine_cs *engine) +static void engine_heartbeat_disable(struct intel_engine_cs *engine) { - unsigned long old; - - old = fetch_and_zero(&engine->props.heartbeat_interval_ms); + engine->props.heartbeat_interval_ms = 0; intel_engine_pm_get(engine); intel_engine_park_heartbeat(engine); - - return old; } -static void engine_heartbeat_enable(struct intel_engine_cs *engine, - unsigned long saved) +static void engine_heartbeat_enable(struct intel_engine_cs *engine) { intel_engine_pm_put(engine); - engine->props.heartbeat_interval_ms = saved; + engine->props.heartbeat_interval_ms = + engine->defaults.heartbeat_interval_ms; } static void dummy_rps_work(struct work_struct *wrk) @@ -48,9 +44,9 @@ static int cmp_u64(const void *A, const void *B) { const u64 *a = A, *b = B; - if (a < b) + if (*a < *b) return -1; - else if (a > b) + else if (*a > *b) return 1; else return 0; @@ -60,9 +56,9 @@ static int cmp_u32(const void *A, const void *B) { const u32 *a = A, *b = B; - if (a < b) + if (*a < *b) return -1; - else if (a > b) + else if (*a > *b) return 1; else return 0; @@ -246,7 +242,6 @@ int live_rps_clock_interval(void *arg) intel_gt_check_clock_frequency(gt); for_each_engine(engine, gt, id) { - unsigned long saved_heartbeat; struct i915_request *rq; u32 cycles; u64 dt; @@ -254,13 +249,13 @@ int live_rps_clock_interval(void *arg) if (!intel_engine_can_store_dword(engine)) continue; - saved_heartbeat = engine_heartbeat_disable(engine); + engine_heartbeat_disable(engine); rq = igt_spinner_create_request(&spin, engine->kernel_context, MI_NOOP); if (IS_ERR(rq)) { - engine_heartbeat_enable(engine, saved_heartbeat); + engine_heartbeat_enable(engine); err = PTR_ERR(rq); break; } @@ -271,7 +266,7 @@ int live_rps_clock_interval(void *arg) pr_err("%s: RPS spinner did not start\n", engine->name); igt_spinner_end(&spin); - engine_heartbeat_enable(engine, saved_heartbeat); + engine_heartbeat_enable(engine); intel_gt_set_wedged(engine->gt); err = -EIO; break; @@ -327,7 +322,7 @@ int live_rps_clock_interval(void *arg) intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL); igt_spinner_end(&spin); - engine_heartbeat_enable(engine, saved_heartbeat); + engine_heartbeat_enable(engine); if (err == 0) { u64 time = intel_gt_pm_interval_to_ns(gt, cycles); @@ -405,7 +400,6 @@ int live_rps_control(void *arg) intel_gt_pm_get(gt); for_each_engine(engine, gt, id) { - unsigned long saved_heartbeat; struct i915_request *rq; ktime_t min_dt, max_dt; int f, limit; @@ -414,7 +408,7 @@ int live_rps_control(void *arg) if (!intel_engine_can_store_dword(engine)) continue; - saved_heartbeat = engine_heartbeat_disable(engine); + engine_heartbeat_disable(engine); rq = igt_spinner_create_request(&spin, engine->kernel_context, @@ -430,7 +424,7 @@ int live_rps_control(void *arg) pr_err("%s: RPS spinner did not start\n", engine->name); igt_spinner_end(&spin); - engine_heartbeat_enable(engine, saved_heartbeat); + engine_heartbeat_enable(engine); intel_gt_set_wedged(engine->gt); err = -EIO; break; @@ -440,7 +434,7 @@ int live_rps_control(void *arg) pr_err("%s: could not set minimum frequency [%x], only %x!\n", engine->name, rps->min_freq, read_cagf(rps)); igt_spinner_end(&spin); - engine_heartbeat_enable(engine, saved_heartbeat); + engine_heartbeat_enable(engine); show_pstate_limits(rps); err = -EINVAL; break; @@ -457,7 +451,7 @@ int live_rps_control(void *arg) pr_err("%s: could not restore minimum frequency [%x], only %x!\n", engine->name, rps->min_freq, read_cagf(rps)); igt_spinner_end(&spin); - engine_heartbeat_enable(engine, saved_heartbeat); + engine_heartbeat_enable(engine); show_pstate_limits(rps); err = -EINVAL; break; @@ -472,7 +466,7 @@ int live_rps_control(void *arg) min_dt = ktime_sub(ktime_get(), min_dt); igt_spinner_end(&spin); - engine_heartbeat_enable(engine, saved_heartbeat); + engine_heartbeat_enable(engine); pr_info("%s: range:[%x:%uMHz, %x:%uMHz] limit:[%x:%uMHz], %x:%x response %lluns:%lluns\n", engine->name, @@ -635,7 +629,6 @@ int live_rps_frequency_cs(void *arg) rps->work.func = dummy_rps_work; for_each_engine(engine, gt, id) { - unsigned long saved_heartbeat; struct i915_request *rq; struct i915_vma *vma; u32 *cancel, *cntr; @@ -644,14 +637,14 @@ int live_rps_frequency_cs(void *arg) int freq; } min, max; - saved_heartbeat = engine_heartbeat_disable(engine); + engine_heartbeat_disable(engine); vma = create_spin_counter(engine, engine->kernel_context->vm, false, &cancel, &cntr); if (IS_ERR(vma)) { err = PTR_ERR(vma); - engine_heartbeat_enable(engine, saved_heartbeat); + engine_heartbeat_enable(engine); break; } @@ -732,7 +725,7 @@ err_vma: i915_vma_unpin(vma); i915_vma_put(vma); - engine_heartbeat_enable(engine, saved_heartbeat); + engine_heartbeat_enable(engine); if (igt_flush_test(gt->i915)) err = -EIO; if (err) @@ -778,7 +771,6 @@ int live_rps_frequency_srm(void *arg) rps->work.func = dummy_rps_work; for_each_engine(engine, gt, id) { - unsigned long saved_heartbeat; struct i915_request *rq; struct i915_vma *vma; u32 *cancel, *cntr; @@ -787,14 +779,14 @@ int live_rps_frequency_srm(void *arg) int freq; } min, max; - saved_heartbeat = engine_heartbeat_disable(engine); + engine_heartbeat_disable(engine); vma = create_spin_counter(engine, engine->kernel_context->vm, true, &cancel, &cntr); if (IS_ERR(vma)) { err = PTR_ERR(vma); - engine_heartbeat_enable(engine, saved_heartbeat); + engine_heartbeat_enable(engine); break; } @@ -874,7 +866,7 @@ err_vma: i915_vma_unpin(vma); i915_vma_put(vma); - engine_heartbeat_enable(engine, saved_heartbeat); + engine_heartbeat_enable(engine); if (igt_flush_test(gt->i915)) err = -EIO; if (err) @@ -1066,16 +1058,14 @@ int live_rps_interrupt(void *arg) for_each_engine(engine, gt, id) { /* Keep the engine busy with a spinner; expect an UP! */ if (pm_events & GEN6_PM_RP_UP_THRESHOLD) { - unsigned long saved_heartbeat; - intel_gt_pm_wait_for_idle(engine->gt); GEM_BUG_ON(intel_rps_is_active(rps)); - saved_heartbeat = engine_heartbeat_disable(engine); + engine_heartbeat_disable(engine); err = __rps_up_interrupt(rps, engine, &spin); - engine_heartbeat_enable(engine, saved_heartbeat); + engine_heartbeat_enable(engine); if (err) goto out; @@ -1084,15 +1074,13 @@ int live_rps_interrupt(void *arg) /* Keep the engine awake but idle and check for DOWN */ if (pm_events & GEN6_PM_RP_DOWN_THRESHOLD) { - unsigned long saved_heartbeat; - - saved_heartbeat = engine_heartbeat_disable(engine); + engine_heartbeat_disable(engine); intel_rc6_disable(>->rc6); err = __rps_down_interrupt(rps, engine); intel_rc6_enable(>->rc6); - engine_heartbeat_enable(engine, saved_heartbeat); + engine_heartbeat_enable(engine); if (err) goto out; } @@ -1168,7 +1156,6 @@ int live_rps_power(void *arg) rps->work.func = dummy_rps_work; for_each_engine(engine, gt, id) { - unsigned long saved_heartbeat; struct i915_request *rq; struct { u64 power; @@ -1178,13 +1165,13 @@ int live_rps_power(void *arg) if (!intel_engine_can_store_dword(engine)) continue; - saved_heartbeat = engine_heartbeat_disable(engine); + engine_heartbeat_disable(engine); rq = igt_spinner_create_request(&spin, engine->kernel_context, MI_NOOP); if (IS_ERR(rq)) { - engine_heartbeat_enable(engine, saved_heartbeat); + engine_heartbeat_enable(engine); err = PTR_ERR(rq); break; } @@ -1195,7 +1182,7 @@ int live_rps_power(void *arg) pr_err("%s: RPS spinner did not start\n", engine->name); igt_spinner_end(&spin); - engine_heartbeat_enable(engine, saved_heartbeat); + engine_heartbeat_enable(engine); intel_gt_set_wedged(engine->gt); err = -EIO; break; @@ -1208,7 +1195,7 @@ int live_rps_power(void *arg) min.power = measure_power_at(rps, &min.freq); igt_spinner_end(&spin); - engine_heartbeat_enable(engine, saved_heartbeat); + engine_heartbeat_enable(engine); pr_info("%s: min:%llumW @ %uMHz, max:%llumW @ %uMHz\n", engine->name, diff --git a/drivers/gpu/drm/i915/gt/selftest_timeline.c b/drivers/gpu/drm/i915/gt/selftest_timeline.c index c2578a0f2f14..ef1c35073dc0 100644 --- a/drivers/gpu/drm/i915/gt/selftest_timeline.c +++ b/drivers/gpu/drm/i915/gt/selftest_timeline.c @@ -751,22 +751,20 @@ out_free: return err; } -static void engine_heartbeat_disable(struct intel_engine_cs *engine, - unsigned long *saved) +static void engine_heartbeat_disable(struct intel_engine_cs *engine) { - *saved = engine->props.heartbeat_interval_ms; engine->props.heartbeat_interval_ms = 0; intel_engine_pm_get(engine); intel_engine_park_heartbeat(engine); } -static void engine_heartbeat_enable(struct intel_engine_cs *engine, - unsigned long saved) +static void engine_heartbeat_enable(struct intel_engine_cs *engine) { intel_engine_pm_put(engine); - engine->props.heartbeat_interval_ms = saved; + engine->props.heartbeat_interval_ms = + engine->defaults.heartbeat_interval_ms; } static int live_hwsp_rollover_kernel(void *arg) @@ -785,10 +783,9 @@ static int live_hwsp_rollover_kernel(void *arg) struct intel_context *ce = engine->kernel_context; struct intel_timeline *tl = ce->timeline; struct i915_request *rq[3] = {}; - unsigned long heartbeat; int i; - engine_heartbeat_disable(engine, &heartbeat); + engine_heartbeat_disable(engine); if (intel_gt_wait_for_idle(gt, HZ / 2)) { err = -EIO; goto out; @@ -839,7 +836,7 @@ static int live_hwsp_rollover_kernel(void *arg) out: for (i = 0; i < ARRAY_SIZE(rq); i++) i915_request_put(rq[i]); - engine_heartbeat_enable(engine, heartbeat); + engine_heartbeat_enable(engine); if (err) break; } diff --git a/drivers/gpu/drm/i915/gt/selftest_workarounds.c b/drivers/gpu/drm/i915/gt/selftest_workarounds.c index 5ed323254ee1..32785463ec9e 100644 --- a/drivers/gpu/drm/i915/gt/selftest_workarounds.c +++ b/drivers/gpu/drm/i915/gt/selftest_workarounds.c @@ -623,6 +623,8 @@ err_request: err = -EINVAL; goto out_unpin; } + } else { + rsvd = 0; } expect = results[0]; diff --git a/drivers/gpu/drm/i915/gt/shaders/README b/drivers/gpu/drm/i915/gt/shaders/README new file mode 100644 index 000000000000..e7e96d7073c7 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/shaders/README @@ -0,0 +1,46 @@ +ASM sources for auto generated shaders +====================================== + +The i915/gt/hsw_clear_kernel.c and i915/gt/ivb_clear_kernel.c files contain +pre-compiled batch chunks that will clear any residual render cache during +context switch. + +They are generated from their respective platform ASM files present on +i915/gt/shaders/clear_kernel directory. + +The generated .c files should never be modified directly. Instead, any modification +needs to be done on the on their respective ASM files and build instructions below +needes to be followed. + +Building +======== + +Environment +----------- + +IGT GPU tool scripts and the Mesa's i965 instruction assembler tool are used +on building. + +Please make sure your Mesa tool is compiled with "-Dtools=intel" and +"-Ddri-drivers=i965", and run this script from IGT source root directory" + +The instructions bellow assume: + * IGT gpu tools source code is located on your home directory (~) as ~/igt + * Mesa source code is located on your home directory (~) as ~/mesa + and built under the ~/mesa/build directory + * Linux kernel source code is under your home directory (~) as ~/linux + +Instructions +------------ + +~ $ cp ~/linux/drivers/gpu/drm/i915/gt/shaders/clear_kernel/ivb.asm \ + ~/igt/lib/i915/shaders/clear_kernel/ivb.asm +~ $ cd ~/igt +igt $ ./scripts/generate_clear_kernel.sh -g ivb \ + -m ~/mesa/build/src/intel/tools/i965_asm + +~ $ cp ~/linux/drivers/gpu/drm/i915/gt/shaders/clear_kernel/hsw.asm \ + ~/igt/lib/i915/shaders/clear_kernel/hsw.asm +~ $ cd ~/igt +igt $ ./scripts/generate_clear_kernel.sh -g hsw \ + -m ~/mesa/build/src/intel/tools/i965_asm
\ No newline at end of file diff --git a/drivers/gpu/drm/i915/gt/shaders/clear_kernel/hsw.asm b/drivers/gpu/drm/i915/gt/shaders/clear_kernel/hsw.asm new file mode 100644 index 000000000000..5fdf384bb621 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/shaders/clear_kernel/hsw.asm @@ -0,0 +1,119 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2020 Intel Corporation + */ + +/* + * Kernel for PAVP buffer clear. + * + * 1. Clear all 64 GRF registers assigned to the kernel with designated value; + * 2. Write 32x16 block of all "0" to render target buffer which indirectly clears + * 512 bytes of Render Cache. + */ + +/* Store designated "clear GRF" value */ +mov(1) f0.1<1>UW g1.2<0,1,0>UW { align1 1N }; + +/** + * Curbe Format + * + * DW 1.0 - Block Offset to write Render Cache + * DW 1.1 [15:0] - Clear Word + * DW 1.2 - Delay iterations + * DW 1.3 - Enable Instrumentation (only for debug) + * DW 1.4 - Rsvd (intended for context ID) + * DW 1.5 - [31:16]:SliceCount, [15:0]:SubSlicePerSliceCount + * DW 1.6 - Rsvd MBZ (intended for Enable Wait on Total Thread Count) + * DW 1.7 - Rsvd MBZ (inteded for Total Thread Count) + * + * Binding Table + * + * BTI 0: 2D Surface to help clear L3 (Render/Data Cache) + * BTI 1: Wait/Instrumentation Buffer + * Size : (SliceCount * SubSliceCount * 16 EUs/SubSlice) rows * (16 threads/EU) cols (Format R32_UINT) + * Expected to be initialized to 0 by driver/another kernel + * Layout: + * RowN: Histogram for EU-N: (SliceID*SubSlicePerSliceCount + SSID)*16 + EUID [assume max 16 EUs / SS] + * Col-k[DW-k]: Threads Executed on ThreadID-k for EU-N + */ +add(1) g1.2<1>UD g1.2<0,1,0>UD 0x00000001UD { align1 1N }; /* Loop count to delay kernel: Init to (g1.2 + 1) */ +cmp.z.f0.0(1) null<1>UD g1.3<0,1,0>UD 0x00000000UD { align1 1N }; +(+f0.0) jmpi(1) 352D { align1 WE_all 1N }; + +/** + * State Register has info on where this thread is running + * IVB: sr0.0 :: [15:13]: MBZ, 12: HSID (Half-Slice ID), [11:8]EUID, [2:0] ThreadSlotID + * HSW: sr0.0 :: 15: MBZ, [14:13]: SliceID, 12: HSID (Half-Slice ID), [11:8]EUID, [2:0] ThreadSlotID + */ +mov(8) g3<1>UD 0x00000000UD { align1 1Q }; +shr(1) g3<1>D sr0<0,1,0>D 12D { align1 1N }; +and(1) g3<1>D g3<0,1,0>D 1D { align1 1N }; /* g3 has HSID */ +shr(1) g3.1<1>D sr0<0,1,0>D 13D { align1 1N }; +and(1) g3.1<1>D g3.1<0,1,0>D 3D { align1 1N }; /* g3.1 has sliceID */ +mul(1) g3.5<1>D g3.1<0,1,0>D g1.10<0,1,0>UW { align1 1N }; +add(1) g3<1>D g3<0,1,0>D g3.5<0,1,0>D { align1 1N }; /* g3 = sliceID * SubSlicePerSliceCount + HSID */ +shr(1) g3.2<1>D sr0<0,1,0>D 8D { align1 1N }; +and(1) g3.2<1>D g3.2<0,1,0>D 15D { align1 1N }; /* g3.2 = EUID */ +mul(1) g3.4<1>D g3<0,1,0>D 16D { align1 1N }; +add(1) g3.2<1>D g3.2<0,1,0>D g3.4<0,1,0>D { align1 1N }; /* g3.2 now points to EU row number (Y-pixel = V address ) in instrumentation surf */ + +mov(8) g5<1>UD 0x00000000UD { align1 1Q }; +and(1) g3.3<1>D sr0<0,1,0>D 7D { align1 1N }; +mul(1) g3.3<1>D g3.3<0,1,0>D 4D { align1 1N }; + +mov(8) g4<1>UD g0<8,8,1>UD { align1 1Q }; /* Initialize message header with g0 */ +mov(1) g4<1>UD g3.3<0,1,0>UD { align1 1N }; /* Block offset */ +mov(1) g4.1<1>UD g3.2<0,1,0>UD { align1 1N }; /* Block offset */ +mov(1) g4.2<1>UD 0x00000003UD { align1 1N }; /* Block size (1 row x 4 bytes) */ +and(1) g4.3<1>UD g4.3<0,1,0>UW 0xffffffffUD { align1 1N }; + +/* Media block read to fetch current value at specified location in instrumentation buffer */ +sendc(8) g5<1>UD g4<8,8,1>F 0x02190001 + + render MsgDesc: media block read MsgCtrl = 0x0 Surface = 1 mlen 1 rlen 1 { align1 1Q }; +add(1) g5<1>D g5<0,1,0>D 1D { align1 1N }; + +/* Media block write for updated value at specified location in instrumentation buffer */ +sendc(8) g5<1>UD g4<8,8,1>F 0x040a8001 + render MsgDesc: media block write MsgCtrl = 0x0 Surface = 1 mlen 2 rlen 0 { align1 1Q }; + +/* Delay thread for specified parameter */ +add.nz.f0.0(1) g1.2<1>UD g1.2<0,1,0>UD -1D { align1 1N }; +(+f0.0) jmpi(1) -32D { align1 WE_all 1N }; + +/* Store designated "clear GRF" value */ +mov(1) f0.1<1>UW g1.2<0,1,0>UW { align1 1N }; + +/* Initialize looping parameters */ +mov(1) a0<1>D 0D { align1 1N }; /* Initialize a0.0:w=0 */ +mov(1) a0.4<1>W 127W { align1 1N }; /* Loop count. Each loop contains 16 GRF's */ + +/* Write 32x16 all "0" block */ +mov(8) g2<1>UD g0<8,8,1>UD { align1 1Q }; +mov(8) g127<1>UD g0<8,8,1>UD { align1 1Q }; +mov(2) g2<1>UD g1<2,2,1>UW { align1 1N }; +mov(1) g2.2<1>UD 0x000f000fUD { align1 1N }; /* Block size (16x16) */ +and(1) g2.3<1>UD g2.3<0,1,0>UW 0xffffffefUD { align1 1N }; +mov(16) g3<1>UD 0x00000000UD { align1 1H }; +mov(16) g4<1>UD 0x00000000UD { align1 1H }; +mov(16) g5<1>UD 0x00000000UD { align1 1H }; +mov(16) g6<1>UD 0x00000000UD { align1 1H }; +mov(16) g7<1>UD 0x00000000UD { align1 1H }; +mov(16) g8<1>UD 0x00000000UD { align1 1H }; +mov(16) g9<1>UD 0x00000000UD { align1 1H }; +mov(16) g10<1>UD 0x00000000UD { align1 1H }; +sendc(8) null<1>UD g2<8,8,1>F 0x120a8000 + render MsgDesc: media block write MsgCtrl = 0x0 Surface = 0 mlen 9 rlen 0 { align1 1Q }; +add(1) g2<1>UD g1<0,1,0>UW 0x0010UW { align1 1N }; +sendc(8) null<1>UD g2<8,8,1>F 0x120a8000 + render MsgDesc: media block write MsgCtrl = 0x0 Surface = 0 mlen 9 rlen 0 { align1 1Q }; + +/* Now, clear all GRF registers */ +add.nz.f0.0(1) a0.4<1>W a0.4<0,1,0>W -1W { align1 1N }; +mov(16) g[a0]<1>UW f0.1<0,1,0>UW { align1 1H }; +add(1) a0<1>D a0<0,1,0>D 32D { align1 1N }; +(+f0.0) jmpi(1) -64D { align1 WE_all 1N }; + +/* Terminante the thread */ +sendc(8) null<1>UD g127<8,8,1>F 0x82000010 + thread_spawner MsgDesc: mlen 1 rlen 0 { align1 1Q EOT }; diff --git a/drivers/gpu/drm/i915/gt/shaders/clear_kernel/ivb.asm b/drivers/gpu/drm/i915/gt/shaders/clear_kernel/ivb.asm new file mode 100644 index 000000000000..97c7ac9e3854 --- /dev/null +++ b/drivers/gpu/drm/i915/gt/shaders/clear_kernel/ivb.asm @@ -0,0 +1,117 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2020 Intel Corporation + */ + +/* + * Kernel for PAVP buffer clear. + * + * 1. Clear all 64 GRF registers assigned to the kernel with designated value; + * 2. Write 32x16 block of all "0" to render target buffer which indirectly clears + * 512 bytes of Render Cache. + */ + +/* Store designated "clear GRF" value */ +mov(1) f0.1<1>UW g1.2<0,1,0>UW { align1 1N }; + +/** + * Curbe Format + * + * DW 1.0 - Block Offset to write Render Cache + * DW 1.1 [15:0] - Clear Word + * DW 1.2 - Delay iterations + * DW 1.3 - Enable Instrumentation (only for debug) + * DW 1.4 - Rsvd (intended for context ID) + * DW 1.5 - [31:16]:SliceCount, [15:0]:SubSlicePerSliceCount + * DW 1.6 - Rsvd MBZ (intended for Enable Wait on Total Thread Count) + * DW 1.7 - Rsvd MBZ (inteded for Total Thread Count) + * + * Binding Table + * + * BTI 0: 2D Surface to help clear L3 (Render/Data Cache) + * BTI 1: Wait/Instrumentation Buffer + * Size : (SliceCount * SubSliceCount * 16 EUs/SubSlice) rows * (16 threads/EU) cols (Format R32_UINT) + * Expected to be initialized to 0 by driver/another kernel + * Layout : + * RowN: Histogram for EU-N: (SliceID*SubSlicePerSliceCount + SSID)*16 + EUID [assume max 16 EUs / SS] + * Col-k[DW-k]: Threads Executed on ThreadID-k for EU-N + */ +add(1) g1.2<1>UD g1.2<0,1,0>UD 0x00000001UD { align1 1N }; /* Loop count to delay kernel: Init to (g1.2 + 1) */ +cmp.z.f0.0(1) null<1>UD g1.3<0,1,0>UD 0x00000000UD { align1 1N }; +(+f0.0) jmpi(1) 44D { align1 WE_all 1N }; + +/** + * State Register has info on where this thread is running + * IVB: sr0.0 :: [15:13]: MBZ, 12: HSID (Half-Slice ID), [11:8]EUID, [2:0] ThreadSlotID + * HSW: sr0.0 :: 15: MBZ, [14:13]: SliceID, 12: HSID (Half-Slice ID), [11:8]EUID, [2:0] ThreadSlotID + */ +mov(8) g3<1>UD 0x00000000UD { align1 1Q }; +shr(1) g3<1>D sr0<0,1,0>D 12D { align1 1N }; +and(1) g3<1>D g3<0,1,0>D 1D { align1 1N }; /* g3 has HSID */ +shr(1) g3.1<1>D sr0<0,1,0>D 13D { align1 1N }; +and(1) g3.1<1>D g3.1<0,1,0>D 3D { align1 1N }; /* g3.1 has sliceID */ +mul(1) g3.5<1>D g3.1<0,1,0>D g1.10<0,1,0>UW { align1 1N }; +add(1) g3<1>D g3<0,1,0>D g3.5<0,1,0>D { align1 1N }; /* g3 = sliceID * SubSlicePerSliceCount + HSID */ +shr(1) g3.2<1>D sr0<0,1,0>D 8D { align1 1N }; +and(1) g3.2<1>D g3.2<0,1,0>D 15D { align1 1N }; /* g3.2 = EUID */ +mul(1) g3.4<1>D g3<0,1,0>D 16D { align1 1N }; +add(1) g3.2<1>D g3.2<0,1,0>D g3.4<0,1,0>D { align1 1N }; /* g3.2 now points to EU row number (Y-pixel = V address ) in instrumentation surf */ + +mov(8) g5<1>UD 0x00000000UD { align1 1Q }; +and(1) g3.3<1>D sr0<0,1,0>D 7D { align1 1N }; +mul(1) g3.3<1>D g3.3<0,1,0>D 4D { align1 1N }; + +mov(8) g4<1>UD g0<8,8,1>UD { align1 1Q }; /* Initialize message header with g0 */ +mov(1) g4<1>UD g3.3<0,1,0>UD { align1 1N }; /* Block offset */ +mov(1) g4.1<1>UD g3.2<0,1,0>UD { align1 1N }; /* Block offset */ +mov(1) g4.2<1>UD 0x00000003UD { align1 1N }; /* Block size (1 row x 4 bytes) */ +and(1) g4.3<1>UD g4.3<0,1,0>UW 0xffffffffUD { align1 1N }; + +/* Media block read to fetch current value at specified location in instrumentation buffer */ +sendc(8) g5<1>UD g4<8,8,1>F 0x02190001 + render MsgDesc: media block read MsgCtrl = 0x0 Surface = 1 mlen 1 rlen 1 { align1 1Q }; +add(1) g5<1>D g5<0,1,0>D 1D { align1 1N }; + +/* Media block write for updated value at specified location in instrumentation buffer */ +sendc(8) g5<1>UD g4<8,8,1>F 0x040a8001 + render MsgDesc: media block write MsgCtrl = 0x0 Surface = 1 mlen 2 rlen 0 { align1 1Q }; +/* Delay thread for specified parameter */ +add.nz.f0.0(1) g1.2<1>UD g1.2<0,1,0>UD -1D { align1 1N }; +(+f0.0) jmpi(1) -4D { align1 WE_all 1N }; + +/* Store designated "clear GRF" value */ +mov(1) f0.1<1>UW g1.2<0,1,0>UW { align1 1N }; + +/* Initialize looping parameters */ +mov(1) a0<1>D 0D { align1 1N }; /* Initialize a0.0:w=0 */ +mov(1) a0.4<1>W 127W { align1 1N }; /* Loop count. Each loop contains 16 GRF's */ + +/* Write 32x16 all "0" block */ +mov(8) g2<1>UD g0<8,8,1>UD { align1 1Q }; +mov(8) g127<1>UD g0<8,8,1>UD { align1 1Q }; +mov(2) g2<1>UD g1<2,2,1>UW { align1 1N }; +mov(1) g2.2<1>UD 0x000f000fUD { align1 1N }; /* Block size (16x16) */ +and(1) g2.3<1>UD g2.3<0,1,0>UW 0xffffffefUD { align1 1N }; +mov(16) g3<1>UD 0x00000000UD { align1 1H }; +mov(16) g4<1>UD 0x00000000UD { align1 1H }; +mov(16) g5<1>UD 0x00000000UD { align1 1H }; +mov(16) g6<1>UD 0x00000000UD { align1 1H }; +mov(16) g7<1>UD 0x00000000UD { align1 1H }; +mov(16) g8<1>UD 0x00000000UD { align1 1H }; +mov(16) g9<1>UD 0x00000000UD { align1 1H }; +mov(16) g10<1>UD 0x00000000UD { align1 1H }; +sendc(8) null<1>UD g2<8,8,1>F 0x120a8000 + render MsgDesc: media block write MsgCtrl = 0x0 Surface = 0 mlen 9 rlen 0 { align1 1Q }; +add(1) g2<1>UD g1<0,1,0>UW 0x0010UW { align1 1N }; +sendc(8) null<1>UD g2<8,8,1>F 0x120a8000 + render MsgDesc: media block write MsgCtrl = 0x0 Surface = 0 mlen 9 rlen 0 { align1 1Q }; + +/* Now, clear all GRF registers */ +add.nz.f0.0(1) a0.4<1>W a0.4<0,1,0>W -1W { align1 1N }; +mov(16) g[a0]<1>UW f0.1<0,1,0>UW { align1 1H }; +add(1) a0<1>D a0<0,1,0>D 32D { align1 1N }; +(+f0.0) jmpi(1) -8D { align1 WE_all 1N }; + +/* Terminante the thread */ +sendc(8) null<1>UD g127<8,8,1>F 0x82000010 + thread_spawner MsgDesc: mlen 1 rlen 0 { align1 1Q EOT }; diff --git a/drivers/gpu/drm/i915/gvt/debugfs.c b/drivers/gpu/drm/i915/gvt/debugfs.c index ec47d4114554..62e6a14ad58e 100644 --- a/drivers/gpu/drm/i915/gvt/debugfs.c +++ b/drivers/gpu/drm/i915/gvt/debugfs.c @@ -66,7 +66,7 @@ static inline int mmio_diff_handler(struct intel_gvt *gvt, vreg = vgpu_vreg(param->vgpu, offset); if (preg != vreg) { - node = kmalloc(sizeof(*node), GFP_KERNEL); + node = kmalloc(sizeof(*node), GFP_ATOMIC); if (!node) return -ENOMEM; diff --git a/drivers/gpu/drm/i915/gvt/handlers.c b/drivers/gpu/drm/i915/gvt/handlers.c index 3e88e3b5c43a..fadd2adb8030 100644 --- a/drivers/gpu/drm/i915/gvt/handlers.c +++ b/drivers/gpu/drm/i915/gvt/handlers.c @@ -1726,13 +1726,13 @@ static int ring_mode_mmio_write(struct intel_vgpu *vgpu, unsigned int offset, (*(u32 *)p_data) &= ~_MASKED_BIT_ENABLE(2); write_vreg(vgpu, offset, p_data, bytes); - if (data & _MASKED_BIT_ENABLE(1)) { + if (IS_MASKED_BITS_ENABLED(data, 1)) { enter_failsafe_mode(vgpu, GVT_FAILSAFE_UNSUPPORTED_GUEST); return 0; } if (IS_COFFEELAKE(vgpu->gvt->gt->i915) && - data & _MASKED_BIT_ENABLE(2)) { + IS_MASKED_BITS_ENABLED(data, 2)) { enter_failsafe_mode(vgpu, GVT_FAILSAFE_UNSUPPORTED_GUEST); return 0; } @@ -1741,14 +1741,14 @@ static int ring_mode_mmio_write(struct intel_vgpu *vgpu, unsigned int offset, * pvinfo, if not, we will treat this guest as non-gvtg-aware * guest, and stop emulating its cfg space, mmio, gtt, etc. */ - if (((data & _MASKED_BIT_ENABLE(GFX_PPGTT_ENABLE)) || - (data & _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE))) - && !vgpu->pv_notified) { + if ((IS_MASKED_BITS_ENABLED(data, GFX_PPGTT_ENABLE) || + IS_MASKED_BITS_ENABLED(data, GFX_RUN_LIST_ENABLE)) && + !vgpu->pv_notified) { enter_failsafe_mode(vgpu, GVT_FAILSAFE_UNSUPPORTED_GUEST); return 0; } - if ((data & _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE)) - || (data & _MASKED_BIT_DISABLE(GFX_RUN_LIST_ENABLE))) { + if (IS_MASKED_BITS_ENABLED(data, GFX_RUN_LIST_ENABLE) || + IS_MASKED_BITS_DISABLED(data, GFX_RUN_LIST_ENABLE)) { enable_execlist = !!(data & GFX_RUN_LIST_ENABLE); gvt_dbg_core("EXECLIST %s on ring %s\n", @@ -1809,7 +1809,7 @@ static int ring_reset_ctl_write(struct intel_vgpu *vgpu, write_vreg(vgpu, offset, p_data, bytes); data = vgpu_vreg(vgpu, offset); - if (data & _MASKED_BIT_ENABLE(RESET_CTL_REQUEST_RESET)) + if (IS_MASKED_BITS_ENABLED(data, RESET_CTL_REQUEST_RESET)) data |= RESET_CTL_READY_TO_RESET; else if (data & _MASKED_BIT_DISABLE(RESET_CTL_REQUEST_RESET)) data &= ~RESET_CTL_READY_TO_RESET; @@ -1827,7 +1827,8 @@ static int csfe_chicken1_mmio_write(struct intel_vgpu *vgpu, (*(u32 *)p_data) &= ~_MASKED_BIT_ENABLE(0x18); write_vreg(vgpu, offset, p_data, bytes); - if (data & _MASKED_BIT_ENABLE(0x10) || data & _MASKED_BIT_ENABLE(0x8)) + if (IS_MASKED_BITS_ENABLED(data, 0x10) || + IS_MASKED_BITS_ENABLED(data, 0x8)) enter_failsafe_mode(vgpu, GVT_FAILSAFE_UNSUPPORTED_GUEST); return 0; @@ -3055,6 +3056,7 @@ static int init_skl_mmio_info(struct intel_gvt *gvt) MMIO_D(_MMIO(0x72380), D_SKL_PLUS); MMIO_D(_MMIO(0x7239c), D_SKL_PLUS); MMIO_D(_MMIO(_PLANE_SURF_3_A), D_SKL_PLUS); + MMIO_D(_MMIO(_PLANE_SURF_3_B), D_SKL_PLUS); MMIO_D(CSR_SSP_BASE, D_SKL_PLUS); MMIO_D(CSR_HTP_SKL, D_SKL_PLUS); @@ -3131,8 +3133,8 @@ static int init_skl_mmio_info(struct intel_gvt *gvt) MMIO_DFH(GEN9_WM_CHICKEN3, D_SKL_PLUS, F_MODE_MASK | F_CMD_ACCESS, NULL, NULL); - MMIO_D(GAMT_CHKN_BIT_REG, D_KBL); - MMIO_D(GEN9_CTX_PREEMPT_REG, D_KBL | D_SKL); + MMIO_D(GAMT_CHKN_BIT_REG, D_KBL | D_CFL); + MMIO_D(GEN9_CTX_PREEMPT_REG, D_SKL_PLUS); return 0; } diff --git a/drivers/gpu/drm/i915/gvt/mmio_context.h b/drivers/gpu/drm/i915/gvt/mmio_context.h index 970704b18f23..3b25e7fe32f6 100644 --- a/drivers/gpu/drm/i915/gvt/mmio_context.h +++ b/drivers/gpu/drm/i915/gvt/mmio_context.h @@ -54,8 +54,8 @@ bool is_inhibit_context(struct intel_context *ce); int intel_vgpu_restore_inhibit_context(struct intel_vgpu *vgpu, struct i915_request *req); -#define IS_RESTORE_INHIBIT(a) \ - (_MASKED_BIT_ENABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT) == \ - ((a) & _MASKED_BIT_ENABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT))) + +#define IS_RESTORE_INHIBIT(a) \ + IS_MASKED_BITS_ENABLED(a, CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT) #endif diff --git a/drivers/gpu/drm/i915/gvt/reg.h b/drivers/gpu/drm/i915/gvt/reg.h index 5b66e14c5b7b..b88e033cbed4 100644 --- a/drivers/gpu/drm/i915/gvt/reg.h +++ b/drivers/gpu/drm/i915/gvt/reg.h @@ -94,6 +94,11 @@ #define GFX_MODE_BIT_SET_IN_MASK(val, bit) \ ((((bit) & 0xffff0000) == 0) && !!((val) & (((bit) << 16)))) +#define IS_MASKED_BITS_ENABLED(_val, _b) \ + (((_val) & _MASKED_BIT_ENABLE(_b)) == _MASKED_BIT_ENABLE(_b)) +#define IS_MASKED_BITS_DISABLED(_val, _b) \ + ((_val) & _MASKED_BIT_DISABLE(_b)) + #define FORCEWAKE_RENDER_GEN9_REG 0xa278 #define FORCEWAKE_ACK_RENDER_GEN9_REG 0x0D84 #define FORCEWAKE_BLITTER_GEN9_REG 0xa188 diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index bca036ac6621..e7532e7d74e9 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -230,7 +230,7 @@ static int per_file_stats(int id, void *ptr, void *data) struct file_stats *stats = data; struct i915_vma *vma; - if (!kref_get_unless_zero(&obj->base.refcount)) + if (IS_ERR_OR_NULL(obj) || !kref_get_unless_zero(&obj->base.refcount)) return 0; stats->count++; diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index adb9bf34cf97..ae99a9190200 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -410,8 +410,6 @@ struct intel_fbc { int adjusted_x; int adjusted_y; - int y; - u16 pixel_blend_mode; } plane; @@ -420,6 +418,8 @@ struct intel_fbc { unsigned int stride; u64 modifier; } fb; + + unsigned int fence_y_offset; u16 gen9_wa_cfb_stride; s8 fence_id; } state_cache; @@ -435,15 +435,16 @@ struct intel_fbc { struct { enum pipe pipe; enum i9xx_plane_id i9xx_plane; - unsigned int fence_y_offset; } crtc; struct { const struct drm_format_info *format; unsigned int stride; + u64 modifier; } fb; int cfb_size; + unsigned int fence_y_offset; u16 gen9_wa_cfb_stride; s8 fence_id; bool plane_visible; diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c index 4dc601dffc08..284cf078135a 100644 --- a/drivers/gpu/drm/i915/i915_irq.c +++ b/drivers/gpu/drm/i915/i915_irq.c @@ -3125,6 +3125,7 @@ static void gen11_hpd_irq_setup(struct drm_i915_private *dev_priv) val = I915_READ(GEN11_DE_HPD_IMR); val &= ~hotplug_irqs; + val |= ~enabled_irqs & hotplug_irqs; I915_WRITE(GEN11_DE_HPD_IMR, val); POSTING_READ(GEN11_DE_HPD_IMR); diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index 25329b7600c9..014f34c047d5 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -1592,6 +1592,7 @@ static u32 *save_restore_register(struct i915_perf_stream *stream, u32 *cs, u32 d; cmd = save ? MI_STORE_REGISTER_MEM : MI_LOAD_REGISTER_MEM; + cmd |= MI_SRM_LRM_GLOBAL_GTT; if (INTEL_GEN(stream->perf->i915) >= 8) cmd++; diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c index e991a707bdb7..962ded9ce73f 100644 --- a/drivers/gpu/drm/i915/i915_pmu.c +++ b/drivers/gpu/drm/i915/i915_pmu.c @@ -269,12 +269,48 @@ static bool exclusive_mmio_access(const struct drm_i915_private *i915) return IS_GEN(i915, 7); } +static void engine_sample(struct intel_engine_cs *engine, unsigned int period_ns) +{ + struct intel_engine_pmu *pmu = &engine->pmu; + bool busy; + u32 val; + + val = ENGINE_READ_FW(engine, RING_CTL); + if (val == 0) /* powerwell off => engine idle */ + return; + + if (val & RING_WAIT) + add_sample(&pmu->sample[I915_SAMPLE_WAIT], period_ns); + if (val & RING_WAIT_SEMAPHORE) + add_sample(&pmu->sample[I915_SAMPLE_SEMA], period_ns); + + /* No need to sample when busy stats are supported. */ + if (intel_engine_supports_stats(engine)) + return; + + /* + * While waiting on a semaphore or event, MI_MODE reports the + * ring as idle. However, previously using the seqno, and with + * execlists sampling, we account for the ring waiting as the + * engine being busy. Therefore, we record the sample as being + * busy if either waiting or !idle. + */ + busy = val & (RING_WAIT_SEMAPHORE | RING_WAIT); + if (!busy) { + val = ENGINE_READ_FW(engine, RING_MI_MODE); + busy = !(val & MODE_IDLE); + } + if (busy) + add_sample(&pmu->sample[I915_SAMPLE_BUSY], period_ns); +} + static void engines_sample(struct intel_gt *gt, unsigned int period_ns) { struct drm_i915_private *i915 = gt->i915; struct intel_engine_cs *engine; enum intel_engine_id id; + unsigned long flags; if ((i915->pmu.enable & ENGINE_SAMPLE_MASK) == 0) return; @@ -283,53 +319,17 @@ engines_sample(struct intel_gt *gt, unsigned int period_ns) return; for_each_engine(engine, gt, id) { - struct intel_engine_pmu *pmu = &engine->pmu; - spinlock_t *mmio_lock; - unsigned long flags; - bool busy; - u32 val; - if (!intel_engine_pm_get_if_awake(engine)) continue; - mmio_lock = NULL; - if (exclusive_mmio_access(i915)) - mmio_lock = &engine->uncore->lock; - - if (unlikely(mmio_lock)) - spin_lock_irqsave(mmio_lock, flags); - - val = ENGINE_READ_FW(engine, RING_CTL); - if (val == 0) /* powerwell off => engine idle */ - goto skip; - - if (val & RING_WAIT) - add_sample(&pmu->sample[I915_SAMPLE_WAIT], period_ns); - if (val & RING_WAIT_SEMAPHORE) - add_sample(&pmu->sample[I915_SAMPLE_SEMA], period_ns); - - /* No need to sample when busy stats are supported. */ - if (intel_engine_supports_stats(engine)) - goto skip; - - /* - * While waiting on a semaphore or event, MI_MODE reports the - * ring as idle. However, previously using the seqno, and with - * execlists sampling, we account for the ring waiting as the - * engine being busy. Therefore, we record the sample as being - * busy if either waiting or !idle. - */ - busy = val & (RING_WAIT_SEMAPHORE | RING_WAIT); - if (!busy) { - val = ENGINE_READ_FW(engine, RING_MI_MODE); - busy = !(val & MODE_IDLE); + if (exclusive_mmio_access(i915)) { + spin_lock_irqsave(&engine->uncore->lock, flags); + engine_sample(engine, period_ns); + spin_unlock_irqrestore(&engine->uncore->lock, flags); + } else { + engine_sample(engine, period_ns); } - if (busy) - add_sample(&pmu->sample[I915_SAMPLE_BUSY], period_ns); -skip: - if (unlikely(mmio_lock)) - spin_unlock_irqrestore(mmio_lock, flags); intel_engine_pm_put_async(engine); } } diff --git a/drivers/gpu/drm/i915/i915_priolist_types.h b/drivers/gpu/drm/i915/i915_priolist_types.h index 5003a71113cb..8aa7866ec6b6 100644 --- a/drivers/gpu/drm/i915/i915_priolist_types.h +++ b/drivers/gpu/drm/i915/i915_priolist_types.h @@ -42,7 +42,7 @@ enum { * active request. */ #define I915_PRIORITY_UNPREEMPTABLE INT_MAX -#define I915_PRIORITY_BARRIER INT_MAX +#define I915_PRIORITY_BARRIER (I915_PRIORITY_UNPREEMPTABLE - 1) struct i915_priolist { struct list_head requests[I915_PRIORITY_COUNT]; diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h index 7717581350bd..06cd1d28a176 100644 --- a/drivers/gpu/drm/i915/i915_reg.h +++ b/drivers/gpu/drm/i915/i915_reg.h @@ -7896,7 +7896,7 @@ enum { /* GEN7 chicken */ #define GEN7_COMMON_SLICE_CHICKEN1 _MMIO(0x7010) - #define GEN7_CSC1_RHWO_OPT_DISABLE_IN_RCC ((1 << 10) | (1 << 26)) + #define GEN7_CSC1_RHWO_OPT_DISABLE_IN_RCC (1 << 10) #define GEN9_RHWO_OPTIMIZATION_DISABLE (1 << 14) #define COMMON_SLICE_CHICKEN2 _MMIO(0x7014) diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c index fc14ebf9a0b7..1f9cd33b35cb 100644 --- a/drivers/gpu/drm/i915/i915_vma.c +++ b/drivers/gpu/drm/i915/i915_vma.c @@ -104,6 +104,7 @@ vma_create(struct drm_i915_gem_object *obj, struct i915_address_space *vm, const struct i915_ggtt_view *view) { + struct i915_vma *pos = ERR_PTR(-E2BIG); struct i915_vma *vma; struct rb_node *rb, **p; @@ -184,7 +185,6 @@ vma_create(struct drm_i915_gem_object *obj, rb = NULL; p = &obj->vma.tree.rb_node; while (*p) { - struct i915_vma *pos; long cmp; rb = *p; @@ -196,16 +196,12 @@ vma_create(struct drm_i915_gem_object *obj, * and dispose of ours. */ cmp = i915_vma_compare(pos, vm, view); - if (cmp == 0) { - spin_unlock(&obj->vma.lock); - i915_vma_free(vma); - return pos; - } - if (cmp < 0) p = &rb->rb_right; - else + else if (cmp > 0) p = &rb->rb_left; + else + goto err_unlock; } rb_link_node(&vma->obj_node, rb, p); rb_insert_color(&vma->obj_node, &obj->vma.tree); @@ -228,8 +224,9 @@ vma_create(struct drm_i915_gem_object *obj, err_unlock: spin_unlock(&obj->vma.lock); err_vma: + i915_vm_put(vm); i915_vma_free(vma); - return ERR_PTR(-E2BIG); + return pos; } static struct i915_vma * diff --git a/drivers/gpu/drm/i915/intel_pm.c b/drivers/gpu/drm/i915/intel_pm.c index 696491d71a1d..07f663cd2d1c 100644 --- a/drivers/gpu/drm/i915/intel_pm.c +++ b/drivers/gpu/drm/i915/intel_pm.c @@ -6830,16 +6830,6 @@ static void ilk_init_clock_gating(struct drm_i915_private *dev_priv) I915_WRITE(ILK_DISPLAY_CHICKEN2, I915_READ(ILK_DISPLAY_CHICKEN2) | ILK_ELPIN_409_SELECT); - I915_WRITE(_3D_CHICKEN2, - _3D_CHICKEN2_WM_READ_PIPELINED << 16 | - _3D_CHICKEN2_WM_READ_PIPELINED); - - /* WaDisableRenderCachePipelinedFlush:ilk */ - I915_WRITE(CACHE_MODE_0, - _MASKED_BIT_ENABLE(CM0_PIPELINED_RENDER_FLUSH_DISABLE)); - - /* WaDisable_RenderCache_OperationalFlush:ilk */ - I915_WRITE(CACHE_MODE_0, _MASKED_BIT_DISABLE(RC_OP_FLUSH_ENABLE)); g4x_disable_trickle_feed(dev_priv); @@ -6902,27 +6892,6 @@ static void gen6_init_clock_gating(struct drm_i915_private *dev_priv) I915_READ(ILK_DISPLAY_CHICKEN2) | ILK_ELPIN_409_SELECT); - /* WaDisableHiZPlanesWhenMSAAEnabled:snb */ - I915_WRITE(_3D_CHICKEN, - _MASKED_BIT_ENABLE(_3D_CHICKEN_HIZ_PLANE_DISABLE_MSAA_4X_SNB)); - - /* WaDisable_RenderCache_OperationalFlush:snb */ - I915_WRITE(CACHE_MODE_0, _MASKED_BIT_DISABLE(RC_OP_FLUSH_ENABLE)); - - /* - * BSpec recoomends 8x4 when MSAA is used, - * however in practice 16x4 seems fastest. - * - * Note that PS/WM thread counts depend on the WIZ hashing - * disable bit, which we don't touch here, but it's good - * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM). - */ - I915_WRITE(GEN6_GT_MODE, - _MASKED_FIELD(GEN6_WIZ_HASHING_MASK, GEN6_WIZ_HASHING_16x4)); - - I915_WRITE(CACHE_MODE_0, - _MASKED_BIT_DISABLE(CM0_STC_EVICT_DISABLE_LRA_SNB)); - I915_WRITE(GEN6_UCGCTL1, I915_READ(GEN6_UCGCTL1) | GEN6_BLBUNIT_CLOCK_GATE_DISABLE | @@ -6945,18 +6914,6 @@ static void gen6_init_clock_gating(struct drm_i915_private *dev_priv) GEN6_RCPBUNIT_CLOCK_GATE_DISABLE | GEN6_RCCUNIT_CLOCK_GATE_DISABLE); - /* WaStripsFansDisableFastClipPerformanceFix:snb */ - I915_WRITE(_3D_CHICKEN3, - _MASKED_BIT_ENABLE(_3D_CHICKEN3_SF_DISABLE_FASTCLIP_CULL)); - - /* - * Bspec says: - * "This bit must be set if 3DSTATE_CLIP clip mode is set to normal and - * 3DSTATE_SF number of SF output attributes is more than 16." - */ - I915_WRITE(_3D_CHICKEN3, - _MASKED_BIT_ENABLE(_3D_CHICKEN3_SF_DISABLE_PIPELINED_ATTR_FETCH)); - /* * According to the spec the following bits should be * set in order to enable memory self-refresh and fbc: @@ -6986,24 +6943,6 @@ static void gen6_init_clock_gating(struct drm_i915_private *dev_priv) gen6_check_mch_setup(dev_priv); } -static void gen7_setup_fixed_func_scheduler(struct drm_i915_private *dev_priv) -{ - u32 reg = I915_READ(GEN7_FF_THREAD_MODE); - - /* - * WaVSThreadDispatchOverride:ivb,vlv - * - * This actually overrides the dispatch - * mode for all thread types. - */ - reg &= ~GEN7_FF_SCHED_MASK; - reg |= GEN7_FF_TS_SCHED_HW; - reg |= GEN7_FF_VS_SCHED_HW; - reg |= GEN7_FF_DS_SCHED_HW; - - I915_WRITE(GEN7_FF_THREAD_MODE, reg); -} - static void lpt_init_clock_gating(struct drm_i915_private *dev_priv) { /* @@ -7230,45 +7169,10 @@ static void bdw_init_clock_gating(struct drm_i915_private *dev_priv) static void hsw_init_clock_gating(struct drm_i915_private *dev_priv) { - /* L3 caching of data atomics doesn't work -- disable it. */ - I915_WRITE(HSW_SCRATCH1, HSW_SCRATCH1_L3_DATA_ATOMICS_DISABLE); - I915_WRITE(HSW_ROW_CHICKEN3, - _MASKED_BIT_ENABLE(HSW_ROW_CHICKEN3_L3_GLOBAL_ATOMICS_DISABLE)); - /* This is required by WaCatErrorRejectionIssue:hsw */ I915_WRITE(GEN7_SQ_CHICKEN_MBCUNIT_CONFIG, - I915_READ(GEN7_SQ_CHICKEN_MBCUNIT_CONFIG) | - GEN7_SQ_CHICKEN_MBCUNIT_SQINTMOB); - - /* WaVSRefCountFullforceMissDisable:hsw */ - I915_WRITE(GEN7_FF_THREAD_MODE, - I915_READ(GEN7_FF_THREAD_MODE) & ~GEN7_FF_VS_REF_CNT_FFME); - - /* WaDisable_RenderCache_OperationalFlush:hsw */ - I915_WRITE(CACHE_MODE_0_GEN7, _MASKED_BIT_DISABLE(RC_OP_FLUSH_ENABLE)); - - /* enable HiZ Raw Stall Optimization */ - I915_WRITE(CACHE_MODE_0_GEN7, - _MASKED_BIT_DISABLE(HIZ_RAW_STALL_OPT_DISABLE)); - - /* WaDisable4x2SubspanOptimization:hsw */ - I915_WRITE(CACHE_MODE_1, - _MASKED_BIT_ENABLE(PIXEL_SUBSPAN_COLLECT_OPT_DISABLE)); - - /* - * BSpec recommends 8x4 when MSAA is used, - * however in practice 16x4 seems fastest. - * - * Note that PS/WM thread counts depend on the WIZ hashing - * disable bit, which we don't touch here, but it's good - * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM). - */ - I915_WRITE(GEN7_GT_MODE, - _MASKED_FIELD(GEN6_WIZ_HASHING_MASK, GEN6_WIZ_HASHING_16x4)); - - /* WaSampleCChickenBitEnable:hsw */ - I915_WRITE(HALF_SLICE_CHICKEN3, - _MASKED_BIT_ENABLE(HSW_SAMPLE_C_PERFORMANCE)); + I915_READ(GEN7_SQ_CHICKEN_MBCUNIT_CONFIG) | + GEN7_SQ_CHICKEN_MBCUNIT_SQINTMOB); /* WaSwitchSolVfFArbitrationPriority:hsw */ I915_WRITE(GAM_ECOCHK, I915_READ(GAM_ECOCHK) | HSW_ECOCHK_ARB_PRIO_SOL); @@ -7282,32 +7186,11 @@ static void ivb_init_clock_gating(struct drm_i915_private *dev_priv) I915_WRITE(ILK_DSPCLK_GATE_D, ILK_VRHUNIT_CLOCK_GATE_DISABLE); - /* WaDisableEarlyCull:ivb */ - I915_WRITE(_3D_CHICKEN3, - _MASKED_BIT_ENABLE(_3D_CHICKEN_SF_DISABLE_OBJEND_CULL)); - /* WaDisableBackToBackFlipFix:ivb */ I915_WRITE(IVB_CHICKEN3, CHICKEN3_DGMG_REQ_OUT_FIX_DISABLE | CHICKEN3_DGMG_DONE_FIX_DISABLE); - /* WaDisablePSDDualDispatchEnable:ivb */ - if (IS_IVB_GT1(dev_priv)) - I915_WRITE(GEN7_HALF_SLICE_CHICKEN1, - _MASKED_BIT_ENABLE(GEN7_PSD_SINGLE_PORT_DISPATCH_ENABLE)); - - /* WaDisable_RenderCache_OperationalFlush:ivb */ - I915_WRITE(CACHE_MODE_0_GEN7, _MASKED_BIT_DISABLE(RC_OP_FLUSH_ENABLE)); - - /* Apply the WaDisableRHWOOptimizationForRenderHang:ivb workaround. */ - I915_WRITE(GEN7_COMMON_SLICE_CHICKEN1, - GEN7_CSC1_RHWO_OPT_DISABLE_IN_RCC); - - /* WaApplyL3ControlAndL3ChickenMode:ivb */ - I915_WRITE(GEN7_L3CNTLREG1, - GEN7_WA_FOR_GEN7_L3_CONTROL); - I915_WRITE(GEN7_L3_CHICKEN_MODE_REGISTER, - GEN7_WA_L3_CHICKEN_MODE); if (IS_IVB_GT1(dev_priv)) I915_WRITE(GEN7_ROW_CHICKEN2, _MASKED_BIT_ENABLE(DOP_CLOCK_GATING_DISABLE)); @@ -7319,10 +7202,6 @@ static void ivb_init_clock_gating(struct drm_i915_private *dev_priv) _MASKED_BIT_ENABLE(DOP_CLOCK_GATING_DISABLE)); } - /* WaForceL3Serialization:ivb */ - I915_WRITE(GEN7_L3SQCREG4, I915_READ(GEN7_L3SQCREG4) & - ~L3SQ_URB_READ_CAM_MATCH_DISABLE); - /* * According to the spec, bit 13 (RCZUNIT) must be set on IVB. * This implements the WaDisableRCZUnitClockGating:ivb workaround. @@ -7337,29 +7216,6 @@ static void ivb_init_clock_gating(struct drm_i915_private *dev_priv) g4x_disable_trickle_feed(dev_priv); - gen7_setup_fixed_func_scheduler(dev_priv); - - if (0) { /* causes HiZ corruption on ivb:gt1 */ - /* enable HiZ Raw Stall Optimization */ - I915_WRITE(CACHE_MODE_0_GEN7, - _MASKED_BIT_DISABLE(HIZ_RAW_STALL_OPT_DISABLE)); - } - - /* WaDisable4x2SubspanOptimization:ivb */ - I915_WRITE(CACHE_MODE_1, - _MASKED_BIT_ENABLE(PIXEL_SUBSPAN_COLLECT_OPT_DISABLE)); - - /* - * BSpec recommends 8x4 when MSAA is used, - * however in practice 16x4 seems fastest. - * - * Note that PS/WM thread counts depend on the WIZ hashing - * disable bit, which we don't touch here, but it's good - * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM). - */ - I915_WRITE(GEN7_GT_MODE, - _MASKED_FIELD(GEN6_WIZ_HASHING_MASK, GEN6_WIZ_HASHING_16x4)); - snpcr = I915_READ(GEN6_MBCUNIT_SNPCR); snpcr &= ~GEN6_MBC_SNPCR_MASK; snpcr |= GEN6_MBC_SNPCR_MED; @@ -7373,28 +7229,11 @@ static void ivb_init_clock_gating(struct drm_i915_private *dev_priv) static void vlv_init_clock_gating(struct drm_i915_private *dev_priv) { - /* WaDisableEarlyCull:vlv */ - I915_WRITE(_3D_CHICKEN3, - _MASKED_BIT_ENABLE(_3D_CHICKEN_SF_DISABLE_OBJEND_CULL)); - /* WaDisableBackToBackFlipFix:vlv */ I915_WRITE(IVB_CHICKEN3, CHICKEN3_DGMG_REQ_OUT_FIX_DISABLE | CHICKEN3_DGMG_DONE_FIX_DISABLE); - /* WaPsdDispatchEnable:vlv */ - /* WaDisablePSDDualDispatchEnable:vlv */ - I915_WRITE(GEN7_HALF_SLICE_CHICKEN1, - _MASKED_BIT_ENABLE(GEN7_MAX_PS_THREAD_DEP | - GEN7_PSD_SINGLE_PORT_DISPATCH_ENABLE)); - - /* WaDisable_RenderCache_OperationalFlush:vlv */ - I915_WRITE(CACHE_MODE_0_GEN7, _MASKED_BIT_DISABLE(RC_OP_FLUSH_ENABLE)); - - /* WaForceL3Serialization:vlv */ - I915_WRITE(GEN7_L3SQCREG4, I915_READ(GEN7_L3SQCREG4) & - ~L3SQ_URB_READ_CAM_MATCH_DISABLE); - /* WaDisableDopClockGating:vlv */ I915_WRITE(GEN7_ROW_CHICKEN2, _MASKED_BIT_ENABLE(DOP_CLOCK_GATING_DISABLE)); @@ -7404,8 +7243,6 @@ static void vlv_init_clock_gating(struct drm_i915_private *dev_priv) I915_READ(GEN7_SQ_CHICKEN_MBCUNIT_CONFIG) | GEN7_SQ_CHICKEN_MBCUNIT_SQINTMOB); - gen7_setup_fixed_func_scheduler(dev_priv); - /* * According to the spec, bit 13 (RCZUNIT) must be set on IVB. * This implements the WaDisableRCZUnitClockGating:vlv workaround. @@ -7420,30 +7257,6 @@ static void vlv_init_clock_gating(struct drm_i915_private *dev_priv) I915_READ(GEN7_UCGCTL4) | GEN7_L3BANK2X_CLOCK_GATE_DISABLE); /* - * BSpec says this must be set, even though - * WaDisable4x2SubspanOptimization isn't listed for VLV. - */ - I915_WRITE(CACHE_MODE_1, - _MASKED_BIT_ENABLE(PIXEL_SUBSPAN_COLLECT_OPT_DISABLE)); - - /* - * BSpec recommends 8x4 when MSAA is used, - * however in practice 16x4 seems fastest. - * - * Note that PS/WM thread counts depend on the WIZ hashing - * disable bit, which we don't touch here, but it's good - * to keep in mind (see 3DSTATE_PS and 3DSTATE_WM). - */ - I915_WRITE(GEN7_GT_MODE, - _MASKED_FIELD(GEN6_WIZ_HASHING_MASK, GEN6_WIZ_HASHING_16x4)); - - /* - * WaIncreaseL3CreditsForVLVB0:vlv - * This is the hardware default actually. - */ - I915_WRITE(GEN7_L3SQCREG1, VLV_B0_WA_L3SQCREG1_VALUE); - - /* * WaDisableVLVClockGating_VBIIssue:vlv * Disable clock gating on th GCFG unit to prevent a delay * in the reporting of vblank events. @@ -7495,13 +7308,6 @@ static void g4x_init_clock_gating(struct drm_i915_private *dev_priv) dspclk_gate |= DSSUNIT_CLOCK_GATE_DISABLE; I915_WRITE(DSPCLK_GATE_D, dspclk_gate); - /* WaDisableRenderCachePipelinedFlush */ - I915_WRITE(CACHE_MODE_0, - _MASKED_BIT_ENABLE(CM0_PIPELINED_RENDER_FLUSH_DISABLE)); - - /* WaDisable_RenderCache_OperationalFlush:g4x */ - I915_WRITE(CACHE_MODE_0, _MASKED_BIT_DISABLE(RC_OP_FLUSH_ENABLE)); - g4x_disable_trickle_feed(dev_priv); } @@ -7517,11 +7323,6 @@ static void i965gm_init_clock_gating(struct drm_i915_private *dev_priv) intel_uncore_write(uncore, MI_ARB_STATE, _MASKED_BIT_ENABLE(MI_ARB_DISPLAY_TRICKLE_FEED_DISABLE)); - - /* WaDisable_RenderCache_OperationalFlush:gen4 */ - intel_uncore_write(uncore, - CACHE_MODE_0, - _MASKED_BIT_DISABLE(RC_OP_FLUSH_ENABLE)); } static void i965g_init_clock_gating(struct drm_i915_private *dev_priv) @@ -7534,9 +7335,6 @@ static void i965g_init_clock_gating(struct drm_i915_private *dev_priv) I915_WRITE(RENCLK_GATE_D2, 0); I915_WRITE(MI_ARB_STATE, _MASKED_BIT_ENABLE(MI_ARB_DISPLAY_TRICKLE_FEED_DISABLE)); - - /* WaDisable_RenderCache_OperationalFlush:gen4 */ - I915_WRITE(CACHE_MODE_0, _MASKED_BIT_DISABLE(RC_OP_FLUSH_ENABLE)); } static void gen3_init_clock_gating(struct drm_i915_private *dev_priv) diff --git a/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h b/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h index 6a2be7d0dd95..6090ce35226b 100644 --- a/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h +++ b/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h @@ -21,6 +21,7 @@ selftest(fence, i915_sw_fence_mock_selftests) selftest(scatterlist, scatterlist_mock_selftests) selftest(syncmap, i915_syncmap_mock_selftests) selftest(uncore, intel_uncore_mock_selftests) +selftest(ring, intel_ring_mock_selftests) selftest(engine, intel_engine_cs_mock_selftests) selftest(timelines, intel_timeline_mock_selftests) selftest(requests, i915_request_mock_selftests) |